glam/scripts/match_pending_to_linkedin.py
2026-01-09 20:35:19 +01:00

381 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Match PENDING files to LinkedIn About pages and extract locations.
This script:
1. Builds a fuzzy lookup of LinkedIn About page organization names
2. Matches PENDING file emic_names to About pages
3. Extracts location data from matched About pages
4. Resolves PENDING files with proper GHCIDs
Usage:
python scripts/match_pending_to_linkedin.py --dry-run
python scripts/match_pending_to_linkedin.py
"""
import re
import json
import yaml
import shutil
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict
# Paths
# Local custodian YAML records; files with "PENDING" in the name still
# need a proper GHCID and are the targets of this script.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Manually saved LinkedIn "About"/"People" HTML pages (external volume).
LINKEDIN_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# NOTE(review): LOCATIONS_FILE is not read or written anywhere in this
# script — presumably shared with a sibling script; verify before removing.
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
# City mappings
# Lowercase Dutch city/town name -> two-letter province code used as the
# second segment of a GHCID (see generate_ghcid). Keys must be lowercase
# because lookups are done on location['city'].lower(). Cities absent
# from this table fall back to 'XX' in generate_ghcid.
# Province codes: NH Noord-Holland, ZH Zuid-Holland, UT Utrecht,
# GE Gelderland, NB Noord-Brabant, LI Limburg, OV Overijssel,
# FR Friesland, DR Drenthe, GR Groningen, ZE Zeeland, FL Flevoland.
CITY_TO_PROVINCE = {
'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
'amstelveen': 'NH', 'heemstede': 'NH', 'bussum': 'NH', 'naarden': 'NH',
'muiden': 'NH', 'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH',
'den helder': 'NH', 'laren': 'NH', 'blaricum': 'NH', 'castricum': 'NH',
'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH',
'hoek van holland': 'ZH', 'maassluis': 'ZH', 'rijswijk': 'ZH',
'wassenaar': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'katwijk': 'ZH',
'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
'doorn': 'UT', 'driebergen': 'UT', 'bunnik': 'UT',
'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE', 'elburg': 'GE',
'winterswijk': 'GE', 'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE',
'borculo': 'GE', 'lochem': 'GE', 'epe': 'GE',
'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'boxtel': 'NB',
'oisterwijk': 'NB', 'vught': 'NB', 'nuenen': 'NB', 'best': 'NB',
'etten-leur': 'NB', 'oosterhout': 'NB',
'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'valkenburg': 'LI',
'thorn': 'LI', 'venray': 'LI',
'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'rijssen': 'OV',
'staphorst': 'OV', 'giethoorn': 'OV', 'steenwijk': 'OV',
'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
'harlingen': 'FR', 'dokkum': 'FR', 'franeker': 'FR', 'joure': 'FR',
'workum': 'FR', 'makkum': 'FR', 'hindeloopen': 'FR',
'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
'coevorden': 'DR', 'borger': 'DR', 'veenhuizen': 'DR',
'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR',
'appingedam': 'GR', 'delfzijl': 'GR',
'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
'zierikzee': 'ZE', 'veere': 'ZE',
'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'urk': 'FL',
}
# Lowercase city name -> preferred three-letter city code for the third
# GHCID segment. Cities not listed here get a code derived in
# generate_ghcid (first 3 letters, or initials for multi-word names).
CITY_TO_CODE = {
'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL', 'groningen': 'GRO',
'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ', 'apeldoorn': 'APE',
'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS', 'amersfoort': 'AME',
'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
'harderwijk': 'HAR', 'zutphen': 'ZUT', 'helmond': 'HEL', 'oss': 'OSS',
'schiedam': 'SCH', 'vlaardingen': 'VLA', 'hoek van holland': 'HVH',
'rijswijk': 'RIJ', 'wassenaar': 'WAS', 'sneek': 'SNE', 'dokkum': 'DOK',
'joure': 'JOU', 'meppel': 'MEP', 'coevorden': 'COE', 'lelystad': 'LEL',
}
def normalize_name(name: str) -> str:
    """Normalize an organization name for fuzzy matching.

    Applies NFKD decomposition, lowercases and strips, removes all
    punctuation (which also drops any combining marks NFKD separated
    out, if they are non-word characters), and collapses internal
    whitespace runs to single spaces.
    """
    decomposed = unicodedata.normalize('NFKD', name).lower().strip()
    without_punct = re.sub(r'[^\w\s]', '', decomposed)
    return ' '.join(without_punct.split())
def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a saved LinkedIn HTML filename.

    Recognized shapes: "(3) Org Name_ About _ LinkedIn..." (browser
    duplicate-download counter prefix) and the plain
    "Org Name_ About _ LinkedIn..." / "..._ People _ LinkedIn..." forms.
    The optional-prefix group is tried first, so a numeric prefix is
    stripped exactly as the original two-pattern cascade did. Returns
    the filename unchanged when nothing matches.
    """
    pattern = r'(?:\(\d+\)\s*)?(.+?)_\s*(About|People)\s*_\s*LinkedIn'
    matched = re.match(pattern, filename)
    return matched.group(1).strip() if matched else filename
def build_linkedin_lookup() -> Dict[str, Path]:
    """Build a lookup of normalized org name -> LinkedIn About page path.

    Scans LINKEDIN_DIR for '*About*LinkedIn.html' files. When two files
    normalize to the same name, the one encountered last wins (same as
    the original loop's overwrite behavior).
    """
    return {
        normalize_name(extract_org_name_from_filename(page.name)): page
        for page in LINKEDIN_DIR.glob('*About*LinkedIn.html')
    }
def extract_location_from_about_page(filepath: Path) -> Optional[Dict]:
    """Extract a location dict from a saved LinkedIn About page.

    Tries two HTML patterns in priority order and parses the first hit
    with parse_address():
      1. an org-locations-module section marked "Primary";
      2. a "Locations (N)" section (no Primary marker).

    Returns parse_address()'s result (may be None if the address text is
    unparseable), or None when the file is unreadable or neither pattern
    matches.
    """
    # Ordered: the "Primary" location is preferred over the generic list.
    patterns = (
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Best-effort: unreadable page means no location (the original
        # swallowed this via a broad `except Exception`).
        return None
    for pattern in patterns:
        match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if match:
            return parse_address(_clean_html_fragment(match.group(1)))
    return None


def _clean_html_fragment(fragment: str) -> str:
    """Strip HTML tags and collapse whitespace in an address fragment."""
    text = re.sub(r'<[^>]+>', '', fragment.strip())
    return re.sub(r'\s+', ' ', text).strip()
def parse_address(address_text: str) -> Optional[Dict]:
    """Parse a comma-separated LinkedIn address string.

    Typical input: "Museumstraat 1, Amsterdam, 1071 XX, NL".

    Returns {'raw': original text, 'country': 2-letter code or 'XX',
    'city': str}, or None when there are fewer than two parts or no
    digit-free part that could be a city.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None
    result = {'raw': address_text}
    # Country is the last part, normally a 2-letter code.
    country = parts[-1].upper().strip()
    if len(country) > 2:
        # Handle "Province CC" format: take the last 2-letter token.
        tokens = country.split()
        for t in reversed(tokens):
            if len(t) == 2 and t.isalpha():
                country = t
                break
    # Require alphabetic code, consistent with normalize_country()
    # (the original accepted any 2-char string, e.g. "34").
    result['country'] = country if len(country) == 2 and country.isalpha() else 'XX'
    # City: the last part before the country with no digits — skips
    # postal-code segments such as "1071 XX".
    for part in reversed(parts[:-1]):
        part = part.strip()
        if not re.search(r'\d', part):
            result['city'] = part
            break
    return result if 'city' in result else None
def normalize_country(country_raw: str) -> str:
    """Normalize a raw country string to a 2-letter uppercase code.

    Accepts a bare 2-letter code in any case, or a "Province CC" string
    from which the last 2-letter token is taken. Anything else (empty,
    full country names, numeric codes) yields 'XX'.
    """
    if not country_raw:
        return 'XX'
    code = country_raw.upper().strip()
    if len(code) == 2 and code.isalpha():
        return code
    # "Province CC" form: pick the last 2-letter alphabetic token.
    candidates = [t for t in code.split() if len(t) == 2 and t.isalpha()]
    return candidates[-1] if candidates else 'XX'
def get_institution_type(data: Dict) -> str:
    """Return the one-letter GHCID code for a record's institution_type.

    A missing field defaults to 'MUSEUM'; an unrecognized value falls
    back to the code 'M'.
    """
    codes = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    declared = data.get('institution_type', 'MUSEUM')
    return codes.get(declared, 'M')
def extract_abbreviation(name: str) -> str:
    """Build an uppercase abbreviation from a name's significant words.

    Dutch/English articles, prepositions, and generic words ('museum',
    'stichting') plus single characters are ignored. One significant
    word yields its first 4 letters; otherwise the initials of up to 5
    words are joined. When nothing significant remains, the first 3 raw
    words are used instead.
    """
    stop_words = frozenset({
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    })
    cleaned = re.sub(r'[^\w\s]', ' ', name)
    significant = [
        w for w in cleaned.split()
        if len(w) > 1 and w.lower() not in stop_words
    ]
    if not significant:
        significant = cleaned.split()[:3]
    if len(significant) == 1:
        return significant[0][:4].upper()
    return ''.join(word[0] for word in significant[:5]).upper()
def generate_ghcid(name: str, location: Dict, inst_type: str) -> str:
    """Compose a GHCID: COUNTRY-PROVINCE-CITYCODE-TYPE-ABBREV.

    Province comes from CITY_TO_PROVINCE ('XX' when unknown). The city
    code comes from CITY_TO_CODE; unlisted cities get a derived code
    (first 3 letters for one-word names, otherwise the initials of up
    to 3 hyphen/space-separated words).
    """
    country_code = normalize_country(location.get('country', 'XX'))
    city = location.get('city', '').lower().strip()
    province_code = CITY_TO_PROVINCE.get(city, 'XX')
    city_code = CITY_TO_CODE.get(city)
    if not city_code:
        tokens = city.replace('-', ' ').split()
        if len(tokens) == 1:
            city_code = tokens[0][:3].upper()
        else:
            city_code = ''.join(t[0] for t in tokens[:3]).upper()
    segments = [country_code, province_code, city_code, inst_type,
                extract_abbreviation(name)]
    return '-'.join(segments)
def match_pending_to_linkedin(pending_files: List[Path], linkedin_lookup: Dict[str, Path]) -> Dict[Path, Tuple[Path, Dict]]:
    """Match PENDING custodian YAML files to LinkedIn About pages.

    For each file, tries an exact match on the normalized emic_name
    first, then a substring match in either direction (only for names
    longer than 5 characters, to limit false positives). A match is
    recorded only when a location can actually be extracted from the
    About page.

    Returns: pending_path -> (about_page_path, location_dict).
    """
    matches: Dict[Path, Tuple[Path, Dict]] = {}
    for pending_path in pending_files:
        # Best-effort per file: unreadable or malformed YAML is skipped.
        # Narrowed from the original bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        try:
            with open(pending_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            emic_name = data.get('custodian_name', {}).get('emic_name', '')
            if not emic_name:
                continue
            norm_emic = normalize_name(emic_name)
            # Exact match on normalized name.
            if norm_emic in linkedin_lookup:
                about_path = linkedin_lookup[norm_emic]
                location = extract_location_from_about_page(about_path)
                if location:
                    matches[pending_path] = (about_path, location)
                continue
            # Partial match: one name contains the other. Keep scanning
            # candidates until one yields a usable location.
            if len(norm_emic) > 5:  # invariant hoisted out of the loop
                for norm_linkedin, about_path in linkedin_lookup.items():
                    if len(norm_linkedin) > 5 and (
                            norm_emic in norm_linkedin or norm_linkedin in norm_emic):
                        location = extract_location_from_about_page(about_path)
                        if location:
                            matches[pending_path] = (about_path, location)
                            break
        except Exception:
            continue
    return matches
def process_match(pending_path: Path, about_path: Path, location: Dict, dry_run: bool = True) -> Optional[str]:
    """Resolve one matched PENDING file.

    Generates a GHCID from the extracted location, records resolution
    provenance in the YAML, rewrites the file, and renames it to
    <GHCID>.yaml in CUSTODIAN_DIR.

    Returns the new GHCID on success, the sentinel string 'dry_run' in
    preview mode, or None on error.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type(data)
        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
        # Collision handling: first disambiguate with a name slug; if
        # that STILL collides, append a numeric suffix (the original
        # stopped after the slug, so shutil.move could silently
        # overwrite an existing record).
        if new_path.exists() and new_path != pending_path:
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
            base_ghcid = new_ghcid
            counter = 2
            while new_path.exists() and new_path != pending_path:
                new_ghcid = f"{base_ghcid}-{counter}"
                new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
                counter += 1
        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" LinkedIn: {about_path.name}")
            print(f" Location: {location.get('city')}, {location.get('country')}")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'
        # Record the resolution in the YAML data.
        data['ghcid_current'] = new_ghcid
        loc = data.setdefault('location', {})
        loc['city'] = location.get('city')
        loc['country'] = location.get('country')
        resolution = data.setdefault('ghcid_resolution', {})
        resolution['method'] = 'linkedin_about_page_extraction'
        resolution['resolved_at'] = datetime.now(timezone.utc).isoformat()
        resolution['linkedin_source'] = about_path.name
        # Rewrite in place first, then rename to the resolved GHCID.
        with open(pending_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        shutil.move(pending_path, new_path)
        print(f"[RESOLVED] {emic_name} -> {new_ghcid}.yaml")
        return new_ghcid
    except Exception as e:
        # Best-effort batch processing: report and keep going.
        print(f"[ERROR] {pending_path.name}: {e}")
        return None
def main():
    """CLI entry point: match PENDING files to LinkedIn and resolve them.

    --dry-run previews without writing; --limit N processes only the
    first N PENDING files.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    args = parser.parse_args()

    print("Building LinkedIn About page lookup...")
    linkedin_lookup = build_linkedin_lookup()
    print(f"Found {len(linkedin_lookup)} About pages")

    print("\nFinding PENDING files...")
    pending_files = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")
    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to LinkedIn About pages...")
    matches = match_pending_to_linkedin(pending_files, linkedin_lookup)
    print(f"Found {len(matches)} matches")

    # Summarize matches per normalized country code.
    country_counts = defaultdict(int)
    for _pending, (_about, location) in matches.items():
        country_counts[normalize_country(location.get('country', 'XX'))] += 1
    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda item: -item[1]):
        print(f" {country}: {count}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Processing matches...")
    outcomes = [
        process_match(pending_path, about_path, location, dry_run=args.dry_run)
        for pending_path, (about_path, location) in matches.items()
    ]
    success = sum(1 for result in outcomes if result)
    failed = len(outcomes) - success
    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")
    print(f"Remaining unmatched: {len(pending_files) - len(matches)}")


if __name__ == '__main__':
    main()