#!/usr/bin/env python3
"""
Match PENDING files to LinkedIn About pages and extract locations.

This script:
1. Builds a fuzzy lookup of LinkedIn About page organization names
2. Matches PENDING file emic_names to About pages
3. Extracts location data from matched About pages
4. Resolves PENDING files with proper GHCIDs

Usage:
    python scripts/match_pending_to_linkedin.py --dry-run
    python scripts/match_pending_to_linkedin.py
"""

import re
import json
import yaml
import shutil
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict

# Paths
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
LINKEDIN_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")

# Dutch city (lowercase) -> two-letter province code used in GHCIDs.
CITY_TO_PROVINCE = {
    # Noord-Holland
    'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
    'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
    'amstelveen': 'NH', 'heemstede': 'NH', 'bussum': 'NH', 'naarden': 'NH',
    'muiden': 'NH', 'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH',
    'den helder': 'NH', 'laren': 'NH', 'blaricum': 'NH', 'castricum': 'NH',
    # Zuid-Holland
    'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
    'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
    'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH',
    'hoek van holland': 'ZH', 'maassluis': 'ZH', 'rijswijk': 'ZH',
    'wassenaar': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'katwijk': 'ZH',
    # Utrecht
    'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
    'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
    'doorn': 'UT', 'driebergen': 'UT', 'bunnik': 'UT',
    # Gelderland
    'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
    'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
    'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE', 'elburg': 'GE',
    'winterswijk': 'GE', 'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE',
    'borculo': 'GE', 'lochem': 'GE', 'epe': 'GE',
    # Noord-Brabant
    'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
    'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
    'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'boxtel': 'NB',
    'oisterwijk': 'NB', 'vught': 'NB', 'nuenen': 'NB', 'best': 'NB',
    'etten-leur': 'NB', 'oosterhout': 'NB',
    # Limburg
    'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
    'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'valkenburg': 'LI',
    'thorn': 'LI', 'venray': 'LI',
    # Overijssel
    'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
    'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'rijssen': 'OV',
    'staphorst': 'OV', 'giethoorn': 'OV', 'steenwijk': 'OV',
    # Friesland
    'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
    'harlingen': 'FR', 'dokkum': 'FR', 'franeker': 'FR', 'joure': 'FR',
    'workum': 'FR', 'makkum': 'FR', 'hindeloopen': 'FR',
    # Drenthe
    'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
    'coevorden': 'DR', 'borger': 'DR', 'veenhuizen': 'DR',
    # Groningen
    'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR', 'appingedam': 'GR',
    'delfzijl': 'GR',
    # Zeeland
    'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
    'zierikzee': 'ZE', 'veere': 'ZE',
    # Flevoland
    'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'urk': 'FL',
}

# Dutch city (lowercase) -> three-letter city code used in GHCIDs.
CITY_TO_CODE = {
    'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA',
    'the hague': 'DHA', 'utrecht': 'UTR', 'eindhoven': 'EIN',
    'tilburg': 'TIL', 'groningen': 'GRO', 'almere': 'ALM', 'breda': 'BRE',
    'nijmegen': 'NIJ', 'apeldoorn': 'APE', 'haarlem': 'HAA', 'arnhem': 'ARN',
    'enschede': 'ENS', 'amersfoort': 'AME', 'zaandam': 'ZAA',
    "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO', 'zwolle': 'ZWO',
    'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
    'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
    'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
    'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
    'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
    'harderwijk': 'HAR', 'zutphen': 'ZUT', 'helmond': 'HEL', 'oss': 'OSS',
    'schiedam': 'SCH', 'vlaardingen': 'VLA', 'hoek van holland': 'HVH',
    'rijswijk': 'RIJ', 'wassenaar': 'WAS', 'sneek': 'SNE', 'dokkum': 'DOK',
    'joure': 'JOU', 'meppel': 'MEP', 'coevorden': 'COE', 'lelystad': 'LEL',
}


def normalize_name(name: str) -> str:
    """Normalize an organization name for fuzzy matching.

    Applies NFKD unicode normalization, lowercases, removes punctuation
    (keeping word characters and spaces) and collapses whitespace runs.
    """
    normalized = unicodedata.normalize('NFKD', name)
    normalized = normalized.lower().strip()
    # Remove punctuation except spaces
    normalized = re.sub(r'[^\w\s]', '', normalized)
    normalized = ' '.join(normalized.split())
    return normalized


def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a LinkedIn HTML filename.

    Handles both "(N) Org_ About _ LinkedIn" and "Org_ About _ LinkedIn"
    shapes; falls back to the raw filename when neither matches.
    """
    match = re.match(r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn', filename)
    if match:
        return match.group(1).strip()
    match = re.match(r'(.+?)_\s*(About|People)\s*_\s*LinkedIn', filename)
    if match:
        return match.group(1).strip()
    return filename


def build_linkedin_lookup() -> Dict[str, Path]:
    """Build a normalized-org-name -> About-page-path lookup."""
    lookup = {}
    for f in LINKEDIN_DIR.glob('*About*LinkedIn.html'):
        org_name = extract_org_name_from_filename(f.name)
        norm_name = normalize_name(org_name)
        lookup[norm_name] = f
    return lookup


# NOTE(review): the HTML tag text inside these two patterns was garbled in the
# source (stripped to `]*>(.*?)` fragments); reconstructed here as <p ...>
# blocks — confirm against an actual saved About page before relying on them.
# Pattern 1: org-locations-module section marked "Primary".
_PRIMARY_LOCATION_RE = re.compile(
    r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
    re.DOTALL | re.IGNORECASE)
# Pattern 2: "Locations (N)" section without a Primary marker.
_LOCATIONS_SECTION_RE = re.compile(
    r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    re.DOTALL | re.IGNORECASE)
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def _clean_address_fragment(fragment: str) -> str:
    """Strip HTML tags from an address snippet and collapse whitespace."""
    text = _HTML_TAG_RE.sub('', fragment.strip())
    return re.sub(r'\s+', ' ', text).strip()


def extract_location_from_about_page(filepath: Path) -> Optional[Dict]:
    """Extract a location dict from a LinkedIn About page.

    Returns the parse_address() result for the first matching location
    section, or None when the file is unreadable or no section matches.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        return None

    # First matching pattern wins; a match that fails to parse is NOT
    # retried against the other pattern (preserves original behavior).
    for pattern in (_PRIMARY_LOCATION_RE, _LOCATIONS_SECTION_RE):
        match = pattern.search(content)
        if match:
            return parse_address(_clean_address_fragment(match.group(1)))
    return None


def parse_address(address_text: str) -> Optional[Dict]:
    """Parse a comma-separated LinkedIn address string.

    Returns {'raw', 'country', 'city'} or None when no city can be found
    or the string has fewer than two comma-separated parts. 'country' is
    'XX' when the last part is not a clean 2-letter code.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None

    result = {'raw': address_text}

    # Country is the last part (expected 2-letter code).
    country = parts[-1].upper().strip()
    # Handle "Province Country" format: take the last 2-letter token.
    if len(country) > 2:
        tokens = country.split()
        for t in reversed(tokens):
            if len(t) == 2 and t.isalpha():
                country = t
                break
    result['country'] = country if len(country) == 2 else 'XX'

    # City is usually the last non-postal part before the country;
    # parts containing digits are assumed to be postal codes.
    for part in reversed(parts[:-1]):
        part = part.strip()
        if not re.search(r'\d', part):
            result['city'] = part
            break

    return result if 'city' in result else None


def normalize_country(country_raw: str) -> str:
    """Normalize a raw country string to a 2-letter code, or 'XX'."""
    if not country_raw:
        return 'XX'
    country = country_raw.upper().strip()
    if len(country) == 2 and country.isalpha():
        return country
    # Handle "Province Country" format: last 2-letter token wins.
    parts = country.split()
    for p in reversed(parts):
        if len(p) == 2 and p.isalpha():
            return p
    return 'XX'


def get_institution_type(data: Dict) -> str:
    """Map a record's institution_type to its 1-letter GHCID code.

    Unknown or missing types default to 'M' (museum).
    """
    type_map = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E',
        'OFFICIAL_INSTITUTION': 'O', 'COLLECTING_SOCIETY': 'S',
        'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D', 'BOTANICAL_ZOO': 'B',
        'CORPORATION': 'C', 'FEATURES': 'F', 'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X', 'PERSONAL_COLLECTION': 'P', 'NGO': 'N',
        'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    return type_map.get(data.get('institution_type', 'MUSEUM'), 'M')


def extract_abbreviation(name: str) -> str:
    """Extract an abbreviation from an organization name.

    Uses initials of the significant words (Dutch/English stop words and
    generic terms are skipped); a single-word name yields its first four
    letters.
    """
    skip_words = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den',
                  'der', 'en', 'of', 'the', 'a', 'an', 'and', 'or',
                  'museum', 'stichting'}
    name_clean = re.sub(r'[^\w\s]', ' ', name)
    words = [w for w in name_clean.split()
             if w.lower() not in skip_words and len(w) > 1]
    if not words:
        words = name_clean.split()[:3]
    if len(words) == 1:
        return words[0][:4].upper()
    return ''.join(w[0] for w in words[:5]).upper()


def generate_ghcid(name: str, location: Dict, inst_type: str) -> str:
    """Generate a GHCID (COUNTRY-PROVINCE-CITY-TYPE-ABBREV) from location data."""
    country = normalize_country(location.get('country', 'XX'))
    city = location.get('city', '').lower().strip()
    province = CITY_TO_PROVINCE.get(city, 'XX')
    city_code = CITY_TO_CODE.get(city)
    if not city_code:
        # Unknown city: derive a code from its word initials / prefix.
        words = city.replace('-', ' ').split()
        if len(words) == 1:
            city_code = words[0][:3].upper()
        else:
            city_code = ''.join(w[0] for w in words[:3]).upper()
    abbrev = extract_abbreviation(name)
    return f"{country}-{province}-{city_code}-{inst_type}-{abbrev}"


def match_pending_to_linkedin(pending_files: List[Path],
                              linkedin_lookup: Dict[str, Path]) -> Dict[Path, Tuple[Path, Dict]]:
    """Match PENDING YAML files to LinkedIn About pages.

    Tries an exact normalized-name match first, then a substring match in
    either direction (both names must be longer than 5 chars). Only pairs
    whose About page yields a location are included in the result.
    """
    matches = {}
    for pending_path in pending_files:
        try:
            with open(pending_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            emic_name = data.get('custodian_name', {}).get('emic_name', '')
            if not emic_name:
                continue
            norm_emic = normalize_name(emic_name)

            # Exact match
            if norm_emic in linkedin_lookup:
                about_path = linkedin_lookup[norm_emic]
                location = extract_location_from_about_page(about_path)
                if location:
                    matches[pending_path] = (about_path, location)
                    continue

            # Partial match - check if one contains the other
            for norm_linkedin, about_path in linkedin_lookup.items():
                if len(norm_emic) > 5 and len(norm_linkedin) > 5:
                    if norm_emic in norm_linkedin or norm_linkedin in norm_emic:
                        location = extract_location_from_about_page(about_path)
                        if location:
                            matches[pending_path] = (about_path, location)
                            break
        except Exception:
            # Unreadable/invalid YAML: skip this file. (Was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit.)
            continue
    return matches


def process_match(pending_path: Path, about_path: Path, location: Dict,
                  dry_run: bool = True) -> Optional[str]:
    """Resolve one matched PENDING file.

    Generates a GHCID from the extracted location, updates the YAML with
    location and resolution metadata, and renames the file. Returns the
    new GHCID, 'dry_run' when only previewing, or None on failure.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type(data)
        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # Check collision: suffix with a name slug if the GHCID is taken.
        if new_path.exists() and new_path != pending_path:
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  LinkedIn: {about_path.name}")
            print(f"  Location: {location.get('city')}, {location.get('country')}")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        if 'location' not in data:
            data['location'] = {}
        data['location']['city'] = location.get('city')
        data['location']['country'] = location.get('country')
        if 'ghcid_resolution' not in data:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'linkedin_about_page_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['linkedin_source'] = about_path.name

        with open(pending_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False)
        shutil.move(pending_path, new_path)
        print(f"[RESOLVED] {emic_name} -> {new_ghcid}.yaml")
        return new_ghcid
    except Exception as e:
        print(f"[ERROR] {pending_path.name}: {e}")
        return None


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    args = parser.parse_args()

    print("Building LinkedIn About page lookup...")
    linkedin_lookup = build_linkedin_lookup()
    print(f"Found {len(linkedin_lookup)} About pages")

    print("\nFinding PENDING files...")
    pending_files = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")

    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to LinkedIn About pages...")
    matches = match_pending_to_linkedin(pending_files, linkedin_lookup)
    print(f"Found {len(matches)} matches")

    # Group by country
    country_counts = defaultdict(int)
    for pending_path, (about_path, location) in matches.items():
        country = normalize_country(location.get('country', 'XX'))
        country_counts[country] += 1

    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda x: -x[1]):
        print(f"  {country}: {count}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Processing matches...")
    success = 0
    failed = 0
    for pending_path, (about_path, location) in matches.items():
        result = process_match(pending_path, about_path, location,
                               dry_run=args.dry_run)
        if result:
            success += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")
    print(f"Remaining unmatched: {len(pending_files) - len(matches)}")


if __name__ == '__main__':
    main()