#!/usr/bin/env python3
"""
Match PENDING files to LinkedIn About pages and extract locations.

This script:
1. Builds a fuzzy lookup of LinkedIn About page organization names
2. Matches PENDING file emic_names to About pages
3. Extracts location data from matched About pages
4. Resolves PENDING files with proper GHCIDs

Usage:
    python scripts/match_pending_to_linkedin.py --dry-run
    python scripts/match_pending_to_linkedin.py
"""
import re
import json
import yaml
import shutil
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict

# Paths
# Directory holding custodian YAML records (the *PENDING*.yaml files live here
# and resolved files are renamed in place within it).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Directory of manually saved LinkedIn "About"/"People" HTML pages.
LINKEDIN_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# NOTE(review): LOCATIONS_FILE is not referenced anywhere in this script —
# confirm whether it is still needed or is leftover from an earlier version.
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
# City mappings
# Lowercase Dutch city name -> two-letter province code (NH = Noord-Holland,
# ZH = Zuid-Holland, UT = Utrecht, GE = Gelderland, NB = Noord-Brabant,
# LI = Limburg, OV = Overijssel, FR = Friesland, DR = Drenthe, GR = Groningen,
# ZE = Zeeland, FL = Flevoland). Consumed by generate_ghcid(); cities not
# listed here fall back to province 'XX'.
CITY_TO_PROVINCE = {
    # Noord-Holland
    'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
    'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
    'amstelveen': 'NH', 'heemstede': 'NH', 'bussum': 'NH', 'naarden': 'NH',
    'muiden': 'NH', 'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH',
    'den helder': 'NH', 'laren': 'NH', 'blaricum': 'NH', 'castricum': 'NH',
    # Zuid-Holland
    'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
    'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
    'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH',
    'hoek van holland': 'ZH', 'maassluis': 'ZH', 'rijswijk': 'ZH',
    'wassenaar': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'katwijk': 'ZH',
    # Utrecht
    'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
    'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
    'doorn': 'UT', 'driebergen': 'UT', 'bunnik': 'UT',
    # Gelderland
    'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
    'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
    'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE', 'elburg': 'GE',
    'winterswijk': 'GE', 'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE',
    'borculo': 'GE', 'lochem': 'GE', 'epe': 'GE',
    # Noord-Brabant
    'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
    'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
    'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'boxtel': 'NB',
    'oisterwijk': 'NB', 'vught': 'NB', 'nuenen': 'NB', 'best': 'NB',
    'etten-leur': 'NB', 'oosterhout': 'NB',
    # Limburg
    'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
    'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'valkenburg': 'LI',
    'thorn': 'LI', 'venray': 'LI',
    # Overijssel
    'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
    'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'rijssen': 'OV',
    'staphorst': 'OV', 'giethoorn': 'OV', 'steenwijk': 'OV',
    # Friesland
    'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
    'harlingen': 'FR', 'dokkum': 'FR', 'franeker': 'FR', 'joure': 'FR',
    'workum': 'FR', 'makkum': 'FR', 'hindeloopen': 'FR',
    # Drenthe
    'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
    'coevorden': 'DR', 'borger': 'DR', 'veenhuizen': 'DR',
    # Groningen
    'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR',
    'appingedam': 'GR', 'delfzijl': 'GR',
    # Zeeland
    'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
    'zierikzee': 'ZE', 'veere': 'ZE',
    # Flevoland
    'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'urk': 'FL',
}
# Lowercase Dutch city name -> three-letter city code used in GHCIDs.
# Consumed by generate_ghcid(); cities not listed here get a code derived
# from the city name itself (first letters) as a fallback.
CITY_TO_CODE = {
    'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
    'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL', 'groningen': 'GRO',
    'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ', 'apeldoorn': 'APE',
    'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS', 'amersfoort': 'AME',
    'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
    'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
    'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
    'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
    'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
    'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
    'harderwijk': 'HAR', 'zutphen': 'ZUT', 'helmond': 'HEL', 'oss': 'OSS',
    'schiedam': 'SCH', 'vlaardingen': 'VLA', 'hoek van holland': 'HVH',
    'rijswijk': 'RIJ', 'wassenaar': 'WAS', 'sneek': 'SNE', 'dokkum': 'DOK',
    'joure': 'JOU', 'meppel': 'MEP', 'coevorden': 'COE', 'lelystad': 'LEL',
}
def normalize_name(name: str) -> str:
    """Return a canonical form of an organization name for fuzzy matching.

    Applies NFKD Unicode normalization, lowercases, drops punctuation
    (everything that is not a word character or whitespace), and collapses
    runs of whitespace into single spaces.
    """
    text = unicodedata.normalize('NFKD', name).lower().strip()
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation, keep word chars/spaces
    return ' '.join(text.split())
def extract_org_name_from_filename(filename: str) -> str:
    """Pull the organization name out of a saved LinkedIn HTML filename.

    Handles both the browser-duplicate form "(3) Org_ About _ LinkedIn..."
    and the plain form "Org_ About _ LinkedIn...". Falls back to returning
    the whole filename when neither pattern matches.
    """
    candidate_patterns = (
        r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn',  # "(N) Name_ About _ LinkedIn"
        r'(.+?)_\s*(About|People)\s*_\s*LinkedIn',            # "Name_ About _ LinkedIn"
    )
    for pattern in candidate_patterns:
        m = re.match(pattern, filename)
        if m:
            return m.group(1).strip()
    return filename
def build_linkedin_lookup() -> Dict[str, Path]:
    """Map normalized organization names to their About-page HTML paths.

    Scans LINKEDIN_DIR for files matching '*About*LinkedIn.html'. When two
    files normalize to the same name, the later one wins.
    """
    pages: Dict[str, Path] = {}
    for html_path in LINKEDIN_DIR.glob('*About*LinkedIn.html'):
        raw_name = extract_org_name_from_filename(html_path.name)
        pages[normalize_name(raw_name)] = html_path
    return pages
def extract_location_from_about_page(filepath: Path) -> Optional[Dict]:
    """Extract the primary location from a saved LinkedIn About page.

    Tries the "org-locations-module ... Primary" layout first, then the
    plain "Locations (N)" layout. Returns the dict produced by
    parse_address(), or None when no address can be found or the file
    cannot be read (lookup is best-effort by design).
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Unreadable file: report "no location" instead of aborting the run.
        # (Previously a bare `except Exception` swallowed every error here,
        # including programming bugs; only I/O failures are expected.)
        return None

    # Both page layouts carry the address inside a <p> element; try the more
    # specific "Primary" pattern before the generic "Locations (N)" one.
    address_patterns = (
        # Pattern 1: org-locations-module with Primary
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        # Pattern 2: Locations section without Primary
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )
    for pattern in address_patterns:
        match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if match:
            address = match.group(1).strip()
            address = re.sub(r'<[^>]+>', '', address)        # drop inner HTML tags
            address = re.sub(r'\s+', ' ', address).strip()   # collapse whitespace
            return parse_address(address)

    return None
def parse_address(address_text: str) -> Optional[Dict]:
    """Parse a comma-separated LinkedIn address into raw/country/city fields.

    Returns None when the string has fewer than two comma-separated parts,
    or when no digit-free part (taken as the city) can be found.
    """
    segments = [seg.strip() for seg in address_text.split(',')]
    if len(segments) < 2:
        return None

    parsed = {'raw': address_text}

    # The last segment carries the country. It may read "Province CC";
    # in that case the last two-letter alphabetic token is the country.
    tail = segments[-1].upper().strip()
    if len(tail) > 2:
        for token in reversed(tail.split()):
            if len(token) == 2 and token.isalpha():
                tail = token
                break
    parsed['country'] = tail if len(tail) == 2 else 'XX'

    # City: scan right-to-left for the first segment containing no digits
    # (digit-bearing segments are street numbers or postal codes).
    for segment in reversed(segments[:-1]):
        segment = segment.strip()
        if not re.search(r'\d', segment):
            parsed['city'] = segment
            break

    return parsed if 'city' in parsed else None
def normalize_country(country_raw: str) -> str:
    """Reduce a raw country string to a two-letter code, or 'XX' if unknown."""
    if not country_raw:
        return 'XX'
    value = country_raw.upper().strip()
    if len(value) == 2 and value.isalpha():
        return value
    # "Province CC" form: the last two-letter alphabetic token wins.
    two_letter_tokens = [t for t in value.split() if len(t) == 2 and t.isalpha()]
    return two_letter_tokens[-1] if two_letter_tokens else 'XX'
def get_institution_type(data: Dict) -> str:
    """Return the single-letter GHCID code for a record's institution_type.

    Missing or unrecognized types default to 'M' (museum).
    """
    codes = {
        'MUSEUM': 'M',
        'ARCHIVE': 'A',
        'LIBRARY': 'L',
        'GALLERY': 'G',
        'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E',
        'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S',
        'HOLY_SITES': 'H',
        'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B',
        'CORPORATION': 'C',
        'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X',
        'PERSONAL_COLLECTION': 'P',
        'NGO': 'N',
        'TASTE_SMELL': 'T',
        'UNKNOWN': 'U',
    }
    declared_type = data.get('institution_type', 'MUSEUM')
    return codes.get(declared_type, 'M')
def extract_abbreviation(name: str) -> str:
    """Build an uppercase abbreviation from a name's significant words.

    Dutch/English stopwords and generic words ('museum', 'stichting') are
    ignored, as are single-letter words. One significant word yields its
    first four letters; several yield the initials of up to five words.
    """
    stopwords = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
                 'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting'}
    cleaned = re.sub(r'[^\w\s]', ' ', name)
    significant = [w for w in cleaned.split()
                   if w.lower() not in stopwords and len(w) > 1]
    if not significant:
        # Nothing survived filtering: fall back to the first few raw words.
        significant = cleaned.split()[:3]
    if len(significant) == 1:
        return significant[0][:4].upper()
    return ''.join(w[0] for w in significant[:5]).upper()
def generate_ghcid(name: str, location: Dict, inst_type: str) -> str:
    """Compose a GHCID of the form COUNTRY-PROVINCE-CITYCODE-TYPE-ABBREV.

    Province and city codes come from the module lookup tables. Cities
    missing from CITY_TO_CODE get a code derived from the city name:
    first three letters of a one-word city, else the initials of up to
    three words.
    """
    country = normalize_country(location.get('country', 'XX'))
    city = location.get('city', '').lower().strip()

    province = CITY_TO_PROVINCE.get(city, 'XX')
    city_code = CITY_TO_CODE.get(city)
    if not city_code:
        tokens = city.replace('-', ' ').split()
        if len(tokens) == 1:
            city_code = tokens[0][:3].upper()
        else:
            city_code = ''.join(t[0] for t in tokens[:3]).upper()

    abbreviation = extract_abbreviation(name)
    return f"{country}-{province}-{city_code}-{inst_type}-{abbreviation}"
def match_pending_to_linkedin(pending_files: List[Path], linkedin_lookup: Dict[str, Path]) -> Dict[Path, Tuple[Path, Dict]]:
    """Match PENDING custodian files to LinkedIn About pages by name.

    For each PENDING YAML file, the normalized emic_name is first looked up
    exactly in linkedin_lookup; failing that, a substring match in either
    direction is attempted (only for names longer than 5 characters, to
    avoid spurious hits). A match is recorded only when a location can
    actually be extracted from the About page.

    Returns a dict mapping pending_path -> (about_page_path, location_dict).
    """
    matches: Dict[Path, Tuple[Path, Dict]] = {}

    for pending_path in pending_files:
        try:
            with open(pending_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, yaml.YAMLError) as e:
            # Unreadable or malformed YAML: skip, but report it. The old
            # bare `except: continue` silently swallowed every error
            # (including KeyboardInterrupt) and hid real bugs.
            print(f"[SKIP] {pending_path.name}: {e}")
            continue

        if not isinstance(data, dict):
            continue  # empty file or non-mapping YAML document

        # custodian_name may be present but null; `or {}` guards that.
        emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
        if not emic_name:
            continue

        norm_emic = normalize_name(emic_name)

        # Exact match on the normalized name.
        if norm_emic in linkedin_lookup:
            about_path = linkedin_lookup[norm_emic]
            location = extract_location_from_about_page(about_path)
            if location:
                matches[pending_path] = (about_path, location)
                continue

        # Partial match - check if one normalized name contains the other.
        for norm_linkedin, about_path in linkedin_lookup.items():
            if len(norm_emic) > 5 and len(norm_linkedin) > 5:
                if norm_emic in norm_linkedin or norm_linkedin in norm_emic:
                    location = extract_location_from_about_page(about_path)
                    if location:
                        matches[pending_path] = (about_path, location)
                        break

    return matches
def process_match(pending_path: Path, about_path: Path, location: Dict, dry_run: bool = True) -> Optional[str]:
    """Resolve one matched PENDING file: assign a GHCID and rename the file.

    Writes the location and resolution metadata into the YAML payload,
    then moves the file to its new GHCID-based name under CUSTODIAN_DIR.
    Returns the new GHCID, 'dry_run' when only previewing, or None on error.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type(data)

        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        # On collision with a different record, disambiguate with a name slug.
        if new_path.exists() and new_path != pending_path:
            slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" LinkedIn: {about_path.name}")
            print(f" Location: {location.get('city')}, {location.get('country')}")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'

        # Record the resolved ID and extracted location in the payload.
        data['ghcid_current'] = new_ghcid
        data.setdefault('location', {})
        data['location']['city'] = location.get('city')
        data['location']['country'] = location.get('country')

        data.setdefault('ghcid_resolution', {})
        resolution = data['ghcid_resolution']
        resolution['method'] = 'linkedin_about_page_extraction'
        resolution['resolved_at'] = datetime.now(timezone.utc).isoformat()
        resolution['linkedin_source'] = about_path.name

        # Rewrite in place first, then rename to the GHCID filename.
        with open(pending_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        shutil.move(pending_path, new_path)

        print(f"[RESOLVED] {emic_name} -> {new_ghcid}.yaml")
        return new_ghcid

    except Exception as e:
        print(f"[ERROR] {pending_path.name}: {e}")
        return None
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    args = parser.parse_args()

    print("Building LinkedIn About page lookup...")
    linkedin_lookup = build_linkedin_lookup()
    print(f"Found {len(linkedin_lookup)} About pages")

    print("\nFinding PENDING files...")
    pending_files = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")

    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to LinkedIn About pages...")
    matches = match_pending_to_linkedin(pending_files, linkedin_lookup)
    print(f"Found {len(matches)} matches")

    # Tally matches per normalized country code for the summary.
    country_counts = defaultdict(int)
    for _pending, (_about, location) in matches.items():
        code = normalize_country(location.get('country', 'XX'))
        country_counts[code] += 1

    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda x: -x[1]):
        print(f" {country}: {count}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Processing matches...")

    success = 0
    failed = 0
    for pending_path, (about_path, location) in matches.items():
        if process_match(pending_path, about_path, location, dry_run=args.dry_run):
            success += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")
    print(f"Remaining unmatched: {len(pending_files) - len(matches)}")


if __name__ == '__main__':
    main()