# glam/scripts/apply_linkedin_locations.py
# Snapshot metadata: 2026-01-09 20:35:19 +01:00 — 450 lines, 18 KiB, Python.
#!/usr/bin/env python3
"""
Apply extracted LinkedIn locations to PENDING files.
This script:
1. Loads locations extracted from LinkedIn About pages
2. Matches PENDING files to extracted locations by organization name
3. Generates proper GHCIDs with location data
4. Renames files with correct GHCIDs
Usage:
python scripts/apply_linkedin_locations.py --dry-run
python scripts/apply_linkedin_locations.py --apply
"""
import re
import json
import yaml
import unicodedata
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict
# Paths
# Root directory holding the per-custodian YAML files (PENDING and resolved).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# JSON file with organization locations extracted from LinkedIn About pages.
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
# Archive destination for resolved PENDING files.
# NOTE(review): not referenced in the code visible here — confirm it is used elsewhere.
ARCHIVE_DIR = CUSTODIAN_DIR / "archive" / "pending_resolved_20250109"
# Extended city to province mapping
# Maps lowercase Dutch city names to the 2-letter province code used in GHCIDs.
# Lookups are performed on a lowercased, stripped city name (see generate_ghcid);
# unknown cities fall back to 'XX'.
CITY_TO_PROVINCE = {
    # Noord-Holland
    'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
    'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
    'amstelveen': 'NH', 'heemstede': 'NH', 'beverwijk': 'NH', 'velsen': 'NH',
    'castricum': 'NH', 'huizen': 'NH', 'bussum': 'NH', 'naarden': 'NH',
    'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH', 'texel': 'NH',
    'den helder': 'NH', 'schagen': 'NH', 'heerhugowaard': 'NH',
    # Zuid-Holland
    'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
    'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
    'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH', 'spijkenisse': 'ZH',
    'alphen aan den rijn': 'ZH', 'katwijk': 'ZH', 'leidschendam': 'ZH',
    'voorburg': 'ZH', 'rijswijk': 'ZH', 'wassenaar': 'ZH', 'oegstgeest': 'ZH',
    'voorschoten': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'sassenheim': 'ZH',
    'gorinchem': 'ZH', 'schoonhoven': 'ZH', 'woerden': 'ZH',
    # Utrecht
    'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
    'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
    'bunnik': 'UT', 'maarssen': 'UT', 'de bilt': 'UT', 'bilthoven': 'UT',
    'doorn': 'UT', 'driebergen': 'UT', 'wijk bij duurstede': 'UT',
    # Gelderland
    'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
    'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
    'zevenaar': 'GE', 'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE',
    'elburg': 'GE', 'nunspeet': 'GE', 'putten': 'GE', 'ermelo': 'GE',
    'epe': 'GE', 'hattem': 'GE', 'zaltbommel': 'GE', 'winterswijk': 'GE',
    'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE', 'buren': 'GE',
    # Noord-Brabant
    'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
    'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
    'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'veghel': 'NB',
    'boxtel': 'NB', 'oisterwijk': 'NB', 'vught': 'NB', 'cuijk': 'NB',
    'deurne': 'NB', 'geldrop': 'NB', 'mierlo': 'NB', 'nuenen': 'NB',
    'valkenswaard': 'NB', 'heeze': 'NB', 'best': 'NB', 'son': 'NB',
    'etten-leur': 'NB', 'oosterhout': 'NB', 'dongen': 'NB', 'gilze': 'NB',
    # Limburg
    'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
    'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'kerkrade': 'LI',
    'valkenburg': 'LI', 'vaals': 'LI', 'meerssen': 'LI', 'brunssum': 'LI',
    'landgraaf': 'LI', 'stein': 'LI', 'beek': 'LI', 'eijsden': 'LI',
    'gulpen': 'LI', 'margraten': 'LI', 'simpelveld': 'LI',
    # Overijssel
    'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
    'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'hardenberg': 'OV',
    'rijssen': 'OV', 'holten': 'OV', 'raalte': 'OV', 'ommen': 'OV',
    'dalfsen': 'OV', 'staphorst': 'OV', 'giethoorn': 'OV', 'hasselt': 'OV',
    'steenwijk': 'OV', 'vollenhove': 'OV', 'blokzijl': 'OV',
    # Friesland
    'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
    'harlingen': 'FR', 'dokkum': 'FR', 'bolsward': 'FR', 'franeker': 'FR',
    'joure': 'FR', 'lemmer': 'FR', 'workum': 'FR', 'makkum': 'FR',
    'hindeloopen': 'FR', 'stavoren': 'FR', 'sloten': 'FR',
    # Drenthe
    'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
    'coevorden': 'DR', 'borger': 'DR', 'dwingeloo': 'DR', 'westerbork': 'DR',
    # Groningen
    'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR', 'hoogezand': 'GR',
    'stadskanaal': 'GR', 'delfzijl': 'GR', 'appingedam': 'GR', 'ter apel': 'GR',
    'leek': 'GR', 'marum': 'GR', 'zuidhorn': 'GR', 'uithuizen': 'GR',
    # Zeeland
    'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
    'hulst': 'ZE', 'zierikzee': 'ZE', 'veere': 'ZE', 'domburg': 'ZE',
    'sluis': 'ZE', 'yerseke': 'ZE',
    # Flevoland
    'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'emmeloord': 'FL',
    'urk': 'FL', 'zeewolde': 'FL', 'biddinghuizen': 'FL',
}
# City to 3-letter code (extended)
# Maps lowercase city names to the fixed 3-letter city code used in GHCIDs.
# Cities absent from this table get a code derived from the city name itself
# (see generate_ghcid).
CITY_TO_CODE = {
    'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
    "'s-gravenhage": 'DHA', 'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL',
    'groningen': 'GRO', 'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ',
    'apeldoorn': 'APE', 'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS',
    'amersfoort': 'AME', 'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
    'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
    'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
    'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
    'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
    'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
    'veenendaal': 'VEE', 'harderwijk': 'HAR', 'doetinchem': 'DOE', 'zutphen': 'ZUT',
    'helmond': 'HEL', 'oss': 'OSS', 'roosendaal': 'ROS', 'bergen op zoom': 'BOZ',
    'hengelo': 'HEN', 'almelo': 'ALO', 'kampen': 'KAM', 'sneek': 'SNE',
    'heerenveen': 'HEV', 'drachten': 'DRA', 'emmen': 'EMM', 'hoogeveen': 'HOV',
    'meppel': 'MEP', 'lelystad': 'LEL', 'vlissingen': 'VLI', 'goes': 'GOE',
    'terneuzen': 'TER', 'zoetermeer': 'ZOE', 'spijkenisse': 'SPI', 'purmerend': 'PUR',
}
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of *name* for fuzzy matching.

    Applies NFKD Unicode normalization, lowercases and trims, strips
    punctuation (everything outside word characters and whitespace),
    and collapses whitespace runs to single spaces.
    """
    folded = unicodedata.normalize('NFKD', name).lower().strip()
    stripped = re.sub(r'[^\w\s]', '', folded)
    return ' '.join(stripped.split())
def extract_emic_name_from_pending(filepath: Path) -> Optional[str]:
    """Extract the ``custodian_name.emic_name`` value from a PENDING YAML file.

    Args:
        filepath: Path of the PENDING custodian YAML file.

    Returns:
        The emic name string, or None when the file cannot be read or
        parsed, or when the expected keys are missing or mis-shaped.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # without masking process-control exceptions.
        return None
    # safe_load may return None (empty file) or a non-mapping scalar/list.
    if not isinstance(data, dict):
        return None
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        return custodian.get('emic_name')
    return None
def extract_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from an organization name.

    Filler words (Dutch/English articles, prepositions, and generic
    institution terms) are ignored; a single remaining word yields its
    first four letters, multiple words yield their initials (at most
    five). Falls back to 'XXX' when nothing usable remains.
    """
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'for', 'to', 'at', 'by',
        'museum', 'archief', 'bibliotheek', 'stichting', 'vereniging', 'centrum',
    }
    # Replace punctuation with spaces so hyphenated names split into words.
    cleaned = re.sub(r'[^\w\s]', ' ', name)
    significant = [
        word for word in cleaned.split()
        if word.lower() not in skip_words and len(word) > 1
    ]
    if not significant:
        # Everything was filtered out: fall back to the first raw words.
        significant = cleaned.split()[:3]
    if len(significant) == 1:
        abbrev = significant[0][:4].upper()
    else:
        abbrev = ''.join(word[0] for word in significant[:5]).upper()
    return abbrev or 'XXX'
def normalize_country_code(country_raw: str) -> str:
    """Extract a 2-letter ISO-style country code from a raw country field.

    Handles values that are already bare codes ('NL'), region strings that
    end with or contain a code ('ZUID-HOLLAND NL', 'JAWA TENGAH 57141 ID'),
    and full country names ('Netherlands').

    Args:
        country_raw: Raw country text as extracted from a LinkedIn page.

    Returns:
        Uppercase 2-letter code, or 'XX' when none can be determined.
    """
    if not country_raw:
        return 'XX'
    country_upper = country_raw.upper().strip()
    # Already a valid bare 2-letter code.
    if len(country_upper) == 2 and country_upper.isalpha():
        return country_upper
    # Patterns like "ZUID-HOLLAND NL" or "JAWA TENGAH 57141 ID": scan the
    # words from the end for the first 2-letter alphabetic token.
    for part in reversed(country_upper.split()):
        if len(part) == 2 and part.isalpha():
            return part
    # Fallback: full country names (and common variants) to codes.
    country_mappings = {
        'NETHERLANDS': 'NL', 'NEDERLAND': 'NL',
        'FRANCE': 'FR', 'GERMANY': 'DE', 'DEUTSCHLAND': 'DE',
        'BELGIUM': 'BE', 'BELGIË': 'BE', 'BELGIQUE': 'BE',
        'UNITED KINGDOM': 'GB', 'UK': 'GB', 'ENGLAND': 'GB', 'SCOTLAND': 'GB',
        'ITALY': 'IT', 'ITALIA': 'IT',
        'SPAIN': 'ES', 'ESPAÑA': 'ES',
        'PORTUGAL': 'PT',
        'AUSTRIA': 'AT', 'ÖSTERREICH': 'AT',
        'SWITZERLAND': 'CH', 'SCHWEIZ': 'CH', 'SUISSE': 'CH',
        'DENMARK': 'DK', 'DANMARK': 'DK',
        'SWEDEN': 'SE', 'SVERIGE': 'SE',
        'NORWAY': 'NO', 'NORGE': 'NO',
        'FINLAND': 'FI', 'SUOMI': 'FI',
        'INDONESIA': 'ID',
        'UNITED STATES': 'US', 'USA': 'US',
        'CANADA': 'CA',
        'AUSTRALIA': 'AU',
    }
    for name, code in country_mappings.items():
        # Whole-word match only: the previous plain substring test turned
        # e.g. "UKRAINE" into 'GB' via the 'UK' entry.
        if re.search(r'\b' + re.escape(name) + r'\b', country_upper):
            return code
    return 'XX'
def generate_ghcid(name: str, location: Dict, institution_type: str = 'M') -> str:
    """Build a GHCID string of the form ``CC-PP-CCC-T-ABBREV``.

    Combines the 2-letter country code, province code, 3-letter city code,
    single-letter institution type, and an abbreviation derived from the
    organization name. Province and city codes missing from *location*
    fall back to the module lookup tables, keyed on the lowercased city.
    """
    country = normalize_country_code(location.get('country', 'XX'))
    city_key = location.get('city', '').lower().strip()

    # Province: explicit value wins, otherwise table lookup, otherwise 'XX'.
    province = location.get('province_code') or CITY_TO_PROVINCE.get(city_key, 'XX')

    # City code: explicit value wins unless absent or the 'XXX' placeholder.
    city_code = location.get('city_code')
    if not city_code or city_code == 'XXX':
        city_code = CITY_TO_CODE.get(city_key)
    if not city_code:
        # No table entry either: derive a code from the city name itself.
        tokens = city_key.replace('-', ' ').split()
        if len(tokens) == 1:
            city_code = tokens[0][:3].upper()
        else:
            city_code = ''.join(t[0] for t in tokens[:3]).upper()

    abbrev = extract_abbreviation(name)
    return f"{country}-{province}-{city_code}-{institution_type}-{abbrev}"
def match_pending_to_locations(pending_files: List[Path], locations: Dict) -> Dict[Path, Tuple[str, Dict]]:
    """Map each PENDING file to its extracted-location record.

    Matching happens on normalized organization names: an exact match is
    preferred, with a fallback containment check in either direction.
    Files whose emic_name cannot be read are skipped.

    Returns:
        Dict of PENDING file path -> (original organization name, record).
    """
    # Index the location records by normalized organization name.
    by_norm_name = {
        normalize_name(org): (org, record)
        for org, record in locations.items()
    }
    matches: Dict[Path, Tuple[str, Dict]] = {}
    for path in pending_files:
        emic_name = extract_emic_name_from_pending(path)
        if not emic_name:
            continue
        key = normalize_name(emic_name)
        hit = by_norm_name.get(key)
        if hit is None:
            # Fuzzy fallback: accept the first entry where one normalized
            # name contains the other.
            for norm_loc, candidate in by_norm_name.items():
                if key in norm_loc or norm_loc in key:
                    hit = candidate
                    break
        if hit is not None:
            matches[path] = hit
    return matches
def get_institution_type_from_file(filepath: Path) -> str:
    """Map a PENDING file's ``institution_type`` field to its 1-letter code.

    Args:
        filepath: Path of the PENDING custodian YAML file.

    Returns:
        Single-letter type code; defaults to 'M' (museum) when the file is
        unreadable, malformed, or carries an unknown institution type.
    """
    type_map = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort default
        # without swallowing SystemExit/KeyboardInterrupt.
        return 'M'
    # safe_load may yield None or a non-mapping value.
    if not isinstance(data, dict):
        return 'M'
    return type_map.get(data.get('institution_type', 'MUSEUM'), 'M')
def apply_location_to_pending(pending_path: Path, location_data: Dict, dry_run: bool = True) -> Optional[Path]:
    """Apply location data to a PENDING file and rename.

    Generates a proper GHCID from the file's emic_name plus the extracted
    location, writes the location and resolution provenance back into the
    YAML in place, then renames the file to ``<GHCID>.yaml``.

    Args:
        pending_path: PENDING YAML file to update.
        location_data: Extracted record; must contain a 'location' dict and
            may carry a 'source_file' provenance entry.
        dry_run: When True, only print what would happen; no file changes.

    Returns:
        The new file path after renaming, or None on dry-run or error.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type_from_file(pending_path)
        location = location_data['location']
        # Generate new GHCID
        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        # Check for collision with an existing file under the target name
        new_filename = f"{new_ghcid}.yaml"
        new_path = CUSTODIAN_DIR / new_filename
        if new_path.exists() and new_path != pending_path:
            # Collision - disambiguate by appending a slug of the name
            # (non-word runs collapsed to '-', capped at 30 chars).
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_filename = f"{new_ghcid}.yaml"
            new_path = CUSTODIAN_DIR / new_filename
        # Update data
        data['ghcid_current'] = new_ghcid
        # Add location data if not present
        if 'location' not in data:
            data['location'] = {}
        data['location']['city'] = location.get('city')
        data['location']['country'] = location.get('country')
        if location.get('postal_code'):
            data['location']['postal_code'] = location.get('postal_code')
        if location.get('street'):
            data['location']['street'] = location.get('street')
        # Add resolution provenance (how and when this GHCID was resolved)
        if 'ghcid_resolution' not in data:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'linkedin_about_page_extraction'
        # Timezone-aware UTC timestamp in ISO-8601 form.
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['source_file'] = location_data.get('source_file')
        if dry_run:
            print(f" Would rename: {pending_path.name}")
            print(f" to: {new_filename}")
            return None
        else:
            # Write updated data back in place first, then rename, so the
            # file never exists under the new name without the new content.
            with open(pending_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            # Rename file
            shutil.move(pending_path, new_path)
            print(f" Renamed: {pending_path.name} -> {new_filename}")
            return new_path
    except Exception as e:
        print(f" Error processing {pending_path.name}: {e}")
        return None
def main():
    """CLI entry point: match PENDING files to LinkedIn locations and apply."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--apply', action='store_true', help='Actually apply changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    args = parser.parse_args()

    # Require an explicit mode so nothing happens by accident.
    if not args.dry_run and not args.apply:
        print("Please specify --dry-run or --apply")
        return
    dry_run = not args.apply

    # Load the extracted LinkedIn locations.
    print(f"Loading locations from {LOCATIONS_FILE}...")
    with open(LOCATIONS_FILE, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)
    locations = payload.get('organizations', {})
    print(f"Loaded {len(locations)} organization locations")

    # Collect the PENDING custodian files still awaiting a proper GHCID.
    pending_files = list(CUSTODIAN_DIR.glob("*-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")
    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to extracted locations...")
    matches = match_pending_to_locations(pending_files, locations)
    print(f"Found {len(matches)} matches")

    # Summarize the matches per country, most frequent first.
    country_counts = defaultdict(int)
    for _path, (_org, record) in matches.items():
        country_counts[record['location'].get('country', 'XX')] += 1
    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda item: -item[1]):
        print(f" {country}: {count}")

    # Apply (or preview) the resolution for every match.
    print(f"\n{'DRY RUN - ' if dry_run else ''}Applying locations...")
    success = 0
    failed = 0
    for path, (_org, record) in matches.items():
        result = apply_location_to_pending(path, record, dry_run=dry_run)
        if dry_run or result:
            success += 1
        else:
            failed += 1
    print(f"\n{'Would resolve' if dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")

    # Files with no matching location stay PENDING.
    remaining = len(pending_files) - len(matches)
    print(f"Remaining PENDING (no location match): {remaining}")


if __name__ == '__main__':
    main()