#!/usr/bin/env python3
"""
Apply extracted LinkedIn locations to PENDING files.

This script:
1. Loads locations extracted from LinkedIn About pages
2. Matches PENDING files to extracted locations by organization name
3. Generates proper GHCIDs with location data
4. Renames files with correct GHCIDs

Usage:
    python scripts/apply_linkedin_locations.py --dry-run
    python scripts/apply_linkedin_locations.py --apply
"""
# Standard library
import json
import re
import shutil
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Third-party
import yaml
|
# Paths
|
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
|
|
ARCHIVE_DIR = CUSTODIAN_DIR / "archive" / "pending_resolved_20250109"
|
|
|
|
# Extended city to province mapping
|
|
CITY_TO_PROVINCE = {
|
|
# Noord-Holland
|
|
'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
|
|
'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
|
|
'amstelveen': 'NH', 'heemstede': 'NH', 'beverwijk': 'NH', 'velsen': 'NH',
|
|
'castricum': 'NH', 'huizen': 'NH', 'bussum': 'NH', 'naarden': 'NH',
|
|
'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH', 'texel': 'NH',
|
|
'den helder': 'NH', 'schagen': 'NH', 'heerhugowaard': 'NH',
|
|
|
|
# Zuid-Holland
|
|
'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
|
|
'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
|
|
'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH', 'spijkenisse': 'ZH',
|
|
'alphen aan den rijn': 'ZH', 'katwijk': 'ZH', 'leidschendam': 'ZH',
|
|
'voorburg': 'ZH', 'rijswijk': 'ZH', 'wassenaar': 'ZH', 'oegstgeest': 'ZH',
|
|
'voorschoten': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'sassenheim': 'ZH',
|
|
'gorinchem': 'ZH', 'schoonhoven': 'ZH', 'woerden': 'ZH',
|
|
|
|
# Utrecht
|
|
'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
|
|
'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
|
|
'bunnik': 'UT', 'maarssen': 'UT', 'de bilt': 'UT', 'bilthoven': 'UT',
|
|
'doorn': 'UT', 'driebergen': 'UT', 'wijk bij duurstede': 'UT',
|
|
|
|
# Gelderland
|
|
'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
|
|
'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
|
|
'zevenaar': 'GE', 'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE',
|
|
'elburg': 'GE', 'nunspeet': 'GE', 'putten': 'GE', 'ermelo': 'GE',
|
|
'epe': 'GE', 'hattem': 'GE', 'zaltbommel': 'GE', 'winterswijk': 'GE',
|
|
'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE', 'buren': 'GE',
|
|
|
|
# Noord-Brabant
|
|
'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
|
|
'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
|
|
'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'veghel': 'NB',
|
|
'boxtel': 'NB', 'oisterwijk': 'NB', 'vught': 'NB', 'cuijk': 'NB',
|
|
'deurne': 'NB', 'geldrop': 'NB', 'mierlo': 'NB', 'nuenen': 'NB',
|
|
'valkenswaard': 'NB', 'heeze': 'NB', 'best': 'NB', 'son': 'NB',
|
|
'etten-leur': 'NB', 'oosterhout': 'NB', 'dongen': 'NB', 'gilze': 'NB',
|
|
|
|
# Limburg
|
|
'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
|
|
'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'kerkrade': 'LI',
|
|
'valkenburg': 'LI', 'vaals': 'LI', 'meerssen': 'LI', 'brunssum': 'LI',
|
|
'landgraaf': 'LI', 'stein': 'LI', 'beek': 'LI', 'eijsden': 'LI',
|
|
'gulpen': 'LI', 'margraten': 'LI', 'simpelveld': 'LI',
|
|
|
|
# Overijssel
|
|
'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
|
|
'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'hardenberg': 'OV',
|
|
'rijssen': 'OV', 'holten': 'OV', 'raalte': 'OV', 'ommen': 'OV',
|
|
'dalfsen': 'OV', 'staphorst': 'OV', 'giethoorn': 'OV', 'hasselt': 'OV',
|
|
'steenwijk': 'OV', 'vollenhove': 'OV', 'blokzijl': 'OV',
|
|
|
|
# Friesland
|
|
'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
|
|
'harlingen': 'FR', 'dokkum': 'FR', 'bolsward': 'FR', 'franeker': 'FR',
|
|
'joure': 'FR', 'lemmer': 'FR', 'workum': 'FR', 'makkum': 'FR',
|
|
'hindeloopen': 'FR', 'stavoren': 'FR', 'sloten': 'FR',
|
|
|
|
# Drenthe
|
|
'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
|
|
'coevorden': 'DR', 'borger': 'DR', 'dwingeloo': 'DR', 'westerbork': 'DR',
|
|
|
|
# Groningen
|
|
'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR', 'hoogezand': 'GR',
|
|
'stadskanaal': 'GR', 'delfzijl': 'GR', 'appingedam': 'GR', 'ter apel': 'GR',
|
|
'leek': 'GR', 'marum': 'GR', 'zuidhorn': 'GR', 'uithuizen': 'GR',
|
|
|
|
# Zeeland
|
|
'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
|
|
'hulst': 'ZE', 'zierikzee': 'ZE', 'veere': 'ZE', 'domburg': 'ZE',
|
|
'sluis': 'ZE', 'yerseke': 'ZE',
|
|
|
|
# Flevoland
|
|
'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'emmeloord': 'FL',
|
|
'urk': 'FL', 'zeewolde': 'FL', 'biddinghuizen': 'FL',
|
|
}
|
|
|
|
# City to 3-letter code (extended)
|
|
CITY_TO_CODE = {
|
|
'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
|
|
"'s-gravenhage": 'DHA', 'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL',
|
|
'groningen': 'GRO', 'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ',
|
|
'apeldoorn': 'APE', 'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS',
|
|
'amersfoort': 'AME', 'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
|
|
'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
|
|
'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
|
|
'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
|
|
'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
|
|
'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
|
|
'veenendaal': 'VEE', 'harderwijk': 'HAR', 'doetinchem': 'DOE', 'zutphen': 'ZUT',
|
|
'helmond': 'HEL', 'oss': 'OSS', 'roosendaal': 'ROS', 'bergen op zoom': 'BOZ',
|
|
'hengelo': 'HEN', 'almelo': 'ALO', 'kampen': 'KAM', 'sneek': 'SNE',
|
|
'heerenveen': 'HEV', 'drachten': 'DRA', 'emmen': 'EMM', 'hoogeveen': 'HOV',
|
|
'meppel': 'MEP', 'lelystad': 'LEL', 'vlissingen': 'VLI', 'goes': 'GOE',
|
|
'terneuzen': 'TER', 'zoetermeer': 'ZOE', 'spijkenisse': 'SPI', 'purmerend': 'PUR',
|
|
}
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a canonical form of *name* for fuzzy organization matching.

    Applies NFKD Unicode decomposition, lowercases and strips the string,
    removes anything that is neither a word character nor whitespace, and
    collapses internal whitespace runs to single spaces.
    """
    decomposed = unicodedata.normalize('NFKD', name).lower().strip()
    without_punct = re.sub(r'[^\w\s]', '', decomposed)
    return ' '.join(without_punct.split())


def extract_emic_name_from_pending(filepath: Path) -> Optional[str]:
    """Extract the custodian's emic_name from a PENDING YAML file.

    Returns None when the file cannot be read, is not valid YAML, or does
    not contain a ``custodian_name.emic_name`` entry.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Fix over the original bare `except:`: only swallow read/parse
        # failures instead of every exception (incl. KeyboardInterrupt).
        return None

    # safe_load returns None for empty documents and may return a scalar
    # or list for degenerate ones; treat all of those as "no name".
    if not isinstance(data, dict):
        return None

    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        return custodian.get('emic_name')
    return None


def extract_abbreviation(name: str) -> str:
    """Derive an uppercase abbreviation from an organization name.

    Stop words (Dutch/English articles, prepositions and generic
    institution terms) and single-character tokens are ignored.  One
    remaining word yields its first four letters; multiple words yield an
    acronym of up to five initials.  Returns 'XXX' when nothing usable is
    left (e.g. an empty name).
    """
    # Fix: the original set listed 'of' twice (Dutch and English overlap);
    # duplicates in a set literal are harmless but misleading.
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'for', 'to', 'at', 'by',
        'museum', 'archief', 'bibliotheek', 'stichting', 'vereniging', 'centrum',
    }

    # Replace punctuation with spaces so hyphenated names split cleanly.
    name_clean = re.sub(r'[^\w\s]', ' ', name)
    words = [w for w in name_clean.split() if w.lower() not in skip_words and len(w) > 1]

    if not words:
        # Everything was filtered out (e.g. "Het Museum"): fall back to the
        # first few raw tokens rather than producing nothing.
        words = name_clean.split()[:3]

    if len(words) == 1:
        abbrev = words[0][:4].upper()
    else:
        abbrev = ''.join(w[0] for w in words[:5]).upper()

    return abbrev if abbrev else 'XXX'


def normalize_country_code(country_raw: str) -> str:
    """Reduce a free-form country field to a 2-letter country code.

    Handles values that are already two-letter codes ("NL"), values with a
    trailing code after region/postal text ("ZUID-HOLLAND NL",
    "JAWA TENGAH 57141 ID"), and a set of well-known full country names.
    Returns 'XX' when no code can be determined.
    """
    if not country_raw:
        return 'XX'

    text = country_raw.upper().strip()

    # Already a bare 2-letter alphabetic code.
    if len(text) == 2 and text.isalpha():
        return text

    # Scan tokens right-to-left: the country code normally trails the
    # region and postal information.
    for token in reversed(text.split()):
        if len(token) == 2 and token.isalpha():
            return token

    # Fall back to substring matching against known country names.
    country_mappings = {
        'NETHERLANDS': 'NL', 'NEDERLAND': 'NL',
        'FRANCE': 'FR', 'GERMANY': 'DE', 'DEUTSCHLAND': 'DE',
        'BELGIUM': 'BE', 'BELGIË': 'BE', 'BELGIQUE': 'BE',
        'UNITED KINGDOM': 'GB', 'UK': 'GB', 'ENGLAND': 'GB', 'SCOTLAND': 'GB',
        'ITALY': 'IT', 'ITALIA': 'IT',
        'SPAIN': 'ES', 'ESPAÑA': 'ES',
        'PORTUGAL': 'PT',
        'AUSTRIA': 'AT', 'ÖSTERREICH': 'AT',
        'SWITZERLAND': 'CH', 'SCHWEIZ': 'CH', 'SUISSE': 'CH',
        'DENMARK': 'DK', 'DANMARK': 'DK',
        'SWEDEN': 'SE', 'SVERIGE': 'SE',
        'NORWAY': 'NO', 'NORGE': 'NO',
        'FINLAND': 'FI', 'SUOMI': 'FI',
        'INDONESIA': 'ID',
        'UNITED STATES': 'US', 'USA': 'US',
        'CANADA': 'CA',
        'AUSTRALIA': 'AU',
    }

    for full_name, code in country_mappings.items():
        if full_name in text:
            return code

    return 'XX'


def generate_ghcid(name: str, location: Dict, institution_type: str = 'M') -> str:
    """Build a GHCID string: COUNTRY-PROVINCE-CITY-TYPE-ABBREV.

    Province and city codes are taken from *location* when present,
    otherwise looked up in CITY_TO_PROVINCE / CITY_TO_CODE, and finally
    derived from the city name itself.
    """
    country = normalize_country_code(location.get('country', 'XX'))
    city_lower = location.get('city', '').lower().strip()

    # Province: explicit value wins, then the lookup table, then 'XX'.
    province_code = location.get('province_code') or CITY_TO_PROVINCE.get(city_lower, 'XX')

    # City code: explicit (and non-placeholder) value wins, then the
    # lookup table, then a code derived from the city name.
    city_code = location.get('city_code')
    if not city_code or city_code == 'XXX':
        city_code = CITY_TO_CODE.get(city_lower)
        if not city_code:
            # Single word: first three letters; multiple words: initials
            # of up to three words (hyphens treated as separators).
            tokens = city_lower.replace('-', ' ').split()
            if len(tokens) == 1:
                city_code = tokens[0][:3].upper()
            else:
                city_code = ''.join(t[0] for t in tokens[:3]).upper()

    abbrev = extract_abbreviation(name)

    return f"{country}-{province_code}-{city_code}-{institution_type}-{abbrev}"


def match_pending_to_locations(pending_files: List[Path], locations: Dict) -> Dict[Path, Tuple[str, Dict]]:
    """Pair each PENDING file with an extracted LinkedIn location.

    Matching is on normalized organization names: an exact match is
    preferred, then a substring match in either direction.
    NOTE(review): substring matching can pair very short names with
    unrelated organizations — spot-check results before applying.
    """
    # Index the extracted locations by normalized name.  Duplicates
    # collapse to the last-seen entry, as in the source data order.
    by_norm_name = {
        normalize_name(org_name): (org_name, data)
        for org_name, data in locations.items()
    }

    matches: Dict[Path, Tuple[str, Dict]] = {}

    for pending_path in pending_files:
        emic_name = extract_emic_name_from_pending(pending_path)
        if not emic_name:
            continue

        key = normalize_name(emic_name)

        # Exact normalized-name match.
        exact = by_norm_name.get(key)
        if exact is not None:
            matches[pending_path] = exact
            continue

        # Fuzzy match: accept when one normalized name contains the other.
        for candidate_key, entry in by_norm_name.items():
            if key in candidate_key or candidate_key in key:
                matches[pending_path] = entry
                break

    return matches


def get_institution_type_from_file(filepath: Path) -> str:
    """Map a PENDING file's institution_type field to its 1-letter GHCID code.

    Returns 'M' (museum) for unreadable files, malformed YAML, missing or
    unrecognized institution types.
    """
    # institution_type string -> single-letter GHCID type code.
    type_map = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Fix over the original bare `except:`: only read/parse failures
        # default to museum; other exceptions propagate.
        return 'M'

    # Empty or scalar YAML documents carry no institution_type.
    if not isinstance(data, dict):
        return 'M'

    inst_type = data.get('institution_type', 'MUSEUM')
    # Non-string values (e.g. a list) cannot be looked up; default to 'M'
    # as the bare except in the original effectively did.
    if not isinstance(inst_type, str):
        return 'M'
    return type_map.get(inst_type, 'M')


def apply_location_to_pending(pending_path: Path, location_data: Dict, dry_run: bool = True) -> Optional[Path]:
    """Apply location data to a PENDING file and rename.

    Rewrites the YAML with the new GHCID, location fields and resolution
    provenance, then renames the file to ``<new_ghcid>.yaml``.

    location_data is expected to hold a 'location' dict and optionally a
    'source_file' entry — inferred from the accesses below.

    Returns the new Path on success, or None on dry run / any error
    (errors are printed, never raised).
    """

    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type_from_file(pending_path)
        location = location_data['location']

        # Generate new GHCID
        new_ghcid = generate_ghcid(emic_name, location, inst_type)

        # Check for collision
        new_filename = f"{new_ghcid}.yaml"
        new_path = CUSTODIAN_DIR / new_filename

        if new_path.exists() and new_path != pending_path:
            # Collision - add name suffix
            # NOTE(review): the suffixed path is not re-checked for
            # existence — a second collision would overwrite; confirm
            # this is acceptable.
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_filename = f"{new_ghcid}.yaml"
            new_path = CUSTODIAN_DIR / new_filename

        # Update data
        data['ghcid_current'] = new_ghcid

        # Add location data if not present
        if 'location' not in data:
            data['location'] = {}

        # city/country are always overwritten; street/postal_code only
        # when the extracted location actually has them.
        data['location']['city'] = location.get('city')
        data['location']['country'] = location.get('country')
        if location.get('postal_code'):
            data['location']['postal_code'] = location.get('postal_code')
        if location.get('street'):
            data['location']['street'] = location.get('street')

        # Add resolution provenance
        if 'ghcid_resolution' not in data:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'linkedin_about_page_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['source_file'] = location_data.get('source_file')

        if dry_run:
            print(f"  Would rename: {pending_path.name}")
            print(f"            to: {new_filename}")
            return None
        else:
            # Write updated data
            # The file is rewritten in place first, then renamed: if the
            # move fails, the updated data survives under the old name.
            with open(pending_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            # Rename file
            shutil.move(pending_path, new_path)
            print(f"  Renamed: {pending_path.name} -> {new_filename}")
            return new_path

    except Exception as e:
        # Broad catch is deliberate: one bad file must not abort the batch.
        print(f"  Error processing {pending_path.name}: {e}")
        return None


def main():
    """CLI entry point: match PENDING files to locations and resolve them."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--apply', action='store_true', help='Actually apply changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    args = parser.parse_args()

    # Refuse to run without an explicit mode choice.
    if not args.dry_run and not args.apply:
        print("Please specify --dry-run or --apply")
        return

    dry_run = not args.apply

    # Load the LinkedIn-extracted locations.
    print(f"Loading locations from {LOCATIONS_FILE}...")
    with open(LOCATIONS_FILE, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    locations = payload.get('organizations', {})
    print(f"Loaded {len(locations)} organization locations")

    # Collect unresolved files (placeholder GHCID pattern in the name).
    pending_files = list(CUSTODIAN_DIR.glob("*-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")

    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to extracted locations...")
    matches = match_pending_to_locations(pending_files, locations)
    print(f"Found {len(matches)} matches")

    # Summarize matches per country for a quick sanity check.
    country_counts = defaultdict(int)
    for _, entry in matches.values():
        country_counts[entry['location'].get('country', 'XX')] += 1

    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda item: item[1], reverse=True):
        print(f"  {country}: {count}")

    print(f"\n{'DRY RUN - ' if dry_run else ''}Applying locations...")

    success = failed = 0
    for target, (_, loc_payload) in matches.items():
        outcome = apply_location_to_pending(target, loc_payload, dry_run=dry_run)
        # Dry runs always count as success; real runs need a returned path.
        if dry_run or outcome:
            success += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")

    # Files with no extracted location remain PENDING.
    remaining = len(pending_files) - len(matches)
    print(f"Remaining PENDING (no location match): {remaining}")


# Script entry point guard: run only when executed directly.
if __name__ == '__main__':
    main()
|