#!/usr/bin/env python3
"""
Resolve PENDING files using Wikidata location lookup.
This script:
1. Searches Wikidata for organization by emic name
2. Gets location (P131) from Wikidata
3. Maps location to Dutch province/city code
4. Assigns proper GHCID and renames file
Usage:
python scripts/resolve_pending_wikidata.py --dry-run # Preview
python scripts/resolve_pending_wikidata.py --limit 50 # Process 50 files
python scripts/resolve_pending_wikidata.py # Process all
"""
import os
import re
import yaml
import time
import requests
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple

# Dutch city to province/code mapping
CITY_MAPPING = {
    'amsterdam': ('NH', 'AMS'),
    'the hague': ('ZH', 'DHA'),
    'den haag': ('ZH', 'DHA'),
    "'s-gravenhage": ('ZH', 'DHA'),
    'rotterdam': ('ZH', 'ROT'),
    'utrecht': ('UT', 'UTR'),
    'eindhoven': ('NB', 'EIN'),
    'groningen': ('GR', 'GRO'),
    'tilburg': ('NB', 'TIL'),
    'breda': ('NB', 'BRE'),
    'nijmegen': ('GE', 'NIJ'),
    'haarlem': ('NH', 'HAA'),
    'arnhem': ('GE', 'ARN'),
    'apeldoorn': ('GE', 'APE'),
    'maastricht': ('LI', 'MAA'),
    'leiden': ('ZH', 'LEI'),
    'dordrecht': ('ZH', 'DOR'),
    'zwolle': ('OV', 'ZWO'),
    'deventer': ('OV', 'DEV'),
    'delft': ('ZH', 'DEL'),
    'alkmaar': ('NH', 'ALK'),
    'gouda': ('ZH', 'GOU'),
    'hilversum': ('NH', 'HIL'),
    'middelburg': ('ZE', 'MID'),
    'leeuwarden': ('FR', 'LEE'),
    'assen': ('DR', 'ASS'),
    'amersfoort': ('UT', 'AME'),
    'lelystad': ('FL', 'LEL'),
    'enschede': ('OV', 'ENS'),
    'almere': ('FL', 'ALM'),
    'wageningen': ('GE', 'WAG'),
    'hoorn': ('NH', 'HOO'),
    "'s-hertogenbosch": ('NB', 'SHE'),  # Wikidata label keeps the leading apostrophe
    's-hertogenbosch': ('NB', 'SHE'),
    'den bosch': ('NB', 'SHE'),
}

# Institution type mapping
TYPE_CORRECTIONS = {
    'ministerie': 'O',  # Official/Government
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'dienst': 'O',
    'academie': 'E',  # Education
    'academy': 'E',
    'university': 'E',
    'universiteit': 'E',
    'hogeschool': 'E',
    'school': 'E',
    'museum': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'stichting': 'N',  # NGO
    'foundation': 'N',
    'vereniging': 'S',  # Society
    'association': 'S',
}


def search_wikidata(query: str) -> Optional[str]:
    """Search Wikidata for an entity and return its QID, if any."""
    url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'search': query,
        'language': 'en',
        'format': 'json',
        'limit': 1
    }
    try:
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get('search'):
            return data['search'][0]['id']
    except Exception:
        # Network or parsing errors are treated as "no match"
        pass
    return None
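
# Example call (illustrative only; the QID below is a placeholder, not a real result):
#   search_wikidata("Koninklijke Bibliotheek")  ->  "Q123456", or None when nothing matches
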


def get_location_from_wikidata(entity_id: str) -> Optional[str]:
    """Get the location (P131, administrative territorial entity) label for a Wikidata entity."""
    sparql = f"""
    SELECT ?locationLabel WHERE {{
      wd:{entity_id} wdt:P131 ?location.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl". }}
    }}
    LIMIT 1
    """
    url = "https://query.wikidata.org/sparql"
    try:
        resp = requests.get(url, params={'query': sparql, 'format': 'json'}, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        bindings = data.get('results', {}).get('bindings', [])
        if bindings:
            return bindings[0].get('locationLabel', {}).get('value', '')
    except Exception:
        # Query failures are treated as "no location found"
        pass
    return None
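
# Example call (illustrative; the label depends on the entity's P131 claim and on
# which of the "en,nl" labels is available):
#   get_location_from_wikidata("Q123456")  ->  "Amsterdam", or None if P131 is missing
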


def get_province_city(location: str) -> Tuple[Optional[str], Optional[str]]:
    """Map location to province and city code."""
    if not location:
        return None, None
    location_lower = location.lower().strip()
    if location_lower in CITY_MAPPING:
        return CITY_MAPPING[location_lower]
    return None, None
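
# Examples (values taken from CITY_MAPPING above):
#   get_province_city("Amsterdam")  ->  ('NH', 'AMS')
#   get_province_city("Paris")      ->  (None, None)   # city not in the mapping
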


def infer_institution_type(name: str) -> Optional[str]:
    """Infer institution type from name."""
    name_lower = name.lower()
    for keyword, type_code in TYPE_CORRECTIONS.items():
        if keyword in name_lower:
            return type_code
    return None
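
# Examples (hypothetical names; the first matching keyword in TYPE_CORRECTIONS wins):
#   infer_institution_type("Universiteit Leiden")   ->  'E'
#   infer_institution_type("Zorgcentrum De Linde")  ->  None  # no keyword matches
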


def generate_abbreviation(name: str) -> str:
    """Generate abbreviation from name."""
    # Skip common words
    skip = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
            'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on'}
    words = name.replace('-', ' ').replace("'", ' ').split()
    abbrev = ''.join(w[0].upper() for w in words if w and w.lower() not in skip)
    return abbrev[:8] if abbrev else 'UNK'
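
# Examples (illustrative names; hyphens/apostrophes become word breaks, stop words are skipped):
#   generate_abbreviation("Koninklijke Bibliotheek")        ->  'KB'
#   generate_abbreviation("Museum van de Twintigste Eeuw")  ->  'MTE'
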


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load YAML file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception:
        # Unreadable or invalid YAML is treated as "no data"
        return None


def save_yaml(filepath: Path, data: Dict):
    """Save YAML file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()
    custodian_dir = args.custodian_dir

    print("=" * 80)
    print("RESOLVING PENDING FILES VIA WIKIDATA")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files only
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]
    print(f"Processing {len(pending_files)} files...")
    print()

    resolved = 0
    failed = 0
    skipped = 0

    for i, filepath in enumerate(pending_files):
        data = load_yaml(filepath)
        if not data:
            continue
        name = (data.get('custodian_name') or {}).get('emic_name', '')
        if not name:
            continue

        # Rate limiting
        if i > 0 and i % 10 == 0:
            time.sleep(1)

        # Search Wikidata
        entity_id = search_wikidata(name)
        if not entity_id:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: No Wikidata match")
            continue

        # Get location
        location = get_location_from_wikidata(entity_id)
        province, city_code = get_province_city(location)
        if not province or not city_code:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: Location '{location}' not mapped")
            continue

        # Infer type
        inst_type = infer_institution_type(name)
        if not inst_type:
            inst_type = (data.get('institution_type') or 'M')[0]  # First letter; default 'M'

        # Generate abbreviation
        abbrev = generate_abbreviation(name)

        # New GHCID
        new_ghcid = f"NL-{province}-{city_code}-{inst_type}-{abbrev}"
        new_filepath = custodian_dir / f"{new_ghcid}.yaml"

        # Check collision
        if new_filepath.exists():
            skipped += 1
            if args.dry_run and skipped <= 10:
                print(f"[COLLISION] {name[:40]} -> {new_ghcid}")
            continue
print(f"[{'DRY RUN' if args.dry_run else 'RESOLVE'}] {name[:40]}")
print(f" Wikidata: {entity_id}, Location: {location}")
print(f" {filepath.name} -> {new_filepath.name}")
if not args.dry_run:
# Update data
data['ghcid_current'] = new_ghcid
# Add provenance
if 'provenance' not in data:
data['provenance'] = {}
notes = data['provenance'].get('notes', [])
if isinstance(notes, str):
notes = [notes]
notes.append(f"GHCID resolved via Wikidata {entity_id} on {datetime.now(timezone.utc).isoformat()}")
data['provenance']['notes'] = notes
# Save and rename
save_yaml(new_filepath, data)
filepath.unlink()
resolved += 1
        print()

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Resolved: {resolved}")
    print(f"Failed (no match/location): {failed}")
    print(f"Skipped (collision): {skipped}")


if __name__ == '__main__':
    main()