#!/usr/bin/env python3
"""
Resolve PENDING files using Wikidata location lookup.

This script:
1. Searches Wikidata for organization by emic name
2. Gets location (P131) from Wikidata
3. Maps location to Dutch province/city code
4. Assigns proper GHCID and renames file

Usage:
    python scripts/resolve_pending_wikidata.py --dry-run    # Preview
    python scripts/resolve_pending_wikidata.py --limit 50   # Process 50 files
    python scripts/resolve_pending_wikidata.py              # Process all
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, Optional, Tuple
|
|
|
|
# Dutch city to province/code mapping.
# Keys are lower-cased city names (including Dutch/English spelling
# variants for the same city, e.g. 'the hague' / 'den haag');
# values are (province_code, city_code) pairs used to build GHCIDs.
CITY_MAPPING: Dict[str, Tuple[str, str]] = {
    'amsterdam': ('NH', 'AMS'),
    'the hague': ('ZH', 'DHA'),
    'den haag': ('ZH', 'DHA'),
    "'s-gravenhage": ('ZH', 'DHA'),
    'rotterdam': ('ZH', 'ROT'),
    'utrecht': ('UT', 'UTR'),
    'eindhoven': ('NB', 'EIN'),
    'groningen': ('GR', 'GRO'),
    'tilburg': ('NB', 'TIL'),
    'breda': ('NB', 'BRE'),
    'nijmegen': ('GE', 'NIJ'),
    'haarlem': ('NH', 'HAA'),
    'arnhem': ('GE', 'ARN'),
    'apeldoorn': ('GE', 'APE'),
    'maastricht': ('LI', 'MAA'),
    'leiden': ('ZH', 'LEI'),
    'dordrecht': ('ZH', 'DOR'),
    'zwolle': ('OV', 'ZWO'),
    'deventer': ('OV', 'DEV'),
    'delft': ('ZH', 'DEL'),
    'alkmaar': ('NH', 'ALK'),
    'gouda': ('ZH', 'GOU'),
    'hilversum': ('NH', 'HIL'),
    'middelburg': ('ZE', 'MID'),
    'leeuwarden': ('FR', 'LEE'),
    'assen': ('DR', 'ASS'),
    'amersfoort': ('UT', 'AME'),
    'lelystad': ('FL', 'LEL'),
    'enschede': ('OV', 'ENS'),
    'almere': ('FL', 'ALM'),
    'wageningen': ('GE', 'WAG'),
    'hoorn': ('NH', 'HOO'),
    's-hertogenbosch': ('NB', 'SHE'),
    'den bosch': ('NB', 'SHE'),
}
|
# Institution type mapping: lower-cased keyword (matched as a substring
# of the organization name) -> single-letter institution type code.
# NOTE(review): infer_institution_type() iterates this dict in insertion
# order and the first matching keyword wins, so more specific keywords
# should stay ahead of more generic ones (e.g. 'hogeschool' before
# 'school' — both map to 'E' here, so ordering is currently harmless).
TYPE_CORRECTIONS: Dict[str, str] = {
    'ministerie': 'O',  # Official/Government
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'dienst': 'O',
    'academie': 'E',  # Education
    'academy': 'E',
    'university': 'E',
    'universiteit': 'E',
    'hogeschool': 'E',
    'school': 'E',
    'museum': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'stichting': 'N',  # NGO
    'foundation': 'N',
    'vereniging': 'S',  # Society
    'association': 'S',
}
|
def search_wikidata(query: str) -> Optional[str]:
    """Search Wikidata for an entity and return its QID.

    Uses the ``wbsearchentities`` MediaWiki API with an English-language
    search and takes the top-ranked hit.

    Args:
        query: Free-text name of the organization to look up.

    Returns:
        The Wikidata entity ID (e.g. ``'Q42'``) of the best match, or
        ``None`` when there is no match or the request fails — callers
        treat this as a best-effort lookup.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'search': query,
        'language': 'en',
        'format': 'json',
        'limit': 1
    }
    try:
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get('search'):
            return data['search'][0]['id']
    except (requests.RequestException, ValueError, KeyError, IndexError):
        # Was `except Exception as e: pass` — narrowed so only network,
        # HTTP, JSON-decode, or response-shape errors resolve to "no
        # match"; genuine bugs and KeyboardInterrupt now propagate.
        pass
    return None
|
def get_location_from_wikidata(entity_id: str) -> Optional[str]:
    """Get the administrative location (P131) label for a Wikidata entity.

    Runs a SPARQL query against the Wikidata Query Service and returns
    the label of the entity's "located in the administrative territorial
    entity" (P131) value, preferring English and falling back to Dutch.

    Args:
        entity_id: Wikidata QID (e.g. ``'Q42'``).

    Returns:
        The location label, ``''`` when the binding has no value, or
        ``None`` when the entity has no P131 or the query fails.
    """
    sparql = f"""
    SELECT ?locationLabel WHERE {{
      wd:{entity_id} wdt:P131 ?location.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl". }}
    }}
    LIMIT 1
    """
    url = "https://query.wikidata.org/sparql"
    try:
        resp = requests.get(url, params={'query': sparql, 'format': 'json'}, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        bindings = data.get('results', {}).get('bindings', [])
        if bindings:
            return bindings[0].get('locationLabel', {}).get('value', '')
    except (requests.RequestException, ValueError):
        # Was `except Exception as e: pass` — narrowed so only network,
        # HTTP, or JSON-decode errors resolve to "no location".
        pass
    return None
|
def get_province_city(location: str) -> Tuple[Optional[str], Optional[str]]:
    """Map a location name to a ``(province_code, city_code)`` pair.

    The lookup is case-insensitive and ignores surrounding whitespace.
    Returns ``(None, None)`` for an empty or unrecognized location.
    """
    if not location:
        return None, None
    key = location.lower().strip()
    return CITY_MAPPING.get(key, (None, None))
|
def infer_institution_type(name: str) -> Optional[str]:
    """Infer the single-letter institution type code from a name.

    Scans TYPE_CORRECTIONS in insertion order and returns the code of
    the first keyword found as a substring of the lower-cased name, or
    ``None`` when no keyword matches.
    """
    lowered = name.lower()
    return next(
        (code for keyword, code in TYPE_CORRECTIONS.items() if keyword in lowered),
        None,
    )
|
def generate_abbreviation(name: str) -> str:
    """Generate an uppercase abbreviation (at most 8 chars) from a name.

    Hyphens and apostrophes are treated as word separators; Dutch and
    English stop words contribute no letter. Falls back to ``'UNK'``
    when nothing significant remains.
    """
    stop_words = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
                  'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on'}
    separated = name.replace('-', ' ').replace("'", ' ')
    initials = [word[0].upper()
                for word in separated.split()
                if word and word.lower() not in stop_words]
    if not initials:
        return 'UNK'
    return ''.join(initials)[:8]
|
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file, returning ``None`` on any read/parse failure.

    Args:
        filepath: Path to the YAML document.

    Returns:
        The parsed document (a dict for these custodian files), or
        ``None`` when the file cannot be opened or parsed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Was a bare `except:` — narrowed to I/O and YAML parse errors
        # so SystemExit/KeyboardInterrupt and real bugs are not hidden.
        return None
|
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as human-readable YAML.

    Output is block-style, keeps the document's key order, allows
    unicode characters, and wraps long scalars at 120 columns.
    """
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
|
def main():
    """Resolve NL PENDING custodian files via Wikidata lookups.

    For each ``NL-XX-XXX-PENDING-*.yaml`` file under the custodian
    directory: search Wikidata by the organization's emic name, fetch
    its P131 location, map that to a province/city code, build a new
    GHCID, and — unless ``--dry-run`` — rewrite the file under the new
    name with a provenance note and delete the old file.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    print("=" * 80)
    print("RESOLVING PENDING FILES VIA WIKIDATA")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files only
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    resolved = 0
    failed = 0
    skipped = 0

    for i, filepath in enumerate(pending_files):
        data = load_yaml(filepath)
        if not data:
            continue

        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue

        # Rate limiting: pause briefly every 10 files to be polite to
        # the Wikidata APIs.
        if i > 0 and i % 10 == 0:
            time.sleep(1)

        # Search Wikidata
        entity_id = search_wikidata(name)
        if not entity_id:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: No Wikidata match")
            continue

        # Get location
        location = get_location_from_wikidata(entity_id)
        province, city_code = get_province_city(location)

        if not province or not city_code:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: Location '{location}' not mapped")
            continue

        # Infer type from the name; otherwise fall back to the first
        # letter of the recorded institution_type.
        # BUGFIX: the original `data.get('institution_type', 'M')[0]`
        # raised TypeError when the key existed with a null value and
        # IndexError when it was an empty string; `or 'M'` covers both.
        inst_type = infer_institution_type(name)
        if not inst_type:
            inst_type = (data.get('institution_type') or 'M')[0]  # First letter

        # Generate abbreviation
        abbrev = generate_abbreviation(name)

        # New GHCID
        new_ghcid = f"NL-{province}-{city_code}-{inst_type}-{abbrev}"
        new_filepath = custodian_dir / f"{new_ghcid}.yaml"

        # Check collision: never overwrite an already-resolved file.
        if new_filepath.exists():
            skipped += 1
            if args.dry_run and skipped <= 10:
                print(f"[COLLISION] {name[:40]} -> {new_ghcid}")
            continue

        print(f"[{'DRY RUN' if args.dry_run else 'RESOLVE'}] {name[:40]}")
        print(f"  Wikidata: {entity_id}, Location: {location}")
        print(f"  {filepath.name} -> {new_filepath.name}")

        if not args.dry_run:
            # Update data
            data['ghcid_current'] = new_ghcid

            # Add provenance note recording how the GHCID was resolved.
            if 'provenance' not in data:
                data['provenance'] = {}
            notes = data['provenance'].get('notes', [])
            if isinstance(notes, str):
                notes = [notes]
            notes.append(f"GHCID resolved via Wikidata {entity_id} on {datetime.now(timezone.utc).isoformat()}")
            data['provenance']['notes'] = notes

            # Save under the new GHCID, then remove the PENDING file.
            save_yaml(new_filepath, data)
            filepath.unlink()

        resolved += 1
        print()

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Resolved: {resolved}")
    print(f"Failed (no match/location): {failed}")
    print(f"Skipped (collision): {skipped}")
|
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()