#!/usr/bin/env python3
"""
Resolve PENDING files using Wikidata location lookup.

This script:
1. Searches Wikidata for the organization by emic name
2. Gets the location (P131) from Wikidata
3. Maps the location to a Dutch province/city code
4. Assigns the proper GHCID and renames the file

Usage:
    python scripts/resolve_pending_wikidata.py --dry-run    # Preview
    python scripts/resolve_pending_wikidata.py --limit 50   # Process 50 files
    python scripts/resolve_pending_wikidata.py               # Process all
"""

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple

import requests
import yaml

# Dutch city to (province, city code) mapping
CITY_MAPPING = {
    'amsterdam': ('NH', 'AMS'),
    'the hague': ('ZH', 'DHA'),
    'den haag': ('ZH', 'DHA'),
    "'s-gravenhage": ('ZH', 'DHA'),
    'rotterdam': ('ZH', 'ROT'),
    'utrecht': ('UT', 'UTR'),
    'eindhoven': ('NB', 'EIN'),
    'groningen': ('GR', 'GRO'),
    'tilburg': ('NB', 'TIL'),
    'breda': ('NB', 'BRE'),
    'nijmegen': ('GE', 'NIJ'),
    'haarlem': ('NH', 'HAA'),
    'arnhem': ('GE', 'ARN'),
    'apeldoorn': ('GE', 'APE'),
    'maastricht': ('LI', 'MAA'),
    'leiden': ('ZH', 'LEI'),
    'dordrecht': ('ZH', 'DOR'),
    'zwolle': ('OV', 'ZWO'),
    'deventer': ('OV', 'DEV'),
    'delft': ('ZH', 'DEL'),
    'alkmaar': ('NH', 'ALK'),
    'gouda': ('ZH', 'GOU'),
    'hilversum': ('NH', 'HIL'),
    'middelburg': ('ZE', 'MID'),
    'leeuwarden': ('FR', 'LEE'),
    'assen': ('DR', 'ASS'),
    'amersfoort': ('UT', 'AME'),
    'lelystad': ('FL', 'LEL'),
    'enschede': ('OV', 'ENS'),
    'almere': ('FL', 'ALM'),
    'wageningen': ('GE', 'WAG'),
    'hoorn': ('NH', 'HOO'),
    's-hertogenbosch': ('NB', 'SHE'),
    'den bosch': ('NB', 'SHE'),
}

# Institution type mapping (first matching keyword wins)
TYPE_CORRECTIONS = {
    'ministerie': 'O',    # Official/Government
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'dienst': 'O',
    'academie': 'E',      # Education
    'academy': 'E',
    'university': 'E',
    'universiteit': 'E',
    'hogeschool': 'E',
    'school': 'E',
    'museum': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'stichting': 'N',     # NGO
    'foundation': 'N',
    'vereniging': 'S',    # Society
    'association': 'S',
}


def search_wikidata(query: str) -> Optional[str]:
    """Search Wikidata for an entity and return its ID (e.g. 'Q42'), or None."""
    url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'search': query,
        'language': 'en',
        'format': 'json',
        'limit': 1,
    }
    try:
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get('search'):
            return data['search'][0]['id']
    except Exception:
        # Network or parse errors are treated as "no match".
        pass
    return None


def get_location_from_wikidata(entity_id: str) -> Optional[str]:
    """Get the location label (P131) for a Wikidata entity, or None."""
    sparql = f"""
    SELECT ?locationLabel WHERE {{
      wd:{entity_id} wdt:P131 ?location.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl". }}
    }}
    LIMIT 1
    """
    url = "https://query.wikidata.org/sparql"
    try:
        resp = requests.get(url, params={'query': sparql, 'format': 'json'}, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        bindings = data.get('results', {}).get('bindings', [])
        if bindings:
            return bindings[0].get('locationLabel', {}).get('value', '')
    except Exception:
        # Network or parse errors are treated as "no location".
        pass
    return None


def get_province_city(location: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Map a location label to (province, city code), or (None, None)."""
    if not location:
        return None, None
    location_lower = location.lower().strip()
    if location_lower in CITY_MAPPING:
        return CITY_MAPPING[location_lower]
    return None, None


def infer_institution_type(name: str) -> Optional[str]:
    """Infer the institution type code from keywords in the name."""
    name_lower = name.lower()
    for keyword, type_code in TYPE_CORRECTIONS.items():
        if keyword in name_lower:
            return type_code
    return None


def generate_abbreviation(name: str) -> str:
    """Generate an abbreviation from the initials of the significant words."""
    # Skip common Dutch and English stop words
    skip = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
            'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on'}
    words = name.replace('-', ' ').replace("'", ' ').split()
    abbrev = ''.join(w[0].upper() for w in words if w.lower() not in skip and w)
    return abbrev[:8] if abbrev else 'UNK'


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file, returning None on read or parse errors."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        return None


def save_yaml(filepath: Path, data: Dict):
    """Save a YAML file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of files to process')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    print("=" * 80)
    print("RESOLVING PENDING FILES VIA WIKIDATA")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files only
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    resolved = 0
    failed = 0
    skipped = 0

    for i, filepath in enumerate(pending_files):
        data = load_yaml(filepath)
        if not data:
            continue

        name = data.get('custodian_name', {}).get('emic_name', '')
        if not name:
            continue

        # Rate limiting: pause after every 10 lookups
        if i > 0 and i % 10 == 0:
            time.sleep(1)

        # Search Wikidata
        entity_id = search_wikidata(name)
        if not entity_id:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: No Wikidata match")
            continue

        # Get location
        location = get_location_from_wikidata(entity_id)
        province, city_code = get_province_city(location)

        if not province or not city_code:
            failed += 1
            if args.dry_run and failed <= 10:
                print(f"[SKIP] {name[:50]}: Location '{location}' not mapped")
            continue

        # Infer type, falling back to the first letter of the existing field
        inst_type = infer_institution_type(name)
        if not inst_type:
            inst_type = (data.get('institution_type') or 'M')[0]

        # Generate abbreviation
        abbrev = generate_abbreviation(name)

        # New GHCID
        new_ghcid = f"NL-{province}-{city_code}-{inst_type}-{abbrev}"
        new_filepath = custodian_dir / f"{new_ghcid}.yaml"

        # Check for a filename collision with an already-resolved record
        if new_filepath.exists():
            skipped += 1
            if args.dry_run and skipped <= 10:
                print(f"[COLLISION] {name[:40]} -> {new_ghcid}")
            continue

        print(f"[{'DRY RUN' if args.dry_run else 'RESOLVE'}] {name[:40]}")
        print(f"  Wikidata: {entity_id}, Location: {location}")
        print(f"  {filepath.name} -> {new_filepath.name}")

        if not args.dry_run:
            # Update data
            data['ghcid_current'] = new_ghcid

            # Add provenance
            if 'provenance' not in data:
                data['provenance'] = {}
            notes = data['provenance'].get('notes', [])
            if isinstance(notes, str):
                notes = [notes]
            notes.append(f"GHCID resolved via Wikidata {entity_id} on "
                         f"{datetime.now(timezone.utc).isoformat()}")
            data['provenance']['notes'] = notes

            # Save under the new GHCID and remove the PENDING file
            save_yaml(new_filepath, data)
            filepath.unlink()

        resolved += 1

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Resolved: {resolved}")
    print(f"Failed (no match/location): {failed}")
    print(f"Skipped (collision): {skipped}")


if __name__ == '__main__':
    main()
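
# Illustrative walk-through (comments only, not executed; the record name below
# is a hypothetical example, not taken from the data set): a PENDING file whose
# emic_name is "Stedelijk Museum Alkmaar" and whose Wikidata P131 label resolves
# to "Alkmaar" maps to ('NH', 'ALK') via CITY_MAPPING; infer_institution_type()
# matches 'museum' -> 'M'; generate_abbreviation() yields 'SMA'; so the new
# GHCID is "NL-NH-ALK-M-SMA" and the file is renamed to NL-NH-ALK-M-SMA.yaml.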