#!/usr/bin/env python3
"""
Comprehensive PENDING file resolver using multiple strategies.

Strategies, in the order they are applied to each file:

1. Country re-detection for misclassified (non-Dutch) files
2. Known organization lookup table
3. City name extraction from emic name

A Wikidata lookup for the remaining files is not implemented in this script;
files that no strategy can place are reported as 'failed'.

Usage:
    python scripts/resolve_pending_comprehensive.py --dry-run
    python scripts/resolve_pending_comprehensive.py --limit 100
    python scripts/resolve_pending_comprehensive.py

Without --dry-run the script runs LIVE: it writes the renamed file and
deletes the original PENDING file.
"""

import argparse
import os
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml

# Known Dutch organizations with their locations
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
# Keys are lowercase, without diacritics and with hyphens written as spaces;
# lookup_known_org() folds the candidate name the same way before matching.
KNOWN_ORGANIZATIONS = {
    # Government
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
    'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
    'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
    'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
    'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
    'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
    'kadaster': ('GE', 'APE', 'O', 'K'),
    'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
    'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    # Education
    'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
    'academie minerva': ('GR', 'GRO', 'E', 'AM'),
    'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
    'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
    'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
    'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
    'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
    'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
    'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
    'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
    'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
    'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
    'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),
    # Museums
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
    'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
    'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
    'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),
    # Archives
    'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    # NOTE(review): this Belgian entry carries 'BE' in the province slot, so
    # the resolver builds the id "NL-BE-GEN-A-AMS". Confirm whether it should
    # be handled by the non-Dutch reclassification instead.
    'amsab': ('BE', 'GEN', 'A', 'AMS'),  # Belgian
    # Research
    'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'knaw': ('NH', 'AMS', 'R', 'KNAW'),
    'nwo': ('ZH', 'DHA', 'R', 'NWO'),
    'rivm': ('UT', 'BIL', 'R', 'RIVM'),
    'tno': ('ZH', 'DHA', 'R', 'TNO'),
    # NGOs/Foundations
    'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
    'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
    'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
    'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
}

# Additional city patterns to detect
# Format: regex (applied to the lowercased name): ('province', 'city_code')
CITY_PATTERNS = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    # A leading \b cannot match in front of an apostrophe at the start of a
    # word (both sides are non-word characters), so use a lookbehind instead.
    r"(?<!\w)['’]s-gravenhage\b": ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bzwolle\b': ('OV', 'ZWO'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\balkmaar\b': ('NH', 'ALK'),
    r'\bgouda\b': ('ZH', 'GOU'),
    r'\bhilversum\b': ('NH', 'HIL'),
    r'\bmiddelburg\b': ('ZE', 'MID'),
    r'\bleeuwarden\b': ('FR', 'LEE'),
    r'\bassen\b': ('DR', 'ASS'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\benschede\b': ('OV', 'ENS'),
    r'\bdordrecht\b': ('ZH', 'DOR'),
    r'\bhattem\b': ('GE', 'HAT'),
    r'\bkampen\b': ('OV', 'KAM'),
    r'\belburg\b': ('GE', 'ELB'),
    r'\bharderwijk\b': ('GE', 'HAR'),
    r'\bwageningen\b': ('GE', 'WAG'),
}

# Non-Dutch indicators - files with these should be reclassified
# Format: regex (applied to the lowercased name): country code
NON_DUTCH_PATTERNS = {
    r'\bsaudi\b': 'SA',
    r'\bمشاريع\b': 'SA',  # Arabic
    r'\bوزارة\b': 'SA',  # Arabic Ministry
    r'\baalborg\b': 'DK',
    r'\bsainte-m[eè]re\b': 'FR',
    r'\bnouvelle-aquitaine\b': 'FR',
    r'\bbelgium\b': 'BE',
    r'\bgent\b': 'BE',
    r'\bantwerp\b': 'BE',
    r'\bghent\b': 'BE',
    r'\bberlin\b': 'DE',
    r'\bweimar\b': 'DE',
    r'\bbritish\b': 'GB',
    r'\blondon\b': 'GB',
    r'\bparis\b': 'FR',
    r'\broma\b': 'IT',
    r'\bmilano\b': 'IT',
}

# Institution type inference
# Insertion order is the precedence: infer_type() returns the code of the
# first keyword found, so e.g. 'museum' wins over 'rijks' for "rijksmuseum".
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'academy': 'E',
    'school': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'rijks': 'O',
    'dienst': 'O',
    'stichting': 'N',
    'foundation': 'N',
    'fonds': 'N',
    'fund': 'N',
    'vereniging': 'S',
    'society': 'S',
    'association': 'S',
}

# Words that do not contribute a letter to a generated abbreviation.
_ABBREVIATION_SKIP_WORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
    'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on',
    'stichting', 'museum', 'archief', 'bibliotheek',
})

# Filename prefix of the pending files this script processes.
_PENDING_PREFIX = 'NL-XX-XXX-PENDING-'


def normalize_name(name: str) -> str:
    """Normalize name for matching (lowercase, surrounding whitespace removed)."""
    return name.lower().strip()


def _fold_for_lookup(name: str) -> str:
    """Return *name* normalized, without diacritics, hyphens/whitespace as single spaces."""
    decomposed = unicodedata.normalize('NFKD', normalize_name(name))
    without_marks = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r'[\s\-]+', ' ', without_marks)


def lookup_known_org(name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up organization in known list.

    The name is folded (lowercase, diacritics stripped, hyphens as spaces) and
    each table key must occur as whole words, so short keys such as 'tno' do
    not match inside longer words like "etnografisch".

    Returns:
        The ``(province, city_code, type, abbreviation)`` tuple of the first
        matching entry of KNOWN_ORGANIZATIONS, or None when nothing matches.
    """
    folded = _fold_for_lookup(name)
    for pattern, info in KNOWN_ORGANIZATIONS.items():
        if re.search(rf'(?<!\w){re.escape(pattern)}(?!\w)', folded):
            return info
    return None


def detect_city(name: str) -> Optional[Tuple[str, str]]:
    """Detect city from name.

    Returns:
        ``(province, city_code)`` for the first CITY_PATTERNS regex that
        matches the normalized name, or None when no pattern matches.
    """
    name_lower = normalize_name(name)
    for pattern, (prov, city) in CITY_PATTERNS.items():
        if re.search(pattern, name_lower):
            return (prov, city)
    return None


def detect_non_dutch(name: str) -> Optional[str]:
    """Detect if organization is not Dutch.

    Returns:
        The country code of the first NON_DUTCH_PATTERNS regex that matches
        the normalized name, or None when no pattern matches.
    """
    name_lower = normalize_name(name)
    for pattern, country in NON_DUTCH_PATTERNS.items():
        if re.search(pattern, name_lower):
            return country
    return None


def infer_type(name: str) -> str:
    """Infer institution type from name.

    Returns the code of the first TYPE_KEYWORDS keyword contained in the
    normalized name (substring match), or 'M' when none is found.
    """
    name_lower = normalize_name(name)
    for keyword, type_code in TYPE_KEYWORDS.items():
        if keyword in name_lower:
            return type_code
    return 'M'  # Default to Museum


def generate_abbreviation(name: str) -> str:
    """Generate abbreviation from name.

    Takes the uppercased first character of every word that is not a skip
    word and starts with a letter; words are split on whitespace, hyphens,
    quotes and parentheses. The result is capped at 8 characters and is
    'UNK' when no word qualifies.
    """
    words = re.split(r'[\s\-\'\"\(\)]+', name)
    abbrev = ''.join(
        w[0].upper()
        for w in words
        if w and w.lower() not in _ABBREVIATION_SKIP_WORDS and w[0].isalpha()
    )
    return abbrev[:8] if abbrev else 'UNK'


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load YAML file.

    Returns:
        The parsed top-level mapping, or None when the file cannot be read,
        is not valid UTF-8 / YAML, or its top-level document is not a mapping.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Unreadable or malformed file: callers count it as an error.
        return None
    return loaded if isinstance(loaded, dict) else None


def save_yaml(filepath: Path, data: Dict):
    """Save YAML file (UTF-8, block style, original key order preserved)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def _extract_emic_name(data: Dict) -> str:
    """Return data['custodian_name']['emic_name'] if it is a string, else ''."""
    custodian = data.get('custodian_name')
    if not isinstance(custodian, dict):
        return ''
    name = custodian.get('emic_name')
    return name if isinstance(name, str) else ''


def _append_provenance_note(data: Dict, note: str) -> None:
    """Append *note* to data['provenance']['notes'], creating containers as needed."""
    provenance = data.get('provenance')
    if provenance is None:
        provenance = {}
        data['provenance'] = provenance
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif not isinstance(notes, list):
        # A single scalar note (typically a string) becomes a one-item list.
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _relocate(filepath: Path, custodian_dir: Path, data: Dict, new_ghcid: str,
              status: str, method: Optional[str],
              dry_run: bool) -> Tuple[str, Optional[Path]]:
    """Move the record to ``<new_ghcid>.yaml`` unless that file already exists.

    In live mode the record gets ``ghcid_current`` set, an optional provenance
    note naming *method*, is written to the new path, and the old file is
    removed afterwards. In dry-run mode nothing is written.
    """
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    if new_filepath.exists():
        return ('collision', new_filepath)

    if not dry_run:
        data['ghcid_current'] = new_ghcid
        if method is not None:
            _append_provenance_note(
                data,
                f"GHCID resolved via {method} on {datetime.now(timezone.utc).isoformat()}",
            )
        # Write the new file first so a failed write never loses the record.
        save_yaml(new_filepath, data)
        filepath.unlink()

    return (status, new_filepath)


def _resolve_loaded(filepath: Path, data: Dict, custodian_dir: Path,
                    dry_run: bool) -> Tuple[str, Optional[Path]]:
    """Apply the resolution strategies to an already-parsed PENDING record."""
    name = _extract_emic_name(data)
    if not name:
        return ('error', None)

    # Strategy 1: Check if non-Dutch
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        # Reclassify to different country; only the filename prefix changes.
        new_name = filepath.stem.replace(
            _PENDING_PREFIX, f'{new_country}-XX-XXX-PENDING-', 1)
        return _relocate(filepath, custodian_dir, data, new_name,
                         'reclassified', None, dry_run)

    # Strategy 2: Known organization lookup
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
        return _relocate(filepath, custodian_dir, data, new_ghcid,
                         'resolved', 'known org lookup', dry_run)

    # Strategy 3: City name extraction
    city_info = detect_city(name)
    if city_info:
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
        return _relocate(filepath, custodian_dir, data, new_ghcid,
                         'resolved', 'city extraction', dry_run)

    return ('failed', None)


def resolve_pending_file(filepath: Path, custodian_dir: Path,
                         dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file.

    Args:
        filepath: Path of the PENDING YAML file.
        custodian_dir: Directory in which the renamed file is created.
        dry_run: When True (the default) nothing is written or deleted; the
            returned path is where the file would go.

    Returns: (status, new_filepath)
        Status: 'resolved', 'reclassified', 'collision', 'failed', 'error'

        - 'resolved' / 'reclassified': new_filepath is the target path.
        - 'collision': the target path already exists; new_filepath is that
          existing path and the PENDING file is left untouched.
        - 'failed': no strategy matched; new_filepath is None.
        - 'error': the file could not be loaded as a YAML mapping or has no
          custodian_name.emic_name string; new_filepath is None.
    """
    data = load_yaml(filepath)
    if not data:
        return ('error', None)
    return _resolve_loaded(filepath, data, custodian_dir, dry_run)


def main():
    """Parse command-line options, process the PENDING files, print a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    # Only a positive limit restricts the run; a negative value would
    # otherwise silently drop files from the end of the list.
    limit = args.limit if args.limit > 0 else 0

    print("=" * 80)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if limit:
        print(f"Limit: {limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if limit:
        pending_files = pending_files[:limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    stats = {'resolved': 0, 'reclassified': 0, 'collision': 0, 'failed': 0, 'error': 0}

    for filepath in pending_files:
        data = load_yaml(filepath)
        if not data:
            stats['error'] += 1
            continue

        name = _extract_emic_name(data)
        # Reuse the parsed record instead of loading the file a second time.
        status, new_path = _resolve_loaded(filepath, data, custodian_dir, args.dry_run)
        stats[status] += 1

        if status in ('resolved', 'reclassified'):
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()