# glam/scripts/resolve_pending_comprehensive.py
# Snapshot: 2026-01-09 18:26:58 +01:00
# NOTE: this file contains non-ASCII (Arabic) characters in its pattern
# tables; they are intentional (see NON_DUTCH_PATTERNS).
#!/usr/bin/env python3
"""
Comprehensive PENDING file resolver using multiple strategies:
1. Known organization lookup table
2. City name extraction from emic name
3. Country re-detection for misclassified files
4. Wikidata lookup for remaining
Usage:
python scripts/resolve_pending_comprehensive.py --dry-run
python scripts/resolve_pending_comprehensive.py --limit 100
python scripts/resolve_pending_comprehensive.py
"""
import os
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
# Known Dutch organizations with their locations.
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
# Patterns are matched as case-insensitive substrings of the emic name
# (see lookup_known_org); insertion order matters — the first match wins.
KNOWN_ORGANIZATIONS: Dict[str, Tuple[str, str, str, str]] = {
# Government
'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
'kadaster': ('GE', 'APE', 'O', 'K'),
'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
# Education
'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
'academie minerva': ('GR', 'GRO', 'E', 'AM'),
'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),
# Museums
'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
'allard pierson': ('NH', 'AMS', 'M', 'AP'),
'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),
# Archives
'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
'amsab': ('BE', 'GEN', 'A', 'AMS'), # Belgian — province field holds country code here
# Research
'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
'niod': ('NH', 'AMS', 'R', 'NIOD'),
'knaw': ('NH', 'AMS', 'R', 'KNAW'),
'nwo': ('ZH', 'DHA', 'R', 'NWO'),
'rivm': ('UT', 'BIL', 'R', 'RIVM'),
'tno': ('ZH', 'DHA', 'R', 'TNO'),
# NGOs/Foundations
'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
}
# Additional city patterns to detect.
# Keys are word-boundary regexes applied (via re.search) to the lower-cased
# emic name; values are ('province', 'city_code') tuples (see detect_city).
CITY_PATTERNS: Dict[str, Tuple[str, str]] = {
r'\bamsterdam\b': ('NH', 'AMS'),
r'\brotterdam\b': ('ZH', 'ROT'),
r'\bden haag\b': ('ZH', 'DHA'),
r'\b\'s-gravenhage\b': ('ZH', 'DHA'),
r'\butrecht\b': ('UT', 'UTR'),
r'\beindhoven\b': ('NB', 'EIN'),
r'\bgroningen\b': ('GR', 'GRO'),
r'\bleiden\b': ('ZH', 'LEI'),
r'\bhaarlem\b': ('NH', 'HAA'),
r'\barnhem\b': ('GE', 'ARN'),
r'\bnijmegen\b': ('GE', 'NIJ'),
r'\bmaastricht\b': ('LI', 'MAA'),
r'\btilburg\b': ('NB', 'TIL'),
r'\bbreda\b': ('NB', 'BRE'),
r'\bzwolle\b': ('OV', 'ZWO'),
r'\bdeventer\b': ('OV', 'DEV'),
r'\bdelft\b': ('ZH', 'DEL'),
r'\balkmaar\b': ('NH', 'ALK'),
r'\bgouda\b': ('ZH', 'GOU'),
r'\bhilversum\b': ('NH', 'HIL'),
r'\bmiddelburg\b': ('ZE', 'MID'),
r'\bleeuwarden\b': ('FR', 'LEE'),
r'\bassen\b': ('DR', 'ASS'),
r'\bapeldoorn\b': ('GE', 'APE'),
r'\benschede\b': ('OV', 'ENS'),
r'\bdordrecht\b': ('ZH', 'DOR'),
r'\bhattem\b': ('GE', 'HAT'),
r'\bkampen\b': ('OV', 'KAM'),
r'\belburg\b': ('GE', 'ELB'),
r'\bharderwijk\b': ('GE', 'HAR'),
r'\bwageningen\b': ('GE', 'WAG'),
}
# Non-Dutch indicators - files with these should be reclassified.
# Keys are word-boundary regexes applied to the lower-cased emic name;
# values are ISO 3166-1 alpha-2 country codes (see detect_non_dutch).
NON_DUTCH_PATTERNS: Dict[str, str] = {
r'\bsaudi\b': 'SA',
r'\bمشاريع\b': 'SA', # Arabic: "projects"
r'\bوزارة\b': 'SA', # Arabic: "ministry"
r'\baalborg\b': 'DK',
r'\bsainte-m[eè]re\b': 'FR',
r'\bnouvelle-aquitaine\b': 'FR',
r'\bbelgium\b': 'BE',
r'\bgent\b': 'BE',
r'\bantwerp\b': 'BE',
r'\bghent\b': 'BE',
r'\bberlin\b': 'DE',
r'\bweimar\b': 'DE',
r'\bbritish\b': 'GB',
r'\blondon\b': 'GB',
r'\bparis\b': 'FR',
r'\broma\b': 'IT',
r'\bmilano\b': 'IT',
}
# Institution type inference.
# Keywords are matched as substrings of the lower-cased name (see infer_type);
# insertion order matters — the first matching keyword wins, so e.g.
# 'rijksuniversiteit' hits 'universiteit' ('E') before 'rijks' ('O').
TYPE_KEYWORDS: Dict[str, str] = {
'museum': 'M',
'musea': 'M',
'archief': 'A',
'archive': 'A',
'bibliotheek': 'L',
'library': 'L',
'universiteit': 'E',
'university': 'E',
'hogeschool': 'E',
'academie': 'E',
'academy': 'E',
'school': 'E',
'ministerie': 'O',
'ministry': 'O',
'gemeente': 'O',
'politie': 'O',
'rijks': 'O',
'dienst': 'O',
'stichting': 'N',
'foundation': 'N',
'fonds': 'N',
'fund': 'N',
'vereniging': 'S',
'society': 'S',
'association': 'S',
}
def normalize_name(name: str) -> str:
    """Canonicalize a name for matching: trim whitespace, lower-case."""
    return name.strip().lower()
def lookup_known_org(name: str) -> Optional[Tuple[str, str, str, str]]:
    """Return (province, city_code, type, abbreviation) for the first known
    organization whose pattern occurs in *name*, or None if nothing matches."""
    haystack = normalize_name(name)
    return next(
        (info for pattern, info in KNOWN_ORGANIZATIONS.items()
         if pattern in haystack),
        None,
    )
def detect_city(name: str) -> Optional[Tuple[str, str]]:
    """Scan *name* for a known city regex; return its (province, city_code) or None."""
    haystack = normalize_name(name)
    for pattern, location in CITY_PATTERNS.items():
        if re.search(pattern, haystack):
            return location
    return None
def detect_non_dutch(name: str) -> Optional[str]:
    """Return an ISO country code when *name* matches a non-Dutch indicator, else None."""
    haystack = normalize_name(name)
    return next(
        (country for pattern, country in NON_DUTCH_PATTERNS.items()
         if re.search(pattern, haystack)),
        None,
    )
def infer_type(name: str) -> str:
    """Infer the institution type code from keywords in *name*.

    First matching keyword (in TYPE_KEYWORDS insertion order) wins;
    falls back to 'M' (Museum) when nothing matches.
    """
    haystack = normalize_name(name)
    return next(
        (code for keyword, code in TYPE_KEYWORDS.items() if keyword in haystack),
        'M',
    )
def generate_abbreviation(name: str) -> str:
    """Build an initials-based abbreviation (max 8 chars) from *name*.

    Dutch/English stop words and generic institution terms contribute no
    initial; tokens that do not start with a letter are ignored.
    Returns 'UNK' when no usable token remains.
    """
    stop_words = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
                  'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on',
                  'stichting', 'museum', 'archief', 'bibliotheek'}
    initials = []
    for token in re.split(r'[\s\-\'\"\(\)]+', name):
        if token and token[0].isalpha() and token.lower() not in stop_words:
            initials.append(token[0].upper())
    if not initials:
        return 'UNK'
    return ''.join(initials[:8])
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file, returning None when it cannot be read or parsed.

    Catches only I/O errors and YAML parse errors — the original bare
    ``except:`` also swallowed KeyboardInterrupt/SystemExit and masked
    genuine bugs.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        return None
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as human-readable, Unicode-preserving YAML."""
    dump_options = dict(
        allow_unicode=True,      # keep non-ASCII (e.g. Arabic) characters as-is
        default_flow_style=False,
        sort_keys=False,         # preserve insertion order of keys
        width=120,
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, **dump_options)
def _append_provenance_note(data: Dict, note: str) -> None:
    """Append *note* to data['provenance']['notes'], normalizing a scalar note to a list."""
    provenance = data.setdefault('provenance', {})
    notes = provenance.get('notes', [])
    if isinstance(notes, str):
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _relocate(data: Dict, old_path: Path, new_path: Path) -> None:
    """Write *data* to *new_path* and delete *old_path* (live mode only)."""
    save_yaml(new_path, data)
    old_path.unlink()


def resolve_pending_file(filepath: Path, custodian_dir: Path, dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file using, in order: non-Dutch reclassification,
    known-organization lookup, and city-name extraction.
    Returns: (status, new_filepath)
    Status: 'resolved', 'reclassified', 'collision', 'failed', 'error'
    ('error' = file unreadable or missing an emic name.)
    In dry-run mode nothing is written; the returned path is where the file
    WOULD go.
    """
    data = load_yaml(filepath)
    if not data:
        return ('error', None)
    name = data.get('custodian_name', {}).get('emic_name', '')
    if not name:
        return ('error', None)

    # Strategy 1: file belongs to another country — keep PENDING, swap prefix.
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        new_name = filepath.stem.replace('NL-XX-XXX-PENDING-',
                                         f'{new_country}-XX-XXX-PENDING-')
        new_filepath = custodian_dir / f"{new_name}.yaml"
        if new_filepath.exists():
            return ('collision', None)
        if not dry_run:
            data['ghcid_current'] = new_name
            _relocate(data, filepath, new_filepath)
        return ('reclassified', new_filepath)

    # Strategies 2 and 3 share the same resolution mechanics (the original
    # duplicated this block verbatim); derive the GHCID components per
    # strategy, then apply uniformly.
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        method = 'known org lookup'
    else:
        city_info = detect_city(name)
        if not city_info:
            return ('failed', None)
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        method = 'city extraction'

    new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    if new_filepath.exists():
        return ('collision', new_filepath)
    if not dry_run:
        data['ghcid_current'] = new_ghcid
        _append_provenance_note(
            data,
            f"GHCID resolved via {method} on {datetime.now(timezone.utc).isoformat()}")
        _relocate(data, filepath, new_filepath)
    return ('resolved', new_filepath)
def main():
    """CLI entry point: resolve all NL PENDING custodian files and report stats."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()
    custodian_dir = args.custodian_dir

    separator = "=" * 80
    print(separator)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print(separator)
    mode_label = 'DRY RUN' if args.dry_run else 'LIVE'
    print(f"Mode: {mode_label}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Collect the Dutch PENDING files, honoring an optional cap.
    candidates = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        candidates = candidates[:args.limit]
    print(f"Processing {len(candidates)} files...")
    print()

    stats = dict.fromkeys(('resolved', 'reclassified', 'collision', 'failed', 'error'), 0)
    for pending_path in candidates:
        # Pre-load only to grab the display name; the resolver re-reads the file.
        data = load_yaml(pending_path)
        if not data:
            stats['error'] += 1
            continue
        name = data.get('custodian_name', {}).get('emic_name', '')
        status, new_path = resolve_pending_file(pending_path, custodian_dir, args.dry_run)
        stats[status] += 1
        if status in ('resolved', 'reclassified'):
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print(separator)
    print("SUMMARY")
    print(separator)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()