#!/usr/bin/env python3
"""
Comprehensive PENDING file resolver using multiple strategies:
1. Known organization lookup table
2. City name extraction from emic name
3. Country re-detection for misclassified files
4. Wikidata lookup for remaining

Note: resolve_pending_file() applies country re-detection first, then the
known organization lookup, then city name extraction. No Wikidata lookup
step is visible in this file; names that match none of the three
strategies are reported as 'failed'.

Usage:
    python scripts/resolve_pending_comprehensive.py --dry-run
    python scripts/resolve_pending_comprehensive.py --limit 100
    python scripts/resolve_pending_comprehensive.py
"""

import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple, List

import yaml

# Known Dutch organizations with their locations
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
#
# Keys are lower-case name fragments; lookup_known_org() walks this table in
# insertion order and returns the first entry whose key occurs in the
# normalized name, so keep more specific keys ahead of more general ones.
KNOWN_ORGANIZATIONS: Dict[str, Tuple[str, str, str, str]] = {
    # Government
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
    'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
    'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
    'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
    'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
    'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
    'kadaster': ('GE', 'APE', 'O', 'K'),
    'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
    'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),

    # Education
    'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
    'academie minerva': ('GR', 'GRO', 'E', 'AM'),
    'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
    'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
    'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
    'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
    'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
    'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
    'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
    'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
    'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
    'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
    'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),

    # Museums
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
    'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
    'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
    'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),

    # Archives
    'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    # Belgian. NOTE(review): resolve_pending_file() prefixes every hit from
    # this table with "NL-", so this row produces "NL-BE-GEN-A-AMS" -- confirm
    # whether that is the intended identifier for a Belgian archive.
    'amsab': ('BE', 'GEN', 'A', 'AMS'),

    # Research
    'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'knaw': ('NH', 'AMS', 'R', 'KNAW'),
    'nwo': ('ZH', 'DHA', 'R', 'NWO'),
    'rivm': ('UT', 'BIL', 'R', 'RIVM'),
    'tno': ('ZH', 'DHA', 'R', 'TNO'),

    # NGOs/Foundations
    'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
    'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
    'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
    'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
}

# Additional city patterns to detect
#
# Maps a regular expression (applied to the lower-cased name) to a
# (province, city_code) pair. detect_city() returns the value of the first
# pattern that matches, in insertion order.
CITY_PATTERNS: Dict[str, Tuple[str, str]] = {
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    # A leading \b cannot be used here: the name starts with an apostrophe,
    # which is not a word character, so \b would only match when a letter or
    # digit directly precedes it (never the case in practice). The lookbehind
    # asserts "not glued to a preceding word" instead. Both the ASCII and the
    # typographic apostrophe are accepted.
    r"(?<!\w)['’]s-gravenhage\b": ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bgroningen\b': ('GR', 'GRO'),
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\btilburg\b': ('NB', 'TIL'),
    r'\bbreda\b': ('NB', 'BRE'),
    r'\bzwolle\b': ('OV', 'ZWO'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\balkmaar\b': ('NH', 'ALK'),
    r'\bgouda\b': ('ZH', 'GOU'),
    r'\bhilversum\b': ('NH', 'HIL'),
    r'\bmiddelburg\b': ('ZE', 'MID'),
    r'\bleeuwarden\b': ('FR', 'LEE'),
    r'\bassen\b': ('DR', 'ASS'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\benschede\b': ('OV', 'ENS'),
    r'\bdordrecht\b': ('ZH', 'DOR'),
    r'\bhattem\b': ('GE', 'HAT'),
    r'\bkampen\b': ('OV', 'KAM'),
    r'\belburg\b': ('GE', 'ELB'),
    r'\bharderwijk\b': ('GE', 'HAR'),
    r'\bwageningen\b': ('GE', 'WAG'),
}

# Non-Dutch indicators - files with these should be reclassified
#
# Maps a regular expression (applied to the lower-cased name) to the country
# code that replaces "NL" in the PENDING file name. detect_non_dutch()
# returns the code of the first matching pattern, in insertion order.
NON_DUTCH_PATTERNS: Dict[str, str] = {
    # Saudi Arabia
    r'\bsaudi\b': 'SA',
    r'\bمشاريع\b': 'SA',  # Arabic
    r'\bوزارة\b': 'SA',  # Arabic Ministry
    # Denmark
    r'\baalborg\b': 'DK',
    # France (regional names)
    r'\bsainte-m[eè]re\b': 'FR',
    r'\bnouvelle-aquitaine\b': 'FR',
    # Belgium
    r'\bbelgium\b': 'BE',
    r'\bgent\b': 'BE',
    r'\bantwerp\b': 'BE',
    r'\bghent\b': 'BE',
    # Germany
    r'\bberlin\b': 'DE',
    r'\bweimar\b': 'DE',
    # United Kingdom
    r'\bbritish\b': 'GB',
    r'\blondon\b': 'GB',
    # France (capital)
    r'\bparis\b': 'FR',
    # Italy
    r'\broma\b': 'IT',
    r'\bmilano\b': 'IT',
}

# Institution type inference
#
# Maps a lower-case keyword to a one-letter institution type code.
# infer_type() returns the code of the first keyword found as a substring of
# the name, so the insertion order below is significant (e.g. 'universiteit'
# is tested before 'rijks').
TYPE_KEYWORDS: Dict[str, str] = {
    # M
    'museum': 'M',
    'musea': 'M',
    # A
    'archief': 'A',
    'archive': 'A',
    # L
    'bibliotheek': 'L',
    'library': 'L',
    # E
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'academy': 'E',
    'school': 'E',
    # O
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'rijks': 'O',
    'dienst': 'O',
    # N
    'stichting': 'N',
    'foundation': 'N',
    'fonds': 'N',
    'fund': 'N',
    # S
    'vereniging': 'S',
    'society': 'S',
    'association': 'S',
}


def normalize_name(name: str) -> str:
    """Return ``name`` lower-cased, with surrounding whitespace removed."""
    lowered = name.lower()
    return lowered.strip()


def lookup_known_org(name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization name in KNOWN_ORGANIZATIONS.

    The name is normalized with normalize_name() and each table key is
    searched for as a whole word or phrase: the key must not be directly
    preceded or followed by another word character. This keeps short keys
    such as 'tno' or 'nwo' from matching inside unrelated words, which a
    plain substring test would allow.

    Args:
        name: Organization name as found in the custodian record.

    Returns:
        The (province, city_code, type, abbreviation) tuple of the first
        matching table entry in table order, or None when nothing matches.
    """
    name_lower = normalize_name(name)
    for pattern, info in KNOWN_ORGANIZATIONS.items():
        # re.escape: table keys are literal text, not regular expressions.
        if re.search(rf'(?<!\w){re.escape(pattern)}(?!\w)', name_lower):
            return info
    return None


def detect_city(name: str) -> Optional[Tuple[str, str]]:
    """Find the first CITY_PATTERNS entry whose regex matches the name.

    Returns the (province, city_code) pair of that entry, or None when no
    pattern matches the normalized name.
    """
    haystack = normalize_name(name)
    hits = (
        (province, city_code)
        for regex, (province, city_code) in CITY_PATTERNS.items()
        if re.search(regex, haystack)
    )
    return next(hits, None)


def detect_non_dutch(name: str) -> Optional[str]:
    """Return the country code of the first NON_DUTCH_PATTERNS regex that
    matches the normalized name, or None when the name shows no non-Dutch
    indicator."""
    candidate = normalize_name(name)
    for regex in NON_DUTCH_PATTERNS:
        if re.search(regex, candidate) is None:
            continue
        return NON_DUTCH_PATTERNS[regex]
    return None


def infer_type(name: str) -> str:
    """Guess the one-letter institution type code from keywords in the name.

    The first TYPE_KEYWORDS key (in table order) that occurs as a substring
    of the normalized name decides the result; 'M' is returned when no
    keyword is present.
    """
    lowered = normalize_name(name)
    matching_codes = (
        code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered
    )
    return next(matching_codes, 'M')  # Default to Museum


def generate_abbreviation(name: str) -> str:
    """Build an upper-case abbreviation from the initials of a name.

    The name is split on whitespace, hyphens, quotes and parentheses. Words
    that are Dutch/English stop words or generic institution terms, and words
    that do not start with a letter, contribute nothing. The result is cut to
    at most 8 characters; 'UNK' is returned when no initial is left.
    """
    stop_words = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
        'the', 'a', 'an', 'of', 'for', 'and', 'or', 'at', 'in', 'on',
        'stichting', 'museum', 'archief', 'bibliotheek',
    ))

    initials = []
    for token in re.split(r'[\s\-\'\"\(\)]+', name):
        # Splitting can yield empty strings at either end of the name.
        if not token or token.lower() in stop_words:
            continue
        first = token[0]
        if first.isalpha():
            initials.append(first.upper())

    if not initials:
        return 'UNK'
    return ''.join(initials)[:8]


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file and return its top-level mapping.

    Args:
        filepath: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        The parsed document when it is a mapping; None when the file cannot
        be read or parsed, or when its top-level value is not a mapping
        (callers treat None as an unusable file and use ``.get`` on the
        result otherwise).

    Read and parse failures are reported on stderr instead of being dropped
    silently. KeyboardInterrupt and SystemExit are no longer swallowed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError and invalid scalar values
        # (e.g. an impossible date) raised while constructing the document.
        print(f"WARNING: could not read {filepath}: {exc}", file=sys.stderr)
        return None
    except yaml.YAMLError as exc:
        print(f"WARNING: invalid YAML in {filepath}: {exc}", file=sys.stderr)
        return None

    return data if isinstance(data, dict) else None


def save_yaml(filepath: Path, data: Dict):
    """Write ``data`` to ``filepath`` as UTF-8 block-style YAML.

    Key order is kept as given, non-ASCII text is written unescaped and
    lines are wrapped at 120 columns. An existing file is overwritten.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)


def resolve_pending_file(filepath: Path, custodian_dir: Path, dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file.

    Strategies are tried in this order; the first one that applies decides
    the outcome:

    1. Non-Dutch detection: the "NL-XX-XXX-PENDING-" prefix of the file name
       is rewritten with the detected country code.
    2. Known organization lookup: the GHCID is built from the table entry.
    3. City extraction: the GHCID is built from the detected city, the
       inferred institution type and a generated abbreviation.

    Unless ``dry_run`` is true, the record is written under its new name
    with ``ghcid_current`` updated, and the original file is deleted.
    Strategies 2 and 3 also append a provenance note.

    Args:
        filepath: The PENDING YAML file to resolve.
        custodian_dir: Directory in which the renamed file is created.
        dry_run: When true, nothing is written or deleted.

    Returns: (status, new_filepath)
    Status:
        'resolved'     - strategy 2 or 3 applied; new_filepath is the target.
        'reclassified' - strategy 1 applied; new_filepath is the target.
        'collision'    - the target already exists; new_filepath is the
                         target for strategies 2 and 3, None for strategy 1.
        'failed'       - no strategy matched; new_filepath is None.
        'error'        - the file could not be loaded, is not a mapping, or
                         has no usable custodian_name.emic_name;
                         new_filepath is None.
    """
    data = load_yaml(filepath)
    # Only a non-empty mapping is usable; anything else is an unusable file.
    if not isinstance(data, dict) or not data:
        return ('error', None)

    name = _emic_name(data)
    if not name:
        return ('error', None)

    # Strategy 1: Check if non-Dutch
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        # Reclassify to different country
        new_name = filepath.stem.replace('NL-XX-XXX-PENDING-', f'{new_country}-XX-XXX-PENDING-')
        new_filepath = custodian_dir / f"{new_name}.yaml"

        if new_filepath.exists():
            return ('collision', None)

        if not dry_run:
            data['ghcid_current'] = new_name
            save_yaml(new_filepath, data)
            filepath.unlink()

        return ('reclassified', new_filepath)

    # Strategy 2: Known organization lookup
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
        return _commit_resolution(filepath, custodian_dir, data, new_ghcid,
                                  'known org lookup', dry_run)

    # Strategy 3: City name extraction
    city_info = detect_city(name)
    if city_info:
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
        return _commit_resolution(filepath, custodian_dir, data, new_ghcid,
                                  'city extraction', dry_run)

    return ('failed', None)


def _emic_name(data: Dict) -> str:
    """Return custodian_name.emic_name when it is a string, else ''."""
    custodian = data.get('custodian_name')
    # A bare "custodian_name:" key parses to None, so .get() cannot be chained.
    if not isinstance(custodian, dict):
        return ''
    name = custodian.get('emic_name')
    return name if isinstance(name, str) else ''


def _append_provenance_note(data: Dict, note: str) -> None:
    """Append ``note`` to data['provenance']['notes'], creating both as needed."""
    provenance = data.get('provenance')
    if provenance is None:  # key missing or present with a null value
        provenance = {}
        data['provenance'] = provenance

    notes = provenance.get('notes')
    if notes is None:  # key missing or present with a null value
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _commit_resolution(filepath: Path, custodian_dir: Path, data: Dict, new_ghcid: str,
                       method: str, dry_run: bool) -> Tuple[str, Optional[Path]]:
    """Move the record to ``<new_ghcid>.yaml``, or report a collision."""
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"

    if new_filepath.exists():
        return ('collision', new_filepath)

    if not dry_run:
        data['ghcid_current'] = new_ghcid
        _append_provenance_note(
            data,
            f"GHCID resolved via {method} on {datetime.now(timezone.utc).isoformat()}",
        )
        # Write the new file first so a failed write leaves the original intact.
        save_yaml(new_filepath, data)
        filepath.unlink()

    return ('resolved', new_filepath)


def main():
    """Command-line entry point: resolve every NL PENDING file in a directory.

    Options:
        --dry-run        Report what would change; write and delete nothing.
        --limit N        Process only the first N files (0 = no limit).
        --custodian-dir  Directory holding the custodian YAML files.

    Prints one entry per resolved or reclassified file, followed by a
    per-status summary. Exits with an argparse usage error when the
    directory does not exist or --limit is negative.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    # NOTE(review): the default is a machine-specific absolute path; pass
    # --custodian-dir when running anywhere else.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # Without this check a wrong path just reports "Processing 0 files".
        parser.error(f"custodian directory not found: {custodian_dir}")
    if args.limit < 0:
        # A negative slice bound would silently drop files from the end.
        parser.error("--limit must be 0 (no limit) or a positive integer")

    print("=" * 80)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    stats = {'resolved': 0, 'reclassified': 0, 'collision': 0, 'failed': 0, 'error': 0}

    for filepath in pending_files:
        # Loaded here only to obtain the display name; resolve_pending_file
        # reads the file again itself.
        data = load_yaml(filepath)
        if not isinstance(data, dict) or not data:
            stats['error'] += 1
            continue

        # "custodian_name:" with no value parses to None, so guard the lookup.
        custodian = data.get('custodian_name')
        name = custodian.get('emic_name', '') if isinstance(custodian, dict) else ''
        if not isinstance(name, str):
            name = ''

        status, new_path = resolve_pending_file(filepath, custodian_dir, args.dry_run)
        stats[status] = stats.get(status, 0) + 1

        if status in ['resolved', 'reclassified']:
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()