#!/usr/bin/env python3
"""
Resolve NL PENDING files based on regional/provincial names.

Organizations like "Erfgoedhuis Zuid-Holland", "Noord-Hollands Archief", etc.
can be resolved using their regional headquarters.

Usage:
    python scripts/resolve_pending_by_region.py --dry-run
    python scripts/resolve_pending_by_region.py
"""
|
|
|
|
import re
|
|
import yaml
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Tuple, Dict
|
|
|
|
# Directory containing the custodian YAML files (PENDING and resolved).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Every table below maps a lowercase name fragment to a
# (province code, city code, city name) tuple describing the typical
# headquarters. The tables are merged into REGIONAL_ORGS in the order they are
# written; that insertion order is the tie-breaker between equally long keys.

# Specific organizations, keyed by their full name.
_NAMED_ORGANIZATIONS: Dict[str, Tuple[str, str, str]] = {
    'archief gooi en vechtstreek': ('NH', 'HIL', 'Hilversum'),
    'waterlands archief': ('NH', 'PUR', 'Purmerend'),
    'noord-hollands archief': ('NH', 'HAA', 'Haarlem'),
    'west-brabants archief': ('NB', 'BER', 'Bergen op Zoom'),
    'streekarchief midden-holland': ('ZH', 'GOU', 'Gouda'),
    'erfgoedhuis zuid-holland': ('ZH', 'DEL', 'Delft'),
    'steunpunt cultureel erfgoed noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'stichting landschap noord-holland': ('NH', 'AMS', 'Amsterdam'),
    'collectie overijssel': ('OV', 'ZWO', 'Zwolle'),
    'huis voor de kunsten limburg': ('LI', 'ROE', 'Roermond'),
    'landschapsbeheer drenthe': ('DR', 'ASS', 'Assen'),
    'natuurmuseum brabant': ('NB', 'TIL', 'Tilburg'),
    'groene hotspot zeeland': ('ZE', 'GOE', 'Goes'),
    'utrechts landschap': ('UT', 'UTR', 'Utrecht'),
    'tracé - limburgs samenlevingsarchief': ('LI', 'MAA', 'Maastricht'),
}

# Province names; organizations named after a province are placed in the
# provincial capital.
_PROVINCES: Dict[str, Tuple[str, str, str]] = {
    'noord-holland': ('NH', 'HAA', 'Haarlem'),
    'zuid-holland': ('ZH', 'DHA', 'Den Haag'),
    'noord-brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'brabant': ('NB', 'DBO', "'s-Hertogenbosch"),
    'limburg': ('LI', 'MAA', 'Maastricht'),
    'zeeland': ('ZE', 'MID', 'Middelburg'),
    'drenthe': ('DR', 'ASS', 'Assen'),
    'overijssel': ('OV', 'ZWO', 'Zwolle'),
    'gelderland': ('GE', 'ARN', 'Arnhem'),
    'friesland': ('FR', 'LEE', 'Leeuwarden'),
    'groningen': ('GR', 'GRO', 'Groningen'),
    'flevoland': ('FL', 'LEL', 'Lelystad'),
}

# Sub-provincial regional areas.
_REGIONAL_AREAS: Dict[str, Tuple[str, str, str]] = {
    'gooi': ('NH', 'HIL', 'Hilversum'),
    'vechtstreek': ('NH', 'WEE', 'Weesp'),
    'kennemerland': ('NH', 'HAA', 'Haarlem'),
    'west-friesland': ('NH', 'HOO', 'Hoorn'),
    'waterland': ('NH', 'PUR', 'Purmerend'),
    'zaanstreek': ('NH', 'ZAA', 'Zaandam'),
    'alblasserwaard': ('ZH', 'GOR', 'Gorinchem'),
    'vijfheerenlanden': ('UT', 'VIA', 'Vianen'),
    'achterhoek': ('GE', 'DOE', 'Doetinchem'),
    'veluwe': ('GE', 'APE', 'Apeldoorn'),
    'rivierenland': ('GE', 'TIE', 'Tiel'),
    'twente': ('OV', 'ENS', 'Enschede'),
    'salland': ('OV', 'DEV', 'Deventer'),
    'de peel': ('NB', 'HEL', 'Helmond'),
    'maasvallei': ('LI', 'VEN', 'Venlo'),
    'heuvelland': ('LI', 'MAA', 'Maastricht'),
    'walcheren': ('ZE', 'MID', 'Middelburg'),
}

# Museums with regional scope.
_REGIONAL_MUSEUMS: Dict[str, Tuple[str, str, str]] = {
    'philzuid': ('NB', 'EIN', 'Eindhoven'),  # Philips museum
}

# Regional organizations with their typical headquarters.
REGIONAL_ORGS: Dict[str, Tuple[str, str, str]] = {
    **_NAMED_ORGANIZATIONS,
    **_PROVINCES,
    **_REGIONAL_AREAS,
    **_REGIONAL_MUSEUMS,
}
|
|
|
|
|
|
def match_regional_org(emic_name: str) -> Optional[Tuple[str, str, str]]:
    """Look up the headquarters location for an organization name.

    The name is lowercased and every key of ``REGIONAL_ORGS`` is tested as a
    substring of it. Keys are tried from longest to shortest, so a specific
    entry wins over a shorter fragment that it contains.

    Args:
        emic_name: Organization name as it appears in the custodian record.

    Returns:
        The ``REGIONAL_ORGS`` value for the first key that occurs in the
        name, or ``None`` when no key occurs in it.
    """
    lowered = emic_name.lower()
    longest_first = sorted(REGIONAL_ORGS, key=lambda key: -len(key))
    return next(
        (REGIONAL_ORGS[key] for key in longest_first if key in lowered),
        None,
    )
|
|
|
|
|
|
def extract_abbreviation(name: str) -> str:
    """Build an upper-case abbreviation from an organization name.

    Punctuation is replaced by spaces and the name is split into tokens.
    Stop words and one-character tokens are ignored; if that leaves nothing,
    the first three raw tokens are used instead. A single remaining token
    contributes its first four characters; otherwise the initials of up to
    five tokens are joined.

    Args:
        name: Organization name.

    Returns:
        The abbreviation in upper case, or ``'XXX'`` when the name yields no
        tokens at all.
    """
    stop_words = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    ))

    tokens = re.sub(r'[^\w\s]', ' ', name).split()
    significant = [
        token for token in tokens
        if len(token) > 1 and token.lower() not in stop_words
    ]

    # Fall back to the leading raw tokens when everything was filtered out.
    chosen = significant or tokens[:3]

    if len(chosen) == 1:
        letters = chosen[0][:4]
    else:
        letters = ''.join(token[0] for token in chosen[:5])

    return letters.upper() or 'XXX'
|
|
|
|
|
|
def get_institution_type(data: Dict) -> str:
    """Return the one-letter institution type code for a custodian record.

    Reads ``data['institution_type']`` (treated as ``'MUSEUM'`` when the key
    is absent) and translates it to its single-letter code. Values that are
    not in the table map to ``'M'``.

    Args:
        data: Parsed custodian record.

    Returns:
        The single-letter type code.
    """
    code_by_type = {
        'MUSEUM': 'M',
        'ARCHIVE': 'A',
        'LIBRARY': 'L',
        'GALLERY': 'G',
        'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E',
        'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S',
        'HOLY_SITES': 'H',
        'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B',
        'CORPORATION': 'C',
        'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X',
        'PERSONAL_COLLECTION': 'P',
        'NGO': 'N',
        'TASTE_SMELL': 'T',
        'UNKNOWN': 'U',
    }
    declared = data.get('institution_type', 'MUSEUM')
    try:
        return code_by_type[declared]
    except KeyError:
        # Unrecognised type names fall back to the museum code.
        return 'M'
|
|
|
|
|
|
def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
    """Resolve one PENDING custodian file to a region-based GHCID.

    The file's ``custodian_name.emic_name`` is matched against the regional
    table. On a match a GHCID of the form
    ``NL-<province>-<city code>-<type>-<abbreviation>`` is generated. If a
    file with that name already exists in ``CUSTODIAN_DIR``, a slug of the
    name is appended; if that name is taken too, the file is left untouched
    and an error is reported rather than overwriting the existing record.

    Args:
        filepath: Path of the PENDING YAML file.
        dry_run: When true, only print the planned rename; nothing is written.

    Returns:
        ``'dry_run'`` when a match was found in dry-run mode, the new GHCID
        when the file was updated and renamed, or ``None`` when the record has
        no usable name, no region matches, or an error occurred. Errors are
        printed, not raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # safe_load returns None for an empty document and may return a list
        # or scalar; only a mapping can be a custodian record.
        if not isinstance(data, dict):
            raise ValueError("expected a YAML mapping at the top level")

        # 'custodian_name' may be missing or null; both mean "no name".
        custodian_name = data.get('custodian_name')
        emic_name = (
            custodian_name.get('emic_name')
            if isinstance(custodian_name, dict)
            else None
        )
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Try to match regional org
        result = match_regional_org(emic_name)
        if not result:
            return None

        prov, city_code, city_name = result
        inst_type = get_institution_type(data)
        abbrev = extract_abbreviation(emic_name)

        # Generate new GHCID
        new_ghcid = f"NL-{prov}-{city_code.upper()}-{inst_type}-{abbrev}"

        # Check for collision
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
        if new_path.exists() and new_path != filepath:
            # Add name suffix for collision
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"

            # shutil.move may replace an existing destination file, so a
            # second collision must stop here instead of destroying a record.
            if new_path.exists() and new_path != filepath:
                raise FileExistsError(
                    f"target {new_path.name} already exists; not overwriting"
                )

        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f"  Region/Province: {prov} ({city_name})")
            print(f"  -> {new_ghcid}.yaml")
            return 'dry_run'

        # Update data
        data['ghcid_current'] = new_ghcid
        # Treat a null 'location' like a missing one.
        if data.get('location') is None:
            data['location'] = {}
        data['location']['city'] = city_name
        data['location']['country'] = 'NL'

        # Add resolution provenance
        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'regional_name_extraction'
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['matched_region'] = prov

        # Write the updated record in place, then rename it to its new GHCID.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        shutil.move(filepath, new_path)
        print(f"[RESOLVED] {emic_name}")
        print(f"  Region: {prov} ({city_name}) -> {new_ghcid}.yaml")

        return new_ghcid

    except Exception as e:
        # Per-file boundary of a batch run: report the problem and carry on
        # with the remaining files instead of aborting the whole run.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
|
|
|
|
|
|
def main():
    """Command-line entry point: resolve NL PENDING files in CUSTODIAN_DIR.

    Options:
        --dry-run   Print what would be resolved without modifying any file.
        --limit N   Process only the first N PENDING files (0 = no limit).

    Prints a line per processed file (via ``process_pending_file``) and a
    summary of how many files were resolved and how many were not.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Resolve NL PENDING custodian files by regional/provincial name.",
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="show what would be resolved without writing or renaming files",
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=0,
        help="process at most this many files (0 means no limit)",
    )
    args = parser.parse_args()

    # A negative value would silently drop files from the END of the list
    # through slicing, which is never what the user means.
    if args.limit < 0:
        parser.error("--limit must be zero or a positive integer")

    # Find NL PENDING files only. glob() yields files in arbitrary order, so
    # sort to make --limit select the same files on every run.
    pending_files = sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} NL PENDING files")

    if args.limit:
        pending_files = pending_files[:args.limit]

    resolved = 0
    failed = 0

    for filepath in pending_files:
        if process_pending_file(filepath, dry_run=args.dry_run):
            resolved += 1
        else:
            failed += 1

    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {resolved}")
    # A falsy result covers a missing name, no regional match, and errors.
    print(f"Not resolved (no name, no region match, or error): {failed}")
|
|
|
|
|
|
# Run the resolver only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|