905 lines
33 KiB
Python
905 lines
33 KiB
Python
#!/usr/bin/env python3
|
||
"""
Comprehensive PENDING file resolver using multiple strategies:
1. Known organization lookup table
2. City name extraction from emic name
3. Country re-detection for misclassified files
4. Wikidata lookup for remaining

Note: at runtime the country re-detection (3) is applied first, followed by
the known organization lookup (1) and the city name extraction (2). This
script contains no Wikidata step; files that none of the implemented
strategies can resolve are reported as 'failed'.

Usage:
    python scripts/resolve_pending_comprehensive.py --dry-run
    python scripts/resolve_pending_comprehensive.py --limit 100
    python scripts/resolve_pending_comprehensive.py
"""
|
||
|
||
import os
|
||
import re
|
||
import yaml
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from typing import Dict, Optional, Tuple, List
|
||
|
||
# Known Dutch organizations with their locations.
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
# Keys are lower-case name fragments; values are the four components that are
# combined into an identifier of the form NL-<province>-<city>-<type>-<abbrev>.
# Every key must be unique: a repeated key in a dict literal silently discards
# the earlier value.
KNOWN_ORGANIZATIONS = {
    # Government - Ministeries
    # NOTE(review): 'buitenlandse zaken' and 'binnenlandse zaken' share the
    # exact same tuple, so both resolve to the same identifier - confirm
    # whether distinct abbreviations are intended.
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
    'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
    'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
    'ministerie van sociale zaken': ('ZH', 'DHA', 'O', 'MSZ'),
    'ministerie van economische zaken': ('ZH', 'DHA', 'O', 'MEZ'),
    'ministerie van volksgezondheid': ('ZH', 'DHA', 'O', 'MVW'),
    'ministerie van binnenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van infrastructuur': ('ZH', 'DHA', 'O', 'MIW'),
    'ministerie van landbouw': ('ZH', 'DHA', 'O', 'MLN'),

    # Government - Agencies
    'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
    'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
    'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
    'kadaster': ('GE', 'APE', 'O', 'K'),
    'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
    'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'fiod': ('ZH', 'DHA', 'O', 'FIOD'),
    'ssc-ict': ('ZH', 'DHA', 'O', 'SSC'),
    'raad voor de kinderbescherming': ('ZH', 'DHA', 'O', 'RVK'),
    'immigratie- en naturalisatiedienst': ('ZH', 'DHA', 'O', 'IND'),
    'ind': ('ZH', 'DHA', 'O', 'IND'),
    'coa': ('ZH', 'DHA', 'O', 'COA'),
    'centraal orgaan opvang asielzoekers': ('ZH', 'DHA', 'O', 'COA'),
    'sociale verzekeringsbank': ('NH', 'AME', 'O', 'SVB'),
    'uwv': ('NH', 'AMS', 'O', 'UWV'),
    'kamer van koophandel': ('UT', 'UTR', 'O', 'KVK'),
    'autoriteit persoonsgegevens': ('ZH', 'DHA', 'O', 'AP'),
    'belastingdienst': ('UT', 'UTR', 'O', 'BD'),
    'autoriteit financiele markten': ('NH', 'AMS', 'O', 'AFM'),
    'de nederlandsche bank': ('NH', 'AMS', 'O', 'DNB'),
    'cbs': ('ZH', 'DHA', 'O', 'CBS'),
    'centraal bureau voor de statistiek': ('ZH', 'DHA', 'O', 'CBS'),
    # Amersfoort lies in the province of Utrecht (was 'GE').
    'rijksdienst voor het cultureel erfgoed': ('UT', 'AME', 'O', 'RCE'),
    'rijksdienst voor ondernemend nederland': ('ZH', 'DHA', 'O', 'RVO'),
    'raad van state': ('ZH', 'DHA', 'O', 'RVS'),
    'raad voor cultuur': ('ZH', 'DHA', 'O', 'RVC'),

    # Education - Universities
    'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
    'academie minerva': ('GR', 'GRO', 'E', 'AM'),
    'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
    'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
    'erasmus universiteit': ('ZH', 'ROT', 'E', 'EUR'),
    'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
    'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
    'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
    'leiden university': ('ZH', 'LEI', 'E', 'UL'),
    'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
    'utrecht university': ('UT', 'UTR', 'E', 'UU'),
    'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
    'university of groningen': ('GR', 'GRO', 'E', 'RUG'),
    'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
    'tu delft': ('ZH', 'DEL', 'E', 'TUD'),
    'delft university': ('ZH', 'DEL', 'E', 'TUD'),
    'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tu eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tue': ('NB', 'EIN', 'E', 'TUE'),
    'wageningen university': ('GE', 'WAG', 'E', 'WUR'),
    'wageningen universiteit': ('GE', 'WAG', 'E', 'WUR'),
    'radboud university': ('GE', 'NIJ', 'E', 'RU'),
    'radboud universiteit': ('GE', 'NIJ', 'E', 'RU'),
    'tilburg university': ('NB', 'TIL', 'E', 'TIU'),
    'universiteit tilburg': ('NB', 'TIL', 'E', 'TIU'),
    'maastricht university': ('LI', 'MAA', 'E', 'UM'),
    'universiteit maastricht': ('LI', 'MAA', 'E', 'UM'),
    'open universiteit': ('LI', 'HEE', 'E', 'OU'),
    # Breukelen lies in the province of Utrecht (was 'NH').
    'nyenrode': ('UT', 'BRE', 'E', 'NYE'),
    'royal academy of art': ('ZH', 'DHA', 'E', 'KABK'),
    'koninklijke academie van beeldende kunsten': ('ZH', 'DHA', 'E', 'KABK'),

    # Education - Hogescholen
    'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
    'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
    'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),
    'hogeschool inholland': ('NH', 'AMS', 'E', 'INH'),
    'inholland': ('NH', 'AMS', 'E', 'INH'),
    'hogeschool leiden': ('ZH', 'LEI', 'E', 'HL'),
    'haagse hogeschool': ('ZH', 'DHA', 'E', 'HH'),
    'saxion': ('OV', 'ENS', 'E', 'SAX'),
    'avans': ('NB', 'BRE', 'E', 'AVA'),
    'fontys': ('NB', 'EIN', 'E', 'FON'),
    'zuyd hogeschool': ('LI', 'MAA', 'E', 'ZH'),
    'hanzehogeschool': ('GR', 'GRO', 'E', 'HAN'),
    'hku': ('UT', 'UTR', 'E', 'HKU'),
    'design academy eindhoven': ('NB', 'EIN', 'E', 'DAE'),
    'gerrit rietveld academie': ('NH', 'AMS', 'E', 'GRA'),
    'koninklijk conservatorium': ('ZH', 'DHA', 'E', 'KC'),
    'conservatorium van amsterdam': ('NH', 'AMS', 'E', 'CVA'),
    'codarts': ('ZH', 'ROT', 'E', 'COD'),
    'artez': ('GE', 'ARN', 'E', 'ARZ'),

    # Museums - Amsterdam
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),
    'stedelijk museum amsterdam': ('NH', 'AMS', 'M', 'SMA'),
    'rijksmuseum': ('NH', 'AMS', 'M', 'RM'),
    'ons lieve heer op solder': ('NH', 'AMS', 'M', 'OLHOS'),
    'rembrandthuis': ('NH', 'AMS', 'M', 'RH'),
    'amsterdam museum': ('NH', 'AMS', 'M', 'AM'),
    'artis': ('NH', 'AMS', 'M', 'ART'),
    'nemo science museum': ('NH', 'AMS', 'M', 'NEMO'),
    'eye filmmuseum': ('NH', 'AMS', 'M', 'EYE'),
    'moco museum': ('NH', 'AMS', 'M', 'MOCO'),
    'hermitage amsterdam': ('NH', 'AMS', 'M', 'HA'),
    'tassenmuseum hendrikje': ('NH', 'AMS', 'M', 'TMH'),
    'willet-holthuysen': ('NH', 'AMS', 'M', 'WH'),
    'geelvinck hinlopen huis': ('NH', 'AMS', 'M', 'GHH'),
    'hortus botanicus': ('NH', 'AMS', 'M', 'HB'),
    'multatuli museum': ('NH', 'AMS', 'M', 'MM'),
    'beurs van berlage': ('NH', 'AMS', 'M', 'BVB'),
    'de brakke grond': ('NH', 'AMS', 'M', 'DBG'),

    # Museums - Den Haag
    'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
    'gemeentemuseum': ('ZH', 'DHA', 'M', 'GMH'),
    'kunstmuseum den haag': ('ZH', 'DHA', 'M', 'KDH'),
    'escher in het paleis': ('ZH', 'DHA', 'M', 'EHP'),
    'museon': ('ZH', 'DHA', 'M', 'MUS'),
    'omniversum': ('ZH', 'DHA', 'M', 'OMN'),
    # NOTE(review): same tuple as 'literatuurmuseum' below, so both resolve
    # to the same identifier - confirm the intended abbreviations.
    'louwman museum': ('ZH', 'DHA', 'M', 'LM'),
    'museum de gevangenpoort': ('ZH', 'DHA', 'M', 'MDG'),
    'museum meermanno': ('ZH', 'DHA', 'M', 'MMO'),
    'bredius museum': ('ZH', 'DHA', 'M', 'BM'),
    'panorama mesdag': ('ZH', 'DHA', 'M', 'PM'),
    'madurodam': ('ZH', 'DHA', 'M', 'MAD'),
    'haags historisch museum': ('ZH', 'DHA', 'M', 'HHM'),
    'beelden aan zee': ('ZH', 'DHA', 'M', 'BAZ'),
    'museum het paleis': ('ZH', 'DHA', 'M', 'MHP'),

    # Museums - Rotterdam
    'museum boijmans van beuningen': ('ZH', 'ROT', 'M', 'MBVB'),
    'boijmans': ('ZH', 'ROT', 'M', 'MBVB'),
    'maritiem museum': ('ZH', 'ROT', 'M', 'MM'),
    'het nieuwe instituut': ('ZH', 'ROT', 'M', 'HNI'),
    'chabot museum': ('ZH', 'ROT', 'M', 'CM'),
    'kunsthal rotterdam': ('ZH', 'ROT', 'M', 'KR'),
    'wereldmuseum': ('ZH', 'ROT', 'M', 'WM'),
    'museum rotterdam': ('ZH', 'ROT', 'M', 'MR'),
    'fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'ss rotterdam': ('ZH', 'ROT', 'M', 'SSR'),
    'fenixloods': ('ZH', 'ROT', 'M', 'FL'),

    # Museums - Other cities
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
    'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
    'museum catharijneconvent': ('UT', 'UTR', 'M', 'MC'),
    'centraal museum': ('UT', 'UTR', 'M', 'CMU'),
    'spoorwegmuseum': ('UT', 'UTR', 'M', 'SPW'),
    'het utrechts archief': ('UT', 'UTR', 'A', 'HUA'),
    'museum speelklok': ('UT', 'UTR', 'M', 'MS'),
    'museum van oudheden': ('ZH', 'LEI', 'M', 'MVO'),
    'molenmuseum de valk': ('ZH', 'LEI', 'M', 'MDV'),
    'stedelijk museum schiedam': ('ZH', 'SCH', 'M', 'SMS'),
    'bonnefantenmuseum': ('LI', 'MAA', 'M', 'BFM'),
    'marres': ('LI', 'MAA', 'M', 'MAR'),
    'museum aan het vrijthof': ('LI', 'MAA', 'M', 'MAV'),
    'drents museum': ('DR', 'ASS', 'M', 'DM'),
    'groninger museum': ('GR', 'GRO', 'M', 'GM'),
    'fries museum': ('FR', 'LEE', 'M', 'FM'),  # Requires word boundary (in SHORT_PATTERNS)
    'westfries museum': ('NH', 'HOO', 'M', 'WFM'),  # In Hoorn
    'museum belvédère': ('FR', 'ORN', 'M', 'MB'),
    'princessehof': ('FR', 'LEE', 'M', 'PH'),
    'zuiderzeemuseum': ('NH', 'ENK', 'M', 'ZZM'),
    'rijksmuseum muiderslot': ('NH', 'MUI', 'M', 'RMM'),
    'teylers museum': ('NH', 'HAA', 'M', 'TYM'),
    'frans hals museum': ('NH', 'HAA', 'M', 'FHM'),
    'museum de fundatie': ('OV', 'ZWO', 'M', 'MDF'),
    'museum twentse welle': ('OV', 'ENS', 'M', 'MTW'),
    'rijksmuseum van oudheden': ('ZH', 'LEI', 'M', 'RMO'),
    'museum volkenkunde': ('ZH', 'LEI', 'M', 'MVK'),
    'museon-omniversum': ('ZH', 'DHA', 'M', 'MO'),
    'literatuurmuseum': ('ZH', 'DHA', 'M', 'LM'),
    'kinderboekenmuseum': ('ZH', 'DHA', 'M', 'KBM'),
    'van abbemuseum': ('NB', 'EIN', 'M', 'VAM'),
    'philips museum': ('NB', 'EIN', 'M', 'PHM'),
    'textielmuseum': ('NB', 'TIL', 'M', 'TXM'),
    'de pont': ('NB', 'TIL', 'M', 'DP'),
    'noordbrabants museum': ('NB', 'DBO', 'M', 'NBM'),
    'musis sacrum': ('GE', 'ARN', 'M', 'MUS'),
    'museum arnhem': ('GE', 'ARN', 'M', 'MA'),
    'afrika museum': ('GE', 'BER', 'M', 'AFM'),
    'museum het valkhof': ('GE', 'NIJ', 'M', 'MHV'),
    'hunebedcentrum': ('DR', 'BOR', 'M', 'HC'),
    'museum drachten': ('FR', 'DRA', 'M', 'MDR'),
    'openluchtmuseum': ('GE', 'ARN', 'M', 'OLM'),
    'nederlands openluchtmuseum': ('GE', 'ARN', 'M', 'NOLM'),

    # Archives
    'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    # NOTE(review): 'BE' sits in the province slot; presumably a consumer
    # that prefixes identifiers with "NL-" mislabels this Belgian entry -
    # confirm how it should be handled.
    'amsab': ('BE', 'GEN', 'A', 'AMS'),  # Belgian
    'noord-hollands archief': ('NH', 'HAA', 'A', 'NHA'),
    'brabants historisch informatie centrum': ('NB', 'DBO', 'A', 'BHIC'),
    'gelders archief': ('GE', 'ARN', 'A', 'GA'),
    'zeeuws archief': ('ZE', 'MID', 'A', 'ZA'),
    'tresoar': ('FR', 'LEE', 'A', 'TRE'),
    'drents archief': ('DR', 'ASS', 'A', 'DA'),
    'groninger archieven': ('GR', 'GRO', 'A', 'GRA'),
    'historisch centrum overijssel': ('OV', 'ZWO', 'A', 'HCO'),
    'regionaal archief tilburg': ('NB', 'TIL', 'A', 'RAT'),
    # 'erfgoed brabant' used to be listed here as type 'A' as well; that
    # entry was dead because the later 'N' entry (Provincial/Regional)
    # overrode it, so only the effective one is kept.
    'stadsarchief rotterdam': ('ZH', 'ROT', 'A', 'SAR'),
    'stadsarchief delft': ('ZH', 'DEL', 'A', 'SAD'),
    'regionaal historisch centrum limburg': ('LI', 'MAA', 'A', 'RHCL'),

    # Libraries
    'koninklijke bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'nationale bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'openbare bibliotheek amsterdam': ('NH', 'AMS', 'L', 'OBA'),
    'oba': ('NH', 'AMS', 'L', 'OBA'),
    'universiteitbibliotheek': ('NH', 'AMS', 'L', 'UBA'),
    'atria': ('NH', 'AMS', 'L', 'ATR'),
    'bibliotheek rotterdam': ('ZH', 'ROT', 'L', 'BR'),
    'bibliotheek den haag': ('ZH', 'DHA', 'L', 'BDH'),
    'bibliotheek utrecht': ('UT', 'UTR', 'L', 'BU'),

    # Research
    'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'knaw': ('NH', 'AMS', 'R', 'KNAW'),
    'koninklijke nederlandse akademie van wetenschappen': ('NH', 'AMS', 'R', 'KNAW'),
    'nwo': ('ZH', 'DHA', 'R', 'NWO'),
    'rivm': ('UT', 'BIL', 'R', 'RIVM'),
    'tno': ('ZH', 'DHA', 'R', 'TNO'),
    'meertens instituut': ('NH', 'AMS', 'R', 'MI'),
    'huygens instituut': ('NH', 'AMS', 'R', 'HI'),
    'internationaal instituut voor sociale geschiedenis': ('NH', 'AMS', 'R', 'IISG'),
    'iisg': ('NH', 'AMS', 'R', 'IISG'),
    'rathenau instituut': ('ZH', 'DHA', 'R', 'RAT'),
    'planbureau voor de leefomgeving': ('ZH', 'DHA', 'R', 'PBL'),
    'sociaal en cultureel planbureau': ('ZH', 'DHA', 'R', 'SCP'),
    'cpb': ('ZH', 'DHA', 'R', 'CPB'),
    'centraal planbureau': ('ZH', 'DHA', 'R', 'CPB'),
    # NOTE(review): city code 'DEV' looks inconsistent with province 'UT'
    # (KNMI is based in De Bilt) - confirm the intended city code.
    'knmi': ('UT', 'DEV', 'R', 'KNMI'),
    'nivel': ('UT', 'UTR', 'R', 'NIV'),
    'deltaresearch': ('ZH', 'DEL', 'R', 'DEL'),
    'deltares': ('ZH', 'DEL', 'R', 'DEL'),
    'nidi': ('ZH', 'DHA', 'R', 'NIDI'),
    'dans': ('ZH', 'DHA', 'R', 'DANS'),
    'surf': ('UT', 'UTR', 'R', 'SURF'),

    # NGOs/Foundations
    'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
    'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
    'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
    'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
    'fonds podiumkunsten': ('ZH', 'DHA', 'N', 'FPK'),
    'letterenfonds': ('NH', 'AMS', 'N', 'LF'),
    'filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'nederlands filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'bng cultuurfonds': ('ZH', 'DHA', 'N', 'BNG'),
    'prins bernhard cultuurfonds': ('NH', 'AMS', 'N', 'PBC'),
    'vsc': ('NH', 'AMS', 'N', 'VSC'),
    'cultuur + ondernemen': ('NH', 'AMS', 'N', 'CO'),
    'erfgoedvereniging heemschut': ('NH', 'AMS', 'N', 'EH'),
    'heemschut': ('NH', 'AMS', 'N', 'EH'),
    # NOTE(review): same tuple as 'buma stemra' below, so both resolve to
    # the same identifier - confirm the intended abbreviations.
    'boekmanstichting': ('NH', 'AMS', 'N', 'BS'),
    'lira': ('NH', 'AMS', 'N', 'LIRA'),
    'pictoright': ('NH', 'AMS', 'N', 'PR'),
    'buma stemra': ('NH', 'AMS', 'N', 'BS'),
    'senafonds': ('NH', 'AMS', 'N', 'SEN'),

    # Performing Arts
    'nederlands dans theater': ('ZH', 'DHA', 'M', 'NDT'),
    'ndt': ('ZH', 'DHA', 'M', 'NDT'),
    'het nationale ballet': ('NH', 'AMS', 'M', 'HNB'),
    'nationale opera': ('NH', 'AMS', 'M', 'DNO'),
    'de nationale opera & ballet': ('NH', 'AMS', 'M', 'NOB'),
    'concertgebouw': ('NH', 'AMS', 'M', 'CG'),
    'koninklijk concertgebouworkest': ('NH', 'AMS', 'M', 'KCO'),
    'residentie orkest': ('ZH', 'DHA', 'M', 'RO'),
    'rotterdams philharmonisch': ('ZH', 'ROT', 'M', 'RPO'),
    'nederlands kamerorkest': ('NH', 'AMS', 'M', 'NKO'),
    'holland festival': ('NH', 'AMS', 'M', 'HF'),
    'internationaal theater amsterdam': ('NH', 'AMS', 'M', 'ITA'),
    'ita': ('NH', 'AMS', 'M', 'ITA'),
    'stadsschouwburg': ('NH', 'AMS', 'M', 'SSB'),
    'theater carré': ('NH', 'AMS', 'M', 'TC'),
    'de la mar theater': ('NH', 'AMS', 'M', 'DLM'),
    'schouwburg': ('NH', 'AMS', 'M', 'SCH'),
    'muziekgebouw aan t ij': ('NH', 'AMS', 'M', 'MATI'),
    'bimhuis': ('NH', 'AMS', 'M', 'BH'),
    'paradiso': ('NH', 'AMS', 'M', 'PAR'),
    'melkweg': ('NH', 'AMS', 'M', 'MW'),
    'doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'de doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'ahoy': ('ZH', 'ROT', 'M', 'AH'),
    'tivoli vredenburg': ('UT', 'UTR', 'M', 'TV'),
    'theater aan het spui': ('ZH', 'DHA', 'M', 'TAS'),
    'zuiderstrandtheater': ('ZH', 'DHA', 'M', 'ZST'),
    'lucent danstheater': ('ZH', 'DHA', 'M', 'LDT'),
    'chassé theater': ('NB', 'BRE', 'M', 'CT'),
    'parktheater': ('NB', 'EIN', 'M', 'PT'),

    # Media/Broadcasting
    'npo': ('NH', 'HIL', 'M', 'NPO'),
    'nos': ('NH', 'HIL', 'M', 'NOS'),
    'ntr': ('NH', 'HIL', 'M', 'NTR'),
    'avro': ('NH', 'HIL', 'M', 'AVRO'),
    'avrotros': ('NH', 'HIL', 'M', 'AT'),
    'vara': ('NH', 'HIL', 'M', 'VARA'),
    'bnnvara': ('NH', 'HIL', 'M', 'BV'),
    'eo': ('NH', 'HIL', 'M', 'EO'),
    'kro': ('NH', 'HIL', 'M', 'KRO'),
    'kro-ncrv': ('NH', 'HIL', 'M', 'KN'),
    'vpro': ('NH', 'HIL', 'M', 'VPRO'),
    'max': ('NH', 'HIL', 'M', 'MAX'),
    'omroep max': ('NH', 'HIL', 'M', 'MAX'),
    'beeld en geluid': ('NH', 'HIL', 'M', 'BEG'),
    'beelden en geluid': ('NH', 'HIL', 'M', 'BEG'),

    # Religious/Holy Sites
    'protestantse kerk in nederland': ('UT', 'UTR', 'H', 'PKN'),
    'pkn': ('UT', 'UTR', 'H', 'PKN'),
    'bisdom utrecht': ('UT', 'UTR', 'H', 'BU'),
    'aartsbisdom utrecht': ('UT', 'UTR', 'H', 'ABU'),
    'bisdom haarlem': ('NH', 'HAA', 'H', 'BH'),
    'bisdom rotterdam': ('ZH', 'ROT', 'H', 'BR'),
    'bisdom breda': ('NB', 'BRE', 'H', 'BB'),
    'bisdom den bosch': ('NB', 'DBO', 'H', 'BDB'),

    # Provincial/Regional
    'rijnbrink': ('GE', 'ARN', 'N', 'RB'),
    'erfgoed zeeland': ('ZE', 'MID', 'N', 'EZ'),
    'erfgoed brabant': ('NB', 'TIL', 'N', 'EB'),
    'erfgoed gelderland': ('GE', 'ARN', 'N', 'EG'),
    'erfgoed overijssel': ('OV', 'ZWO', 'N', 'EO'),
    'monumentenwacht': ('NH', 'AMS', 'N', 'MW'),
    'erfgoedcentrum': ('UT', 'UTR', 'N', 'EC'),
}
|
||
|
||
# Additional city patterns to detect (Dutch cities).
# Maps a regular expression (applied to the lower-cased name) to a
# (province code, city code) pair. Patterns are word-bounded so that a city
# name is not matched inside a longer word.
CITY_PATTERNS = {
    # Major cities
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\bthe hague\b': ('ZH', 'DHA'),
    # A leading \b cannot be used here: the apostrophe is a non-word
    # character, so \b would require a word character directly before it
    # and the pattern could never match at the start of a name or after a
    # space. The negative look-behind expresses the intended boundary.
    r"(?<!\w)'s-gravenhage\b": ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bgroningen\b': ('GR', 'GRO'),

    # Zuid-Holland
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bdordrecht\b': ('ZH', 'DOR'),
    r'\bgouda\b': ('ZH', 'GOU'),
    r'\bschiedam\b': ('ZH', 'SCH'),
    r'\bzoetermeer\b': ('ZH', 'ZOE'),
    r'\bwestland\b': ('ZH', 'WES'),
    r'\balphen aan den rijn\b': ('ZH', 'ALP'),
    r'\bvlaardingen\b': ('ZH', 'VLA'),
    r'\bcapelle\b': ('ZH', 'CAP'),
    r'\bvoorburg\b': ('ZH', 'VOO'),
    r'\brijswijk\b': ('ZH', 'RIJ'),

    # Noord-Holland
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\balkmaar\b': ('NH', 'ALK'),
    r'\bhilversum\b': ('NH', 'HIL'),
    r'\bzaandam\b': ('NH', 'ZAA'),
    r'\bzaanstad\b': ('NH', 'ZAA'),
    r'\bhoorn\b': ('NH', 'HOO'),
    r'\benkhuizen\b': ('NH', 'ENK'),
    r'\bedam\b': ('NH', 'EDA'),
    r'\bvolendam\b': ('NH', 'VOL'),
    r'\bhaarlemmermeer\b': ('NH', 'HLM'),
    r'\bpurmerend\b': ('NH', 'PUR'),
    r'\bmuiden\b': ('NH', 'MUI'),
    r'\bnaarden\b': ('NH', 'NAA'),
    r'\bbussum\b': ('NH', 'BUS'),
    r'\bbloemendaal\b': ('NH', 'BLO'),
    r'\bheemstede\b': ('NH', 'HEE'),
    r'\blaren\b': ('NH', 'LAR'),
    r'\bbergen\b': ('NH', 'BER'),

    # Gelderland
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bede\b': ('GE', 'EDE'),
    r'\bwageningen\b': ('GE', 'WAG'),
    r'\bhattem\b': ('GE', 'HAT'),
    r'\belburg\b': ('GE', 'ELB'),
    r'\bharderwijk\b': ('GE', 'HAR'),
    r'\bdoetinchem\b': ('GE', 'DOE'),
    r'\bzutphen\b': ('GE', 'ZUT'),
    r'\bzevenaar\b': ('GE', 'ZEV'),
    r'\btiel\b': ('GE', 'TIE'),
    r'\botterlo\b': ('GE', 'OTT'),
    r'\bburen\b': ('GE', 'BUR'),
    r'\bbarneveld\b': ('GE', 'BAR'),
    r'\bepe\b': ('GE', 'EPE'),
    r'\beerde\b': ('GE', 'EER'),
    r'\bberkum\b': ('GE', 'BRK'),

    # Noord-Brabant
    r'\btilburg\b': ('NB', 'TIL'),
    r'\bbreda\b': ('NB', 'BRE'),
    # Same apostrophe issue as 's-gravenhage above.
    r"(?<!\w)'s-hertogenbosch\b": ('NB', 'DBO'),
    r'\bden bosch\b': ('NB', 'DBO'),
    r'\bhelmond\b': ('NB', 'HEL'),
    r'\boss\b': ('NB', 'OSS'),  # was misspelled 'ossen'
    r'\broosendaal\b': ('NB', 'ROO'),  # was misspelled 'roovendaal'
    r'\bbergen op zoom\b': ('NB', 'BOZ'),
    r'\bvught\b': ('NB', 'VUG'),
    r'\bwaalwijk\b': ('NB', 'WAA'),
    r'\bboxtel\b': ('NB', 'BOX'),
    r'\bveldhoven\b': ('NB', 'VEL'),
    r'\bbest\b': ('NB', 'BST'),
    r'\bgeertruidenberg\b': ('NB', 'GEE'),
    r'\bheusden\b': ('NB', 'HEU'),

    # Limburg
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\bvenlo\b': ('LI', 'VEN'),
    r'\broermond\b': ('LI', 'ROE'),
    r'\bheerlen\b': ('LI', 'HEE'),
    r'\bsittard\b': ('LI', 'SIT'),
    r'\bgeleen\b': ('LI', 'GEL'),
    r'\bweert\b': ('LI', 'WEE'),
    r'\bvalkenburg\b': ('LI', 'VAL'),
    r'\bkerkrade\b': ('LI', 'KER'),
    r'\bbrunssum\b': ('LI', 'BRU'),  # was misspelled 'brunsum'

    # Overijssel
    r'\bzwolle\b': ('OV', 'ZWO'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bkampen\b': ('OV', 'KAM'),
    r'\benschede\b': ('OV', 'ENS'),
    r'\bhengelo\b': ('OV', 'HEN'),
    r'\balmelo\b': ('OV', 'ALM'),
    r'\boldenzaal\b': ('OV', 'OLD'),
    r'\bsteenwijk\b': ('OV', 'STE'),
    r'\bhasselt\b': ('OV', 'HAS'),
    r'\bgiethoorn\b': ('OV', 'GIE'),
    r'\braalte\b': ('OV', 'RAA'),
    r'\bijsselmuiden\b': ('OV', 'IJS'),

    # Friesland
    r'\bleeuwarden\b': ('FR', 'LEE'),
    r'\bljouwert\b': ('FR', 'LEE'),
    r'\bdrachten\b': ('FR', 'DRA'),
    r'\bheerenveen\b': ('FR', 'HVE'),
    r'\bsneek\b': ('FR', 'SNE'),
    r'\bfraneker\b': ('FR', 'FRA'),
    r'\bharlingen\b': ('FR', 'HAR'),
    r'\bbolsward\b': ('FR', 'BOL'),
    r'\bworkum\b': ('FR', 'WOR'),
    r'\bterschelling\b': ('FR', 'TER'),
    r'\bameland\b': ('FR', 'AME'),
    r'\boranjewoud\b': ('FR', 'ORN'),

    # Drenthe
    r'\bassen\b': ('DR', 'ASS'),
    r'\bemmen\b': ('DR', 'EMM'),
    r'\bmeppel\b': ('DR', 'MEP'),
    r'\bhoogeveen\b': ('DR', 'HOO'),
    r'\bcoevorden\b': ('DR', 'COE'),
    r'\bborger\b': ('DR', 'BOR'),

    # Groningen (the city itself is listed under "Major cities"; repeating
    # the key here would only silently overwrite that entry)
    r'\bveendam\b': ('GR', 'VEE'),
    r'\bwinschoten\b': ('GR', 'WIN'),
    r'\bdelfzijl\b': ('GR', 'DEL'),
    r'\bappingedam\b': ('GR', 'APP'),
    r'\bhoogezand\b': ('GR', 'HOO'),

    # Zeeland
    r'\bmiddelburg\b': ('ZE', 'MID'),
    r'\bvlissingen\b': ('ZE', 'VLI'),
    r'\bgoes\b': ('ZE', 'GOE'),
    r'\bterneuzen\b': ('ZE', 'TER'),
    r'\bzierikzee\b': ('ZE', 'ZIE'),
    r'\bveere\b': ('ZE', 'VEE'),
    r'\bwestkapelle\b': ('ZE', 'WKA'),

    # Flevoland
    r'\balmere\b': ('FL', 'ALM'),
    r'\blelystad\b': ('FL', 'LEL'),
    r'\bdronten\b': ('FL', 'DRO'),
    r'\burk\b': ('FL', 'URK'),
    r'\bzeewolde\b': ('FL', 'ZEE'),
}
|
||
|
||
# Non-Dutch indicators - files with these should be reclassified.
# Maps a word-bounded regular expression (applied to the lower-cased name)
# to the two-letter country code the file should be moved to.
NON_DUTCH_PATTERNS = {
    # Saudi Arabia / Arabic
    r'\bsaudi\b': 'SA',
    r'\bمشاريع\b': 'SA',
    r'\bوزارة\b': 'SA',
    r'\bالسعودية\b': 'SA',

    # France
    r'\bsainte-m[eè]re\b': 'FR',
    r'\bnouvelle-aquitaine\b': 'FR',
    r'\bparis\b': 'FR',
    r'\bfrance\b': 'FR',
    r'\bfran[çc]ais\b': 'FR',
    r'\bbnf\b': 'FR',
    r'\bircam\b': 'FR',
    r'\bfondation du patrimoine\b': 'FR',
    r'\bministère\b': 'FR',
    r'\blyon\b': 'FR',
    r'\bmarseille\b': 'FR',
    # Was r'\bordeaux\b': the leading "b" of the city name had been
    # swallowed by the \b anchor, so "bordeaux" could never match.
    r'\bbordeaux\b': 'FR',
    r'\bnantes\b': 'FR',
    r'\bstrasbourg\b': 'FR',
    r'\blille\b': 'FR',
    r'\btoulouse\b': 'FR',
    r'\bnice\b': 'FR',

    # Germany
    r'\bberlin\b': 'DE',
    r'\bweimar\b': 'DE',
    r'\bstiftung\b': 'DE',
    r'\bpreu[ßs]ischer\b': 'DE',
    r'\bmunich\b': 'DE',
    r'\bm[üu]nchen\b': 'DE',
    r'\bfrankfurt\b': 'DE',
    r'\bhamburg\b': 'DE',
    r'\bk[öo]ln\b': 'DE',
    r'\bd[üu]sseldorf\b': 'DE',
    r'\bstuttgart\b': 'DE',
    r'\bheidelberg\b': 'DE',
    r'\bdresden\b': 'DE',
    r'\bleipzig\b': 'DE',
    r'\bgerman\b': 'DE',
    r'\bgermany\b': 'DE',
    r'\bzentral\b': 'DE',
    r'\bzentrum\b': 'DE',
    r'\bkunstgeschichte\b': 'DE',
    r'\bteilhabe\b': 'DE',
    r'\bkulturelle\b': 'DE',

    # Belgium
    r'\bbelgium\b': 'BE',
    r'\bgent\b': 'BE',
    r'\bantwerp\b': 'BE',
    r'\bghent\b': 'BE',
    r'\bbrussels\b': 'BE',
    r'\bbrussel\b': 'BE',
    r'\bbruxelles\b': 'BE',
    r'\bleuven\b': 'BE',
    r'\bliege\b': 'BE',
    r'\bbelgique\b': 'BE',
    r'\bflemish\b': 'BE',
    r'\bvlaams\b': 'BE',

    # United Kingdom
    r'\bbritish\b': 'GB',
    r'\blondon\b': 'GB',
    r'\benglish\b': 'GB',
    r'\bengland\b': 'GB',
    r'\bscotland\b': 'GB',
    r'\bwales\b': 'GB',
    r'\bmanchester\b': 'GB',
    r'\bbirmingham\b': 'GB',
    r'\bedinburgh\b': 'GB',
    r'\bcardiff\b': 'GB',
    r'\boxford\b': 'GB',
    r'\bcambridge\b': 'GB',
    r'\bdurham\b': 'GB',
    r'\broyal armouries\b': 'GB',
    r'\broyal parks\b': 'GB',
    r'\bthe british academy\b': 'GB',

    # Italy
    r'\broma\b': 'IT',
    r'\brome\b': 'IT',
    r'\bmilano\b': 'IT',
    r'\bmilan\b': 'IT',
    r'\bitalian\b': 'IT',
    r'\bitaly\b': 'IT',
    r'\bfirenze\b': 'IT',
    r'\bflorence\b': 'IT',
    r'\bvenice\b': 'IT',
    r'\bvenezia\b': 'IT',
    r'\bnaples\b': 'IT',
    r'\bnapoli\b': 'IT',
    r'\bartribune\b': 'IT',

    # Denmark
    r'\baalborg\b': 'DK',
    r'\bcopenhagen\b': 'DK',
    r'\bkøbenhavn\b': 'DK',
    r'\bdanish\b': 'DK',
    r'\bdenmark\b': 'DK',
    r'\baarhus\b': 'DK',

    # USA
    r'\bwashington d\.?c\.?\b': 'US',
    r'\bnew york\b': 'US',
    r'\blos angeles\b': 'US',
    r'\bchicago\b': 'US',
    r'\bamerican\b': 'US',
    r'\bstand ?with ?us\b': 'US',

    # Indonesia
    r'\bindonesia\b': 'ID',
    r'\bjakarta\b': 'ID',
    r'\btaman safari\b': 'ID',

    # Other countries
    r'\bafrican wildlife\b': 'KE',
    r'\bkenya\b': 'KE',
    r'\bisrael\b': 'IL',
    r'\bjerusalem\b': 'IL',
    r'\btel aviv\b': 'IL',
    r'\bindia\b': 'IN',
    r'\bindian\b': 'IN',
    r'\bmumbai\b': 'IN',
    r'\bdelhi\b': 'IN',
}
|
||
|
||
# Institution type inference.
# Keyword (lower-case substring of the name) -> single-letter type code.
# Insertion order is significant for consumers that stop at the first
# keyword found, so more specific keywords come before generic ones
# (e.g. 'hogeschool' before 'school', 'foundation' before 'fund').
TYPE_KEYWORDS = {
    # M - museums
    'museum': 'M', 'musea': 'M',
    # A - archives
    'archief': 'A', 'archive': 'A',
    # L - libraries
    'bibliotheek': 'L', 'library': 'L',
    # E - education
    'universiteit': 'E', 'university': 'E',
    'hogeschool': 'E',
    'academie': 'E', 'academy': 'E',
    'school': 'E',
    # O - government bodies
    'ministerie': 'O', 'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'rijks': 'O',
    'dienst': 'O',
    # N - foundations and funds
    'stichting': 'N', 'foundation': 'N',
    'fonds': 'N', 'fund': 'N',
    # S - societies and associations
    'vereniging': 'S', 'society': 'S', 'association': 'S',
}
|
||
|
||
|
||
def normalize_name(name: str) -> str:
    """Return *name* lower-cased with surrounding whitespace removed.

    This is the canonical form in which names are compared against the
    pattern tables.
    """
    trimmed = name.strip()
    return trimmed.lower()
|
||
|
||
|
||
# Short patterns that need word boundary matching
SHORT_PATTERNS = {'ind', 'eo', 'ntr', 'npo', 'nos', 'coa', 'cbs', 'tno', 'nwo', 'hku', 'tue', 'oba', 'svb', 'uwv', 'dnb', 'afm', 'fries museum'}

# Any pattern this short is also matched on word boundaries, even when it is
# not listed in SHORT_PATTERNS. Without this, acronym-like keys match inside
# unrelated words (e.g. 'ita' inside "heritage", 'max' inside "maxima").
_WORD_BOUNDED_MAX_LEN = 5


def _pattern_matches(pattern: str, name_lower: str) -> bool:
    """Return True when *pattern* occurs in the already-normalized name."""
    if pattern in SHORT_PATTERNS or len(pattern) <= _WORD_BOUNDED_MAX_LEN:
        # Must be an exact word (or word sequence), not part of a longer word
        return re.search(r'\b' + re.escape(pattern) + r'\b', name_lower) is not None
    # For longer patterns, substring matching is fine
    return pattern in name_lower


def lookup_known_org(name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization in KNOWN_ORGANIZATIONS.

    Args:
        name: Organization name in any casing.

    Returns:
        The (province, city_code, type, abbreviation) tuple of the most
        specific matching pattern, or None when no pattern matches.

    When several patterns match, the longest one wins, so that a specific
    entry such as 'rijksmuseum muiderslot' is not shadowed by the generic
    'rijksmuseum', and 'nederlands dans theater' is not shadowed by 'dans'.
    Ties are broken by table order.
    """
    name_lower = normalize_name(name)
    matches = [
        pattern for pattern in KNOWN_ORGANIZATIONS
        if _pattern_matches(pattern, name_lower)
    ]
    if not matches:
        return None
    # max() returns the first maximal element, which preserves table order
    # among equally long patterns.
    return KNOWN_ORGANIZATIONS[max(matches, key=len)]
|
||
|
||
|
||
def detect_city(name: str) -> Optional[Tuple[str, str]]:
    """Detect a Dutch city in *name*.

    Args:
        name: Organization name in any casing.

    Returns:
        The (province, city_code) pair of the first pattern in CITY_PATTERNS
        that matches, or None when no city is recognised.

    A match that lies entirely inside the text matched by another pattern is
    ignored, so that "Bergen op Zoom" is not reported as "Bergen". Apart
    from that, table order decides, as before.
    """
    name_lower = normalize_name(name)

    # Collect every hit with the span of text it covers, in table order.
    hits = []
    for pattern, location in CITY_PATTERNS.items():
        match = re.search(pattern, name_lower)
        if match:
            hits.append((match.span(), location))

    for span, location in hits:
        shadowed = any(
            other != span and other[0] <= span[0] and span[1] <= other[1]
            for other, _ in hits
        )
        if not shadowed:
            return location
    return None
|
||
|
||
|
||
def detect_non_dutch(name: str) -> Optional[str]:
    """Return the country code of the first non-Dutch indicator found in *name*.

    The name is normalized and tested against NON_DUTCH_PATTERNS in table
    order; None means no foreign indicator was found.
    """
    lowered = normalize_name(name)
    country_hits = (
        country_code
        for regex, country_code in NON_DUTCH_PATTERNS.items()
        if re.search(regex, lowered)
    )
    return next(country_hits, None)
|
||
|
||
|
||
def infer_type(name: str) -> str:
    """Guess the institution type code from keywords in *name*.

    Keywords from TYPE_KEYWORDS are tried in table order as plain substrings
    of the normalized name; the first one present decides. Names without any
    known keyword are treated as museums ('M').
    """
    lowered = normalize_name(name)
    candidates = (
        code for keyword, code in TYPE_KEYWORDS.items() if keyword in lowered
    )
    # Default to Museum
    return next(candidates, 'M')
|
||
|
||
|
||
def generate_abbreviation(name: str) -> str:
    """Build an upper-case abbreviation from the initials of *name*.

    The name is split on whitespace, hyphens, quotes and parentheses.
    Articles, prepositions and generic institution words are skipped, as are
    words that do not start with a letter. At most eight initials are kept;
    'UNK' is returned when nothing remains.
    """
    stopwords = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
        'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on',
        'stichting', 'museum', 'archief', 'bibliotheek',
    ))

    initials = []
    for token in re.split(r'[\s\-\'\"\(\)]+', name):
        if not token or token.lower() in stopwords:
            continue
        first_char = token[0]
        if first_char.isalpha():
            initials.append(first_char.upper())

    joined = ''.join(initials)
    return joined[:8] or 'UNK'
|
||
|
||
|
||
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file and return its top-level mapping.

    Args:
        filepath: Path of the YAML file to read (UTF-8).

    Returns:
        The parsed mapping, or None when the file cannot be read, is not
        valid UTF-8, is not valid YAML, or does not contain a mapping at the
        top level (callers access the result with ``.get``).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable file or undecodable bytes.
        return None
    except yaml.YAMLError:
        # Malformed YAML. Only expected failures are caught, so that
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return None
    return data if isinstance(data, dict) else None
|
||
|
||
|
||
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as block-style YAML, replacing any existing file.

    Key order is preserved, non-ASCII characters are written as-is, and
    lines are wrapped at 120 characters.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
|
||
|
||
|
||
def _append_provenance_note(data: Dict, note: str) -> None:
    """Append *note* to data['provenance']['notes'], creating containers as needed."""
    provenance = data.get('provenance')
    if provenance is None:
        # Key missing or explicitly null in the YAML.
        provenance = data['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif not isinstance(notes, list):
        # A single note stored as a scalar is promoted to a list.
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _plan_resolution(name: str, stem: str) -> Optional[Tuple[str, str, str]]:
    """Return (status, new_ghcid, note_prefix) for the first strategy that applies, else None."""
    # Strategy 1: Check if non-Dutch
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        # Reclassify to different country; the file keeps its PENDING id.
        new_ghcid = stem.replace('NL-XX-XXX-PENDING-', f'{new_country}-XX-XXX-PENDING-')
        return ('reclassified', new_ghcid,
                f"GHCID reclassified to {new_country} via country re-detection")

    # Strategy 2: Known organization lookup
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        return ('resolved', f"NL-{prov}-{city}-{inst_type}-{abbrev}",
                "GHCID resolved via known org lookup")

    # Strategy 3: City name extraction
    city_info = detect_city(name)
    if city_info:
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        return ('resolved', f"NL-{prov}-{city}-{inst_type}-{abbrev}",
                "GHCID resolved via city extraction")

    return None


def resolve_pending_file(filepath: Path, custodian_dir: Path, dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file.

    Strategies are tried in this order and the first applicable one decides:
    country re-detection, known organization lookup, city name extraction.

    Args:
        filepath: The PENDING YAML file to resolve.
        custodian_dir: Directory in which the renamed file is created.
        dry_run: When True, nothing is written or deleted; the function only
            reports what would happen.

    Returns: (status, new_filepath)
    Status:
        'resolved'     - a Dutch GHCID was derived; new_filepath is the target
        'reclassified' - moved to another country prefix; new_filepath is the target
        'collision'    - the target file already exists; new_filepath is that file
        'failed'       - no strategy applied; new_filepath is None
        'error'        - file unreadable or without an emic name; new_filepath is None

    When not in dry-run mode, a successful resolution sets 'ghcid_current',
    appends a provenance note, writes the new file and deletes the old one.
    """
    data = load_yaml(filepath)
    if not data or not isinstance(data, dict):
        return ('error', None)

    # 'custodian_name' may be missing or explicitly null in the YAML.
    custodian_name = data.get('custodian_name')
    name = custodian_name.get('emic_name') if isinstance(custodian_name, dict) else None
    if not name or not isinstance(name, str):
        return ('error', None)

    plan = _plan_resolution(name, filepath.stem)
    if plan is None:
        return ('failed', None)
    status, new_ghcid, note_prefix = plan

    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    if new_filepath.exists():
        # Never overwrite an existing record.
        return ('collision', new_filepath)

    if not dry_run:
        data['ghcid_current'] = new_ghcid
        _append_provenance_note(
            data, f"{note_prefix} on {datetime.now(timezone.utc).isoformat()}")
        # Write the new file before removing the old one so that a failed
        # write never loses the record.
        save_yaml(new_filepath, data)
        filepath.unlink()

    return (status, new_filepath)
|
||
|
||
|
||
def main():
    """Command-line entry point: resolve all NL PENDING files in a directory.

    Parses the arguments, processes every 'NL-XX-XXX-PENDING-*.yaml' file in
    the custodian directory (optionally limited to the first N in sorted
    order) and prints one entry per resolved or reclassified file followed
    by a summary of all statuses.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Resolve NL PENDING custodian files to definitive GHCIDs.")
    parser.add_argument('--dry-run', action='store_true',
                        help="report what would change without writing or deleting files")
    parser.add_argument('--limit', type=int, default=0,
                        help="process at most this many files (0 = no limit)")
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help="directory containing the custodian YAML files")
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # Otherwise the glob below silently yields nothing.
        parser.error(f"custodian directory not found: {custodian_dir}")
    if args.limit < 0:
        # A negative slice bound would silently drop files from the end.
        parser.error("--limit must be zero or a positive number")

    print("=" * 80)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    stats = {'resolved': 0, 'reclassified': 0, 'collision': 0, 'failed': 0, 'error': 0}

    for filepath in pending_files:
        # The file is read here only to obtain the display name;
        # resolve_pending_file() loads it again itself.
        data = load_yaml(filepath)
        if not data or not isinstance(data, dict):
            stats['error'] += 1
            continue

        # 'custodian_name' may be missing or null; a file without a usable
        # emic name is an error, the same outcome resolve_pending_file()
        # reports for an empty name.
        custodian_name = data.get('custodian_name')
        name = custodian_name.get('emic_name') if isinstance(custodian_name, dict) else None
        if not name or not isinstance(name, str):
            stats['error'] += 1
            continue

        status, new_path = resolve_pending_file(filepath, custodian_dir, args.dry_run)
        # Tolerate a status that is not pre-seeded in the table.
        stats[status] = stats.get(status, 0) + 1

        if status in ['resolved', 'reclassified']:
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()
|