glam/scripts/resolve_pending_comprehensive.py
2026-01-09 20:35:19 +01:00

905 lines
33 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Comprehensive PENDING file resolver.

For every NL-XX-XXX-PENDING-* file the strategies are tried in this order:
1. Country re-detection for misclassified (non-Dutch) files
2. Known organization lookup table
3. City name extraction from the emic name

A Wikidata lookup for the remaining files is a planned fourth strategy; the
code in this script does not perform it.

Usage:
    python scripts/resolve_pending_comprehensive.py --dry-run
    python scripts/resolve_pending_comprehensive.py --limit 100
    python scripts/resolve_pending_comprehensive.py
"""
import os
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
# Known Dutch organizations with their locations.
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
# Keys are lower-case because they are compared against the lower-cased name.
# Every key must be unique: a repeated key in a dict literal is silently
# overridden by its last occurrence.
KNOWN_ORGANIZATIONS = {
    # Government - Ministeries
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
    'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
    'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
    'ministerie van sociale zaken': ('ZH', 'DHA', 'O', 'MSZ'),
    'ministerie van economische zaken': ('ZH', 'DHA', 'O', 'MEZ'),
    'ministerie van volksgezondheid': ('ZH', 'DHA', 'O', 'MVW'),
    'ministerie van binnenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van infrastructuur': ('ZH', 'DHA', 'O', 'MIW'),
    'ministerie van landbouw': ('ZH', 'DHA', 'O', 'MLN'),
    # Government - Agencies
    'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
    'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
    'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
    'kadaster': ('GE', 'APE', 'O', 'K'),
    'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
    'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'fiod': ('ZH', 'DHA', 'O', 'FIOD'),
    'ssc-ict': ('ZH', 'DHA', 'O', 'SSC'),
    'raad voor de kinderbescherming': ('ZH', 'DHA', 'O', 'RVK'),
    'immigratie- en naturalisatiedienst': ('ZH', 'DHA', 'O', 'IND'),
    'ind': ('ZH', 'DHA', 'O', 'IND'),
    'coa': ('ZH', 'DHA', 'O', 'COA'),
    'centraal orgaan opvang asielzoekers': ('ZH', 'DHA', 'O', 'COA'),
    'sociale verzekeringsbank': ('NH', 'AME', 'O', 'SVB'),
    'uwv': ('NH', 'AMS', 'O', 'UWV'),
    'kamer van koophandel': ('UT', 'UTR', 'O', 'KVK'),
    'autoriteit persoonsgegevens': ('ZH', 'DHA', 'O', 'AP'),
    'belastingdienst': ('UT', 'UTR', 'O', 'BD'),
    'autoriteit financiele markten': ('NH', 'AMS', 'O', 'AFM'),
    'de nederlandsche bank': ('NH', 'AMS', 'O', 'DNB'),
    'cbs': ('ZH', 'DHA', 'O', 'CBS'),
    'centraal bureau voor de statistiek': ('ZH', 'DHA', 'O', 'CBS'),
    # Amersfoort lies in the province of Utrecht, not Gelderland.
    'rijksdienst voor het cultureel erfgoed': ('UT', 'AME', 'O', 'RCE'),
    'rijksdienst voor ondernemend nederland': ('ZH', 'DHA', 'O', 'RVO'),
    'raad van state': ('ZH', 'DHA', 'O', 'RVS'),
    'raad voor cultuur': ('ZH', 'DHA', 'O', 'RVC'),
    # Education - Universities
    'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
    'academie minerva': ('GR', 'GRO', 'E', 'AM'),
    'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
    'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
    'erasmus universiteit': ('ZH', 'ROT', 'E', 'EUR'),
    'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
    'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
    'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
    'leiden university': ('ZH', 'LEI', 'E', 'UL'),
    'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
    'utrecht university': ('UT', 'UTR', 'E', 'UU'),
    'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
    'university of groningen': ('GR', 'GRO', 'E', 'RUG'),
    'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
    'tu delft': ('ZH', 'DEL', 'E', 'TUD'),
    'delft university': ('ZH', 'DEL', 'E', 'TUD'),
    'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tu eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tue': ('NB', 'EIN', 'E', 'TUE'),
    'wageningen university': ('GE', 'WAG', 'E', 'WUR'),
    'wageningen universiteit': ('GE', 'WAG', 'E', 'WUR'),
    'radboud university': ('GE', 'NIJ', 'E', 'RU'),
    'radboud universiteit': ('GE', 'NIJ', 'E', 'RU'),
    'tilburg university': ('NB', 'TIL', 'E', 'TIU'),
    'universiteit tilburg': ('NB', 'TIL', 'E', 'TIU'),
    'maastricht university': ('LI', 'MAA', 'E', 'UM'),
    'universiteit maastricht': ('LI', 'MAA', 'E', 'UM'),
    'open universiteit': ('LI', 'HEE', 'E', 'OU'),
    # Breukelen lies in the province of Utrecht, not Noord-Holland.
    'nyenrode': ('UT', 'BRE', 'E', 'NYE'),
    'royal academy of art': ('ZH', 'DHA', 'E', 'KABK'),
    'koninklijke academie van beeldende kunsten': ('ZH', 'DHA', 'E', 'KABK'),
    # Education - Hogescholen
    'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
    'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
    'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),
    'hogeschool inholland': ('NH', 'AMS', 'E', 'INH'),
    'inholland': ('NH', 'AMS', 'E', 'INH'),
    'hogeschool leiden': ('ZH', 'LEI', 'E', 'HL'),
    'haagse hogeschool': ('ZH', 'DHA', 'E', 'HH'),
    'saxion': ('OV', 'ENS', 'E', 'SAX'),
    'avans': ('NB', 'BRE', 'E', 'AVA'),
    'fontys': ('NB', 'EIN', 'E', 'FON'),
    'zuyd hogeschool': ('LI', 'MAA', 'E', 'ZH'),
    'hanzehogeschool': ('GR', 'GRO', 'E', 'HAN'),
    'hku': ('UT', 'UTR', 'E', 'HKU'),
    'design academy eindhoven': ('NB', 'EIN', 'E', 'DAE'),
    'gerrit rietveld academie': ('NH', 'AMS', 'E', 'GRA'),
    'koninklijk conservatorium': ('ZH', 'DHA', 'E', 'KC'),
    'conservatorium van amsterdam': ('NH', 'AMS', 'E', 'CVA'),
    'codarts': ('ZH', 'ROT', 'E', 'COD'),
    'artez': ('GE', 'ARN', 'E', 'ARZ'),
    # Museums - Amsterdam
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),
    'stedelijk museum amsterdam': ('NH', 'AMS', 'M', 'SMA'),
    'rijksmuseum': ('NH', 'AMS', 'M', 'RM'),
    'ons lieve heer op solder': ('NH', 'AMS', 'M', 'OLHOS'),
    'rembrandthuis': ('NH', 'AMS', 'M', 'RH'),
    'amsterdam museum': ('NH', 'AMS', 'M', 'AM'),
    'artis': ('NH', 'AMS', 'M', 'ART'),
    'nemo science museum': ('NH', 'AMS', 'M', 'NEMO'),
    'eye filmmuseum': ('NH', 'AMS', 'M', 'EYE'),
    'moco museum': ('NH', 'AMS', 'M', 'MOCO'),
    'hermitage amsterdam': ('NH', 'AMS', 'M', 'HA'),
    'tassenmuseum hendrikje': ('NH', 'AMS', 'M', 'TMH'),
    'willet-holthuysen': ('NH', 'AMS', 'M', 'WH'),
    'geelvinck hinlopen huis': ('NH', 'AMS', 'M', 'GHH'),
    'hortus botanicus': ('NH', 'AMS', 'M', 'HB'),
    'multatuli museum': ('NH', 'AMS', 'M', 'MM'),
    'beurs van berlage': ('NH', 'AMS', 'M', 'BVB'),
    'de brakke grond': ('NH', 'AMS', 'M', 'DBG'),
    # Museums - Den Haag
    'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
    'gemeentemuseum': ('ZH', 'DHA', 'M', 'GMH'),
    'kunstmuseum den haag': ('ZH', 'DHA', 'M', 'KDH'),
    'escher in het paleis': ('ZH', 'DHA', 'M', 'EHP'),
    'museon': ('ZH', 'DHA', 'M', 'MUS'),
    'omniversum': ('ZH', 'DHA', 'M', 'OMN'),
    'louwman museum': ('ZH', 'DHA', 'M', 'LM'),
    'museum de gevangenpoort': ('ZH', 'DHA', 'M', 'MDG'),
    'museum meermanno': ('ZH', 'DHA', 'M', 'MMO'),
    'bredius museum': ('ZH', 'DHA', 'M', 'BM'),
    'panorama mesdag': ('ZH', 'DHA', 'M', 'PM'),
    'madurodam': ('ZH', 'DHA', 'M', 'MAD'),
    'haags historisch museum': ('ZH', 'DHA', 'M', 'HHM'),
    'beelden aan zee': ('ZH', 'DHA', 'M', 'BAZ'),
    'museum het paleis': ('ZH', 'DHA', 'M', 'MHP'),
    # Museums - Rotterdam
    'museum boijmans van beuningen': ('ZH', 'ROT', 'M', 'MBVB'),
    'boijmans': ('ZH', 'ROT', 'M', 'MBVB'),
    'maritiem museum': ('ZH', 'ROT', 'M', 'MM'),
    'het nieuwe instituut': ('ZH', 'ROT', 'M', 'HNI'),
    'chabot museum': ('ZH', 'ROT', 'M', 'CM'),
    'kunsthal rotterdam': ('ZH', 'ROT', 'M', 'KR'),
    'wereldmuseum': ('ZH', 'ROT', 'M', 'WM'),
    'museum rotterdam': ('ZH', 'ROT', 'M', 'MR'),
    'fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'ss rotterdam': ('ZH', 'ROT', 'M', 'SSR'),
    'fenixloods': ('ZH', 'ROT', 'M', 'FL'),
    # Museums - Other cities
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
    'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
    'museum catharijneconvent': ('UT', 'UTR', 'M', 'MC'),
    'centraal museum': ('UT', 'UTR', 'M', 'CMU'),
    'spoorwegmuseum': ('UT', 'UTR', 'M', 'SPW'),
    'het utrechts archief': ('UT', 'UTR', 'A', 'HUA'),
    'museum speelklok': ('UT', 'UTR', 'M', 'MS'),
    'museum van oudheden': ('ZH', 'LEI', 'M', 'MVO'),
    'molenmuseum de valk': ('ZH', 'LEI', 'M', 'MDV'),
    'stedelijk museum schiedam': ('ZH', 'SCH', 'M', 'SMS'),
    'bonnefantenmuseum': ('LI', 'MAA', 'M', 'BFM'),
    'marres': ('LI', 'MAA', 'M', 'MAR'),
    'museum aan het vrijthof': ('LI', 'MAA', 'M', 'MAV'),
    'drents museum': ('DR', 'ASS', 'M', 'DM'),
    'groninger museum': ('GR', 'GRO', 'M', 'GM'),
    'fries museum': ('FR', 'LEE', 'M', 'FM'),  # Requires word boundary (in SHORT_PATTERNS)
    'westfries museum': ('NH', 'HOO', 'M', 'WFM'),  # In Hoorn
    'museum belvédère': ('FR', 'ORN', 'M', 'MB'),
    'princessehof': ('FR', 'LEE', 'M', 'PH'),
    'zuiderzeemuseum': ('NH', 'ENK', 'M', 'ZZM'),
    'rijksmuseum muiderslot': ('NH', 'MUI', 'M', 'RMM'),
    'teylers museum': ('NH', 'HAA', 'M', 'TYM'),
    'frans hals museum': ('NH', 'HAA', 'M', 'FHM'),
    'museum de fundatie': ('OV', 'ZWO', 'M', 'MDF'),
    'museum twentse welle': ('OV', 'ENS', 'M', 'MTW'),
    'rijksmuseum van oudheden': ('ZH', 'LEI', 'M', 'RMO'),
    'museum volkenkunde': ('ZH', 'LEI', 'M', 'MVK'),
    'museon-omniversum': ('ZH', 'DHA', 'M', 'MO'),
    'literatuurmuseum': ('ZH', 'DHA', 'M', 'LM'),
    'kinderboekenmuseum': ('ZH', 'DHA', 'M', 'KBM'),
    'van abbemuseum': ('NB', 'EIN', 'M', 'VAM'),
    'philips museum': ('NB', 'EIN', 'M', 'PHM'),
    'textielmuseum': ('NB', 'TIL', 'M', 'TXM'),
    'de pont': ('NB', 'TIL', 'M', 'DP'),
    'noordbrabants museum': ('NB', 'DBO', 'M', 'NBM'),
    'musis sacrum': ('GE', 'ARN', 'M', 'MUS'),
    'museum arnhem': ('GE', 'ARN', 'M', 'MA'),
    'afrika museum': ('GE', 'BER', 'M', 'AFM'),
    'museum het valkhof': ('GE', 'NIJ', 'M', 'MHV'),
    'hunebedcentrum': ('DR', 'BOR', 'M', 'HC'),
    'museum drachten': ('FR', 'DRA', 'M', 'MDR'),
    'openluchtmuseum': ('GE', 'ARN', 'M', 'OLM'),
    'nederlands openluchtmuseum': ('GE', 'ARN', 'M', 'NOLM'),
    # Archives
    'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    # NOTE(review): 'BE' sits in the province slot, so this Belgian archive is
    # emitted as "NL-BE-GEN-A-AMS" by the resolver -- confirm the intended ID.
    'amsab': ('BE', 'GEN', 'A', 'AMS'),  # Belgian
    'noord-hollands archief': ('NH', 'HAA', 'A', 'NHA'),
    'brabants historisch informatie centrum': ('NB', 'DBO', 'A', 'BHIC'),
    'gelders archief': ('GE', 'ARN', 'A', 'GA'),
    'zeeuws archief': ('ZE', 'MID', 'A', 'ZA'),
    'tresoar': ('FR', 'LEE', 'A', 'TRE'),
    'drents archief': ('DR', 'ASS', 'A', 'DA'),
    'groninger archieven': ('GR', 'GRO', 'A', 'GRA'),
    'historisch centrum overijssel': ('OV', 'ZWO', 'A', 'HCO'),
    'regionaal archief tilburg': ('NB', 'TIL', 'A', 'RAT'),
    # 'erfgoed brabant' is listed once, under Provincial/Regional below.
    'stadsarchief rotterdam': ('ZH', 'ROT', 'A', 'SAR'),
    'stadsarchief delft': ('ZH', 'DEL', 'A', 'SAD'),
    'regionaal historisch centrum limburg': ('LI', 'MAA', 'A', 'RHCL'),
    # Libraries
    'koninklijke bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'nationale bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'openbare bibliotheek amsterdam': ('NH', 'AMS', 'L', 'OBA'),
    'oba': ('NH', 'AMS', 'L', 'OBA'),
    'universiteitbibliotheek': ('NH', 'AMS', 'L', 'UBA'),
    'atria': ('NH', 'AMS', 'L', 'ATR'),
    'bibliotheek rotterdam': ('ZH', 'ROT', 'L', 'BR'),
    'bibliotheek den haag': ('ZH', 'DHA', 'L', 'BDH'),
    'bibliotheek utrecht': ('UT', 'UTR', 'L', 'BU'),
    # Research
    'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'knaw': ('NH', 'AMS', 'R', 'KNAW'),
    'koninklijke nederlandse akademie van wetenschappen': ('NH', 'AMS', 'R', 'KNAW'),
    'nwo': ('ZH', 'DHA', 'R', 'NWO'),
    'rivm': ('UT', 'BIL', 'R', 'RIVM'),
    'tno': ('ZH', 'DHA', 'R', 'TNO'),
    'meertens instituut': ('NH', 'AMS', 'R', 'MI'),
    'huygens instituut': ('NH', 'AMS', 'R', 'HI'),
    'internationaal instituut voor sociale geschiedenis': ('NH', 'AMS', 'R', 'IISG'),
    'iisg': ('NH', 'AMS', 'R', 'IISG'),
    'rathenau instituut': ('ZH', 'DHA', 'R', 'RAT'),
    'planbureau voor de leefomgeving': ('ZH', 'DHA', 'R', 'PBL'),
    'sociaal en cultureel planbureau': ('ZH', 'DHA', 'R', 'SCP'),
    'cpb': ('ZH', 'DHA', 'R', 'CPB'),
    'centraal planbureau': ('ZH', 'DHA', 'R', 'CPB'),
    # NOTE(review): 'DEV' is the code used for Deventer (Overijssel) in
    # CITY_PATTERNS; KNMI is based in De Bilt -- confirm the city code.
    'knmi': ('UT', 'DEV', 'R', 'KNMI'),
    'nivel': ('UT', 'UTR', 'R', 'NIV'),
    'deltaresearch': ('ZH', 'DEL', 'R', 'DEL'),
    'deltares': ('ZH', 'DEL', 'R', 'DEL'),
    'nidi': ('ZH', 'DHA', 'R', 'NIDI'),
    'dans': ('ZH', 'DHA', 'R', 'DANS'),
    'surf': ('UT', 'UTR', 'R', 'SURF'),
    # NGOs/Foundations
    'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
    'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
    'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
    'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
    'fonds podiumkunsten': ('ZH', 'DHA', 'N', 'FPK'),
    'letterenfonds': ('NH', 'AMS', 'N', 'LF'),
    'filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'nederlands filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'bng cultuurfonds': ('ZH', 'DHA', 'N', 'BNG'),
    'prins bernhard cultuurfonds': ('NH', 'AMS', 'N', 'PBC'),
    'vsc': ('NH', 'AMS', 'N', 'VSC'),
    'cultuur + ondernemen': ('NH', 'AMS', 'N', 'CO'),
    'erfgoedvereniging heemschut': ('NH', 'AMS', 'N', 'EH'),
    'heemschut': ('NH', 'AMS', 'N', 'EH'),
    'boekmanstichting': ('NH', 'AMS', 'N', 'BS'),
    'lira': ('NH', 'AMS', 'N', 'LIRA'),
    'pictoright': ('NH', 'AMS', 'N', 'PR'),
    'buma stemra': ('NH', 'AMS', 'N', 'BS'),
    'senafonds': ('NH', 'AMS', 'N', 'SEN'),
    # Performing Arts
    'nederlands dans theater': ('ZH', 'DHA', 'M', 'NDT'),
    'ndt': ('ZH', 'DHA', 'M', 'NDT'),
    'het nationale ballet': ('NH', 'AMS', 'M', 'HNB'),
    'nationale opera': ('NH', 'AMS', 'M', 'DNO'),
    'de nationale opera & ballet': ('NH', 'AMS', 'M', 'NOB'),
    'concertgebouw': ('NH', 'AMS', 'M', 'CG'),
    'koninklijk concertgebouworkest': ('NH', 'AMS', 'M', 'KCO'),
    'residentie orkest': ('ZH', 'DHA', 'M', 'RO'),
    'rotterdams philharmonisch': ('ZH', 'ROT', 'M', 'RPO'),
    'nederlands kamerorkest': ('NH', 'AMS', 'M', 'NKO'),
    'holland festival': ('NH', 'AMS', 'M', 'HF'),
    'internationaal theater amsterdam': ('NH', 'AMS', 'M', 'ITA'),
    'ita': ('NH', 'AMS', 'M', 'ITA'),
    'stadsschouwburg': ('NH', 'AMS', 'M', 'SSB'),
    'theater carré': ('NH', 'AMS', 'M', 'TC'),
    'de la mar theater': ('NH', 'AMS', 'M', 'DLM'),
    'schouwburg': ('NH', 'AMS', 'M', 'SCH'),
    'muziekgebouw aan t ij': ('NH', 'AMS', 'M', 'MATI'),
    'bimhuis': ('NH', 'AMS', 'M', 'BH'),
    'paradiso': ('NH', 'AMS', 'M', 'PAR'),
    'melkweg': ('NH', 'AMS', 'M', 'MW'),
    'doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'de doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'ahoy': ('ZH', 'ROT', 'M', 'AH'),
    'tivoli vredenburg': ('UT', 'UTR', 'M', 'TV'),
    'theater aan het spui': ('ZH', 'DHA', 'M', 'TAS'),
    'zuiderstrandtheater': ('ZH', 'DHA', 'M', 'ZST'),
    'lucent danstheater': ('ZH', 'DHA', 'M', 'LDT'),
    'chassé theater': ('NB', 'BRE', 'M', 'CT'),
    'parktheater': ('NB', 'EIN', 'M', 'PT'),
    # Media/Broadcasting
    'npo': ('NH', 'HIL', 'M', 'NPO'),
    'nos': ('NH', 'HIL', 'M', 'NOS'),
    'ntr': ('NH', 'HIL', 'M', 'NTR'),
    'avro': ('NH', 'HIL', 'M', 'AVRO'),
    'avrotros': ('NH', 'HIL', 'M', 'AT'),
    'vara': ('NH', 'HIL', 'M', 'VARA'),
    'bnnvara': ('NH', 'HIL', 'M', 'BV'),
    'eo': ('NH', 'HIL', 'M', 'EO'),
    'kro': ('NH', 'HIL', 'M', 'KRO'),
    'kro-ncrv': ('NH', 'HIL', 'M', 'KN'),
    'vpro': ('NH', 'HIL', 'M', 'VPRO'),
    'max': ('NH', 'HIL', 'M', 'MAX'),
    'omroep max': ('NH', 'HIL', 'M', 'MAX'),
    'beeld en geluid': ('NH', 'HIL', 'M', 'BEG'),
    'beelden en geluid': ('NH', 'HIL', 'M', 'BEG'),
    # Religious/Holy Sites
    'protestantse kerk in nederland': ('UT', 'UTR', 'H', 'PKN'),
    'pkn': ('UT', 'UTR', 'H', 'PKN'),
    'bisdom utrecht': ('UT', 'UTR', 'H', 'BU'),
    'aartsbisdom utrecht': ('UT', 'UTR', 'H', 'ABU'),
    'bisdom haarlem': ('NH', 'HAA', 'H', 'BH'),
    'bisdom rotterdam': ('ZH', 'ROT', 'H', 'BR'),
    'bisdom breda': ('NB', 'BRE', 'H', 'BB'),
    'bisdom den bosch': ('NB', 'DBO', 'H', 'BDB'),
    # Provincial/Regional
    'rijnbrink': ('GE', 'ARN', 'N', 'RB'),
    'erfgoed zeeland': ('ZE', 'MID', 'N', 'EZ'),
    # Only entry for this key. An earlier duplicate under Archives (type 'A')
    # was always overridden by this value, so the effective mapping is unchanged.
    'erfgoed brabant': ('NB', 'TIL', 'N', 'EB'),
    'erfgoed gelderland': ('GE', 'ARN', 'N', 'EG'),
    'erfgoed overijssel': ('OV', 'ZWO', 'N', 'EO'),
    'monumentenwacht': ('NH', 'AMS', 'N', 'MW'),
    'erfgoedcentrum': ('UT', 'UTR', 'N', 'EC'),
}
# Additional city patterns to detect (Dutch cities).
# Maps a regex (applied to the lower-cased name) to (province, city_code).
# Patterns that start with an apostrophe cannot use a leading \b: there is no
# word boundary between whitespace (or the start of the string) and "'", so
# they use a "not preceded by a word character" lookbehind instead.
CITY_PATTERNS = {
    # Major cities
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\bthe hague\b': ('ZH', 'DHA'),
    r"(?<!\w)['\u2019]s-gravenhage\b": ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bgroningen\b': ('GR', 'GRO'),
    # Zuid-Holland
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bdordrecht\b': ('ZH', 'DOR'),
    r'\bgouda\b': ('ZH', 'GOU'),
    r'\bschiedam\b': ('ZH', 'SCH'),
    r'\bzoetermeer\b': ('ZH', 'ZOE'),
    r'\bwestland\b': ('ZH', 'WES'),
    r'\balphen aan den rijn\b': ('ZH', 'ALP'),
    r'\bvlaardingen\b': ('ZH', 'VLA'),
    r'\bcapelle\b': ('ZH', 'CAP'),
    r'\bvoorburg\b': ('ZH', 'VOO'),
    r'\brijswijk\b': ('ZH', 'RIJ'),
    # Noord-Holland
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\balkmaar\b': ('NH', 'ALK'),
    r'\bhilversum\b': ('NH', 'HIL'),
    r'\bzaandam\b': ('NH', 'ZAA'),
    r'\bzaanstad\b': ('NH', 'ZAA'),
    r'\bhoorn\b': ('NH', 'HOO'),
    r'\benkhuizen\b': ('NH', 'ENK'),
    r'\bedam\b': ('NH', 'EDA'),
    r'\bvolendam\b': ('NH', 'VOL'),
    r'\bhaarlemmermeer\b': ('NH', 'HLM'),
    r'\bpurmerend\b': ('NH', 'PUR'),
    r'\bmuiden\b': ('NH', 'MUI'),
    r'\bnaarden\b': ('NH', 'NAA'),
    r'\bbussum\b': ('NH', 'BUS'),
    r'\bbloemendaal\b': ('NH', 'BLO'),
    r'\bheemstede\b': ('NH', 'HEE'),
    r'\blaren\b': ('NH', 'LAR'),
    # The lookahead keeps "Bergen op Zoom" (Noord-Brabant) from being
    # claimed by Bergen (Noord-Holland), which is listed earlier.
    r'\bbergen\b(?! op zoom\b)': ('NH', 'BER'),
    # Gelderland
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bede\b': ('GE', 'EDE'),
    r'\bwageningen\b': ('GE', 'WAG'),
    r'\bhattem\b': ('GE', 'HAT'),
    r'\belburg\b': ('GE', 'ELB'),
    r'\bharderwijk\b': ('GE', 'HAR'),
    r'\bdoetinchem\b': ('GE', 'DOE'),
    r'\bzutphen\b': ('GE', 'ZUT'),
    r'\bzevenaar\b': ('GE', 'ZEV'),
    r'\btiel\b': ('GE', 'TIE'),
    r'\botterlo\b': ('GE', 'OTT'),
    r'\bburen\b': ('GE', 'BUR'),
    r'\bbarneveld\b': ('GE', 'BAR'),
    r'\bepe\b': ('GE', 'EPE'),
    r'\beerde\b': ('GE', 'EER'),
    r'\bberkum\b': ('GE', 'BRK'),
    # Noord-Brabant
    r'\btilburg\b': ('NB', 'TIL'),
    r'\bbreda\b': ('NB', 'BRE'),
    r"(?<!\w)['\u2019]s-hertogenbosch\b": ('NB', 'DBO'),
    r'\bden bosch\b': ('NB', 'DBO'),
    r'\bhelmond\b': ('NB', 'HEL'),
    r'\boss\b': ('NB', 'OSS'),  # was misspelled "ossen"
    r'\broosendaal\b': ('NB', 'ROO'),  # was misspelled "roovendaal"
    r'\bbergen op zoom\b': ('NB', 'BOZ'),
    r'\bvught\b': ('NB', 'VUG'),
    r'\bwaalwijk\b': ('NB', 'WAA'),
    r'\bboxtel\b': ('NB', 'BOX'),
    r'\bveldhoven\b': ('NB', 'VEL'),
    r'\bbest\b': ('NB', 'BST'),
    r'\bgeertruidenberg\b': ('NB', 'GEE'),
    r'\bheusden\b': ('NB', 'HEU'),
    # Limburg
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\bvenlo\b': ('LI', 'VEN'),
    r'\broermond\b': ('LI', 'ROE'),
    r'\bheerlen\b': ('LI', 'HEE'),
    r'\bsittard\b': ('LI', 'SIT'),
    r'\bgeleen\b': ('LI', 'GEL'),
    r'\bweert\b': ('LI', 'WEE'),
    r'\bvalkenburg\b': ('LI', 'VAL'),
    r'\bkerkrade\b': ('LI', 'KER'),
    r'\bbrunssum\b': ('LI', 'BRU'),  # was misspelled "brunsum"
    # Overijssel
    r'\bzwolle\b': ('OV', 'ZWO'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bkampen\b': ('OV', 'KAM'),
    r'\benschede\b': ('OV', 'ENS'),
    r'\bhengelo\b': ('OV', 'HEN'),
    r'\balmelo\b': ('OV', 'ALM'),
    r'\boldenzaal\b': ('OV', 'OLD'),
    r'\bsteenwijk\b': ('OV', 'STE'),
    r'\bhasselt\b': ('OV', 'HAS'),
    r'\bgiethoorn\b': ('OV', 'GIE'),
    r'\braalte\b': ('OV', 'RAA'),
    r'\bijsselmuiden\b': ('OV', 'IJS'),
    # Friesland
    r'\bleeuwarden\b': ('FR', 'LEE'),
    r'\bljouwert\b': ('FR', 'LEE'),
    r'\bdrachten\b': ('FR', 'DRA'),
    r'\bheerenveen\b': ('FR', 'HVE'),
    r'\bsneek\b': ('FR', 'SNE'),
    r'\bfraneker\b': ('FR', 'FRA'),
    r'\bharlingen\b': ('FR', 'HAR'),
    r'\bbolsward\b': ('FR', 'BOL'),
    r'\bworkum\b': ('FR', 'WOR'),
    r'\bterschelling\b': ('FR', 'TER'),
    r'\bameland\b': ('FR', 'AME'),
    r'\boranjewoud\b': ('FR', 'ORN'),
    # Drenthe
    r'\bassen\b': ('DR', 'ASS'),
    r'\bemmen\b': ('DR', 'EMM'),
    r'\bmeppel\b': ('DR', 'MEP'),
    r'\bhoogeveen\b': ('DR', 'HOO'),
    r'\bcoevorden\b': ('DR', 'COE'),
    r'\bborger\b': ('DR', 'BOR'),
    # Groningen (the city itself is listed once, under "Major cities")
    r'\bveendam\b': ('GR', 'VEE'),
    r'\bwinschoten\b': ('GR', 'WIN'),
    r'\bdelfzijl\b': ('GR', 'DEL'),
    r'\bappingedam\b': ('GR', 'APP'),
    r'\bhoogezand\b': ('GR', 'HOO'),
    # Zeeland
    r'\bmiddelburg\b': ('ZE', 'MID'),
    r'\bvlissingen\b': ('ZE', 'VLI'),
    r'\bgoes\b': ('ZE', 'GOE'),
    r'\bterneuzen\b': ('ZE', 'TER'),
    r'\bzierikzee\b': ('ZE', 'ZIE'),
    r'\bveere\b': ('ZE', 'VEE'),
    r'\bwestkapelle\b': ('ZE', 'WKA'),
    # Flevoland
    r'\balmere\b': ('FL', 'ALM'),
    r'\blelystad\b': ('FL', 'LEL'),
    r'\bdronten\b': ('FL', 'DRO'),
    r'\burk\b': ('FL', 'URK'),
    r'\bzeewolde\b': ('FL', 'ZEE'),
}
# Non-Dutch indicators - files with these should be reclassified.
# Maps a regex (applied to the lower-cased name) to a two-letter country code.
# Each pattern needs its own leading \b: for a word that starts with "b" the
# pattern therefore begins with "\bb".
NON_DUTCH_PATTERNS = {
    # Saudi Arabia / Arabic
    r'\bsaudi\b': 'SA',
    r'\bمشاريع\b': 'SA',
    r'\bوزارة\b': 'SA',
    r'\bالسعودية\b': 'SA',
    # France
    r'\bsainte-m[eè]re\b': 'FR',
    r'\bnouvelle-aquitaine\b': 'FR',
    r'\bparis\b': 'FR',
    r'\bfrance\b': 'FR',
    r'\bfran[çc]ais\b': 'FR',
    r'\bbnf\b': 'FR',
    r'\bircam\b': 'FR',
    r'\bfondation du patrimoine\b': 'FR',
    r'\bministère\b': 'FR',
    r'\blyon\b': 'FR',
    r'\bmarseille\b': 'FR',
    r'\bbordeaux\b': 'FR',  # was r'\bordeaux\b', which never matched "bordeaux"
    r'\bnantes\b': 'FR',
    r'\bstrasbourg\b': 'FR',
    r'\blille\b': 'FR',
    r'\btoulouse\b': 'FR',
    r'\bnice\b': 'FR',
    # Germany
    r'\bberlin\b': 'DE',
    r'\bweimar\b': 'DE',
    r'\bstiftung\b': 'DE',
    r'\bpreu[ßs]ischer\b': 'DE',
    r'\bmunich\b': 'DE',
    r'\bm[üu]nchen\b': 'DE',
    r'\bfrankfurt\b': 'DE',
    r'\bhamburg\b': 'DE',
    r'\bk[öo]ln\b': 'DE',
    r'\bd[üu]sseldorf\b': 'DE',
    r'\bstuttgart\b': 'DE',
    r'\bheidelberg\b': 'DE',
    r'\bdresden\b': 'DE',
    r'\bleipzig\b': 'DE',
    r'\bgerman\b': 'DE',
    r'\bgermany\b': 'DE',
    r'\bzentral\b': 'DE',
    r'\bzentrum\b': 'DE',
    r'\bkunstgeschichte\b': 'DE',
    r'\bteilhabe\b': 'DE',
    r'\bkulturelle\b': 'DE',
    # Belgium
    r'\bbelgium\b': 'BE',
    r'\bgent\b': 'BE',
    r'\bantwerp\b': 'BE',
    r'\bghent\b': 'BE',
    r'\bbrussels\b': 'BE',
    r'\bbrussel\b': 'BE',
    r'\bbruxelles\b': 'BE',
    r'\bleuven\b': 'BE',
    r'\bli[eè]ge\b': 'BE',  # accepts both "liege" and "liège"
    r'\bbelgique\b': 'BE',
    r'\bflemish\b': 'BE',
    r'\bvlaams\b': 'BE',
    # United Kingdom
    r'\bbritish\b': 'GB',
    r'\blondon\b': 'GB',
    r'\benglish\b': 'GB',
    r'\bengland\b': 'GB',
    r'\bscotland\b': 'GB',
    r'\bwales\b': 'GB',
    r'\bmanchester\b': 'GB',
    r'\bbirmingham\b': 'GB',
    r'\bedinburgh\b': 'GB',
    r'\bcardiff\b': 'GB',
    r'\boxford\b': 'GB',
    r'\bcambridge\b': 'GB',
    r'\bdurham\b': 'GB',
    r'\broyal armouries\b': 'GB',
    r'\broyal parks\b': 'GB',
    r'\bthe british academy\b': 'GB',
    # Italy
    r'\broma\b': 'IT',
    r'\brome\b': 'IT',
    r'\bmilano\b': 'IT',
    r'\bmilan\b': 'IT',
    r'\bitalian\b': 'IT',
    r'\bitaly\b': 'IT',
    r'\bfirenze\b': 'IT',
    r'\bflorence\b': 'IT',
    r'\bvenice\b': 'IT',
    r'\bvenezia\b': 'IT',
    r'\bnaples\b': 'IT',
    r'\bnapoli\b': 'IT',
    r'\bartribune\b': 'IT',
    # Denmark
    r'\baalborg\b': 'DK',
    r'\bcopenhagen\b': 'DK',
    r'\bkøbenhavn\b': 'DK',
    r'\bdanish\b': 'DK',
    r'\bdenmark\b': 'DK',
    r'\baarhus\b': 'DK',
    # USA
    r'\bwashington d\.?c\.?\b': 'US',
    r'\bnew york\b': 'US',
    r'\blos angeles\b': 'US',
    r'\bchicago\b': 'US',
    r'\bamerican\b': 'US',
    r'\bstand ?with ?us\b': 'US',
    # Indonesia
    r'\bindonesia\b': 'ID',
    r'\bjakarta\b': 'ID',
    r'\btaman safari\b': 'ID',
    # Other countries
    r'\bafrican wildlife\b': 'KE',
    r'\bkenya\b': 'KE',
    r'\bisrael\b': 'IL',
    r'\bjerusalem\b': 'IL',
    r'\btel aviv\b': 'IL',
    r'\bindia\b': 'IN',
    r'\bindian\b': 'IN',
    r'\bmumbai\b': 'IN',
    r'\bdelhi\b': 'IN',
}
# Institution type inference.
# Maps a lower-case keyword to a one-letter institution type code.
# infer_type() checks these keywords in insertion order using substring
# matching and returns the code of the first hit, so entries listed earlier
# take precedence when a name contains several keywords.
TYPE_KEYWORDS = {
    # Codes below are the same one-letter types used in KNOWN_ORGANIZATIONS.
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'academy': 'E',
    'school': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'rijks': 'O',
    'dienst': 'O',
    'stichting': 'N',
    'foundation': 'N',
    'fonds': 'N',
    'fund': 'N',
    'vereniging': 'S',
    'society': 'S',
    'association': 'S',
}
def normalize_name(name: str) -> str:
    """Return *name* lower-cased with surrounding whitespace removed."""
    lowered = name.lower()
    return lowered.strip()
# Patterns that must match on word boundaries even though they are longer
# than _SHORT_PATTERN_MAX_LEN, plus the acronyms that were listed explicitly.
SHORT_PATTERNS = {'ind', 'eo', 'ntr', 'npo', 'nos', 'coa', 'cbs', 'tno', 'nwo', 'hku', 'tue', 'oba', 'svb', 'uwv', 'dnb', 'afm', 'fries museum'}

# Patterns of at most this many characters are always matched on word
# boundaries: as plain substrings they hit unrelated words (for example
# "ita" inside "heritage" or "atria" inside "patria").
_SHORT_PATTERN_MAX_LEN = 5


def lookup_known_org(
    name: str,
    organizations: Optional[Dict[str, Tuple[str, str, str, str]]] = None,
    short_patterns: Optional[set] = None,
) -> Optional[Tuple[str, str, str, str]]:
    """Look up an organization in the known-organization table.

    Patterns are tried from longest to shortest so that a specific entry
    ("rijksmuseum van oudheden", "avrotros") is not shadowed by a generic one
    that happens to be listed earlier ("rijksmuseum", "avro"). Patterns of
    equal length keep their table order.

    Args:
        name: Organization name; it is lower-cased and stripped before matching.
        organizations: Pattern table to search. Defaults to KNOWN_ORGANIZATIONS.
        short_patterns: Patterns that must match as whole words regardless of
            their length. Defaults to SHORT_PATTERNS.

    Returns:
        The (province, city_code, type, abbreviation) tuple of the best
        matching pattern, or None when nothing matches.
    """
    table = KNOWN_ORGANIZATIONS if organizations is None else organizations
    bounded = SHORT_PATTERNS if short_patterns is None else short_patterns

    # Same normalization as normalize_name(): case-insensitive, no outer blanks.
    name_lower = name.lower().strip()
    if not name_lower:
        return None

    # sorted() is stable, so ties on length preserve the table's own order.
    for pattern in sorted(table, key=len, reverse=True):
        if pattern in bounded or len(pattern) <= _SHORT_PATTERN_MAX_LEN:
            # Acronyms and other short patterns: whole-word match only.
            matched = re.search(r'\b' + re.escape(pattern) + r'\b', name_lower) is not None
        else:
            # Longer patterns are distinctive enough for substring matching.
            matched = pattern in name_lower
        if matched:
            return table[pattern]
    return None
def detect_city(
    name: str,
    patterns: Optional[Dict[str, Tuple[str, str]]] = None,
) -> Optional[Tuple[str, str]]:
    """Detect a city mentioned in *name*.

    The first pattern (in table order) that matches wins, with one exception:
    when a later pattern matches a longer stretch of text that fully contains
    the earlier match, the longer one is preferred. This keeps "Bergen op
    Zoom" from being reported as "Bergen".

    Args:
        name: Organization name; it is lower-cased and stripped before matching.
        patterns: Regex -> (province, city_code) table. Defaults to CITY_PATTERNS.

    Returns:
        (province, city_code) of the detected city, or None if no pattern matches.
    """
    table = CITY_PATTERNS if patterns is None else patterns
    # Same normalization as normalize_name().
    name_lower = name.lower().strip()

    # Collect every hit in table order as (start, end, location).
    hits = []
    for pattern, (prov, city) in table.items():
        match = re.search(pattern, name_lower)
        if match is not None:
            hits.append((match.start(), match.end(), (prov, city)))
    if not hits:
        return None

    start, end, location = hits[0]
    for other_start, other_end, other_location in hits[1:]:
        contains_current = other_start <= start and end <= other_end
        is_longer = (other_end - other_start) > (end - start)
        if contains_current and is_longer:
            start, end, location = other_start, other_end, other_location
    return location
def detect_non_dutch(name: str) -> Optional[str]:
    """Return the country code of the first non-Dutch indicator found in *name*.

    Patterns from NON_DUTCH_PATTERNS are tried in table order against the
    normalized name; None is returned when none of them matches.
    """
    lowered = normalize_name(name)
    country_hits = (
        country_code
        for regex, country_code in NON_DUTCH_PATTERNS.items()
        if re.search(regex, lowered)
    )
    return next(country_hits, None)
def infer_type(name: str) -> str:
    """Infer the one-letter institution type from keywords in *name*.

    TYPE_KEYWORDS is scanned in table order and the code of the first keyword
    that occurs as a substring of the normalized name is returned. Names
    without any keyword are treated as museums ('M').
    """
    haystack = normalize_name(name)
    candidate_codes = (
        code for keyword, code in TYPE_KEYWORDS.items() if keyword in haystack
    )
    return next(candidate_codes, 'M')  # Default to Museum
def generate_abbreviation(name: str) -> str:
    """Build an acronym of at most 8 characters from the words in *name*.

    The name is split on whitespace, hyphens, quotes and parentheses. Dutch
    and English stop words and a few generic institution words are ignored,
    as are words that do not start with a letter. 'UNK' is returned when no
    usable word remains.
    """
    stop_words = frozenset((
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
        'the', 'a', 'an', 'of', 'for', 'and', 'or', 'at', 'in', 'on',
        'stichting', 'museum', 'archief', 'bibliotheek',
    ))
    initials = []
    for token in re.split(r'[\s\-\'\"\(\)]+', name):
        # Splitting can leave empty tokens at either end of the name.
        if not token or token.lower() in stop_words:
            continue
        first_char = token[0]
        if first_char.isalpha():
            initials.append(first_char.upper())
    acronym = ''.join(initials)
    if not acronym:
        return 'UNK'
    return acronym[:8]
def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load a YAML file and return its top-level mapping.

    Args:
        filepath: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        The parsed mapping, or None when the file cannot be read, is not
        valid UTF-8 or YAML, or its top-level value is not a mapping
        (callers rely on ``.get``).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable file or wrong encoding.
        return None
    except yaml.YAMLError:
        # Malformed YAML. Unlike a bare ``except``, KeyboardInterrupt and
        # programming errors are no longer swallowed.
        return None
    return data if isinstance(data, dict) else None
def save_yaml(filepath: Path, data: Dict):
    """Write *data* to *filepath* as UTF-8 block-style YAML.

    Key order is kept as given, non-ASCII characters are written unescaped,
    and lines are wrapped at 120 columns.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
def _append_provenance_note(data: Dict, note: str) -> None:
    """Append *note* to data['provenance']['notes'], creating both if absent."""
    provenance = data.get('provenance')
    if provenance is None:
        # Covers both a missing key and an explicit null in the YAML.
        provenance = {}
        data['provenance'] = provenance
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        # A single free-text note is promoted to a list.
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _move_record(filepath: Path, new_filepath: Path, data: Dict,
                 new_ghcid: str, note: Optional[str] = None) -> None:
    """Write *data* under its new GHCID, then delete the PENDING file."""
    data['ghcid_current'] = new_ghcid
    if note is not None:
        _append_provenance_note(data, note)
    # Save first so a failed write never loses the original record.
    save_yaml(new_filepath, data)
    filepath.unlink()


def resolve_pending_file(filepath: Path, custodian_dir: Path, dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file.

    Strategies, in order: reclassify to another country when the name carries
    a non-Dutch indicator; look the name up in the known-organization table;
    derive the location from a city mentioned in the name.

    Args:
        filepath: The NL-XX-XXX-PENDING-* YAML file to resolve.
        custodian_dir: Directory in which the renamed file is created.
        dry_run: When True nothing is written or deleted; the result only
            reports what would happen.

    Returns: (status, new_filepath)
    Status:
        'resolved'     - a GHCID was derived; new_filepath is the target.
        'reclassified' - moved to another country's PENDING name.
        'collision'    - the target already exists. new_filepath is the
                         existing target for the lookup/city strategies and
                         None for reclassification.
        'failed'       - no strategy matched; new_filepath is None.
        'error'        - the file could not be loaded or has no emic name;
                         new_filepath is None.
    """
    data = load_yaml(filepath)
    if not data or not isinstance(data, dict):
        return ('error', None)

    # 'custodian_name' may be missing, null, or not a mapping in damaged files.
    custodian_name = data.get('custodian_name')
    name = custodian_name.get('emic_name', '') if isinstance(custodian_name, dict) else ''
    if not name or not isinstance(name, str):
        return ('error', None)

    # Strategy 1: Check if non-Dutch
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        # Reclassify to different country; the record stays PENDING there.
        new_name = filepath.stem.replace('NL-XX-XXX-PENDING-', f'{new_country}-XX-XXX-PENDING-')
        new_filepath = custodian_dir / f"{new_name}.yaml"
        if new_filepath.exists():
            return ('collision', None)
        if not dry_run:
            _move_record(filepath, new_filepath, data, new_name)
        return ('reclassified', new_filepath)

    # Strategy 2: Known organization lookup
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        method = 'known org lookup'
    else:
        # Strategy 3: City name extraction
        city_info = detect_city(name)
        if not city_info:
            return ('failed', None)
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        method = 'city extraction'

    new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    if new_filepath.exists():
        return ('collision', new_filepath)
    if not dry_run:
        note = f"GHCID resolved via {method} on {datetime.now(timezone.utc).isoformat()}"
        _move_record(filepath, new_filepath, data, new_ghcid, note)
    return ('resolved', new_filepath)
def main():
    """Command-line entry point: resolve the NL PENDING files and print a summary.

    Options:
        --dry-run        Report what would change without writing or deleting.
        --limit N        Process only the first N files (0 means no limit).
        --custodian-dir  Directory that holds the custodian YAML files.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args()

    # A negative limit would silently drop files from the END of the list.
    if args.limit < 0:
        parser.error('--limit must be zero (no limit) or a positive number')
    custodian_dir = args.custodian_dir
    # Globbing a missing directory yields nothing, which would look like
    # "0 files to process" instead of a configuration mistake.
    if not custodian_dir.is_dir():
        parser.error(f'custodian directory not found: {custodian_dir}')

    print("=" * 80)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]
    print(f"Processing {len(pending_files)} files...")
    print()

    stats = {'resolved': 0, 'reclassified': 0, 'collision': 0, 'failed': 0, 'error': 0}
    for filepath in pending_files:
        # The name is read up front for the log line, because a live run
        # deletes the PENDING file once it has been resolved.
        data = load_yaml(filepath)
        if not data or not isinstance(data, dict):
            stats['error'] += 1
            continue
        custodian_name = data.get('custodian_name')
        name = custodian_name.get('emic_name', '') if isinstance(custodian_name, dict) else ''
        if not isinstance(name, str):
            name = ''

        status, new_path = resolve_pending_file(filepath, custodian_dir, args.dry_run)
        stats[status] += 1
        if status in ['resolved', 'reclassified']:
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()