#!/usr/bin/env python3
"""
Comprehensive PENDING file resolver.

Each ``NL-XX-XXX-PENDING-*.yaml`` custodian file is resolved by trying, in
this order:

1. Country re-detection for misclassified (non-Dutch) files
2. Known organization lookup table
3. City name extraction from the emic name

Files that none of the strategies can place are reported as ``failed``.
(A Wikidata lookup for the remainder is not implemented in this script.)

Usage:
    python scripts/resolve_pending_comprehensive.py --dry-run
    python scripts/resolve_pending_comprehensive.py --limit 100
    python scripts/resolve_pending_comprehensive.py
"""

import os
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List

# Known Dutch organizations with their locations
# Format: 'name pattern': ('province', 'city_code', 'type', 'abbreviation')
# Patterns are lower-case; see lookup_known_org() for the matching rules.
KNOWN_ORGANIZATIONS = {
    # Government - Ministeries
    'ministerie van buitenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van justitie en veiligheid': ('ZH', 'DHA', 'O', 'MJV'),
    'ministerie van onderwijs': ('ZH', 'DHA', 'O', 'MOC'),
    'ministerie van defensie': ('ZH', 'DHA', 'O', 'MD'),
    'ministerie van financien': ('ZH', 'DHA', 'O', 'MF'),
    'ministerie van sociale zaken': ('ZH', 'DHA', 'O', 'MSZ'),
    'ministerie van economische zaken': ('ZH', 'DHA', 'O', 'MEZ'),
    'ministerie van volksgezondheid': ('ZH', 'DHA', 'O', 'MVW'),
    'ministerie van binnenlandse zaken': ('ZH', 'DHA', 'O', 'MBZ'),
    'ministerie van infrastructuur': ('ZH', 'DHA', 'O', 'MIW'),
    'ministerie van landbouw': ('ZH', 'DHA', 'O', 'MLN'),

    # Government - Agencies
    'algemene rekenkamer': ('ZH', 'DHA', 'O', 'AR'),
    'politie nederland': ('ZH', 'DHA', 'O', 'PN'),
    'douane nederland': ('ZH', 'ROT', 'O', 'DN'),
    'kadaster': ('GE', 'APE', 'O', 'K'),
    'rijkswaterstaat': ('UT', 'UTR', 'O', 'RWS'),
    'netherlands enterprise agency': ('ZH', 'DHA', 'O', 'NEA'),
    'dienst uitvoering onderwijs': ('GR', 'GRO', 'O', 'DUO'),
    'fiod': ('ZH', 'DHA', 'O', 'FIOD'),
    'ssc-ict': ('ZH', 'DHA', 'O', 'SSC'),
    'raad voor de kinderbescherming': ('ZH', 'DHA', 'O', 'RVK'),
    'immigratie- en naturalisatiedienst': ('ZH', 'DHA', 'O', 'IND'),
    'ind': ('ZH', 'DHA', 'O', 'IND'),
    'coa': ('ZH', 'DHA', 'O', 'COA'),
    'centraal orgaan opvang asielzoekers': ('ZH', 'DHA', 'O', 'COA'),
    'sociale verzekeringsbank': ('NH', 'AME', 'O', 'SVB'),
    'uwv': ('NH', 'AMS', 'O', 'UWV'),
    'kamer van koophandel': ('UT', 'UTR', 'O', 'KVK'),
    'autoriteit persoonsgegevens': ('ZH', 'DHA', 'O', 'AP'),
    'belastingdienst': ('UT', 'UTR', 'O', 'BD'),
    'autoriteit financiele markten': ('NH', 'AMS', 'O', 'AFM'),
    'de nederlandsche bank': ('NH', 'AMS', 'O', 'DNB'),
    'cbs': ('ZH', 'DHA', 'O', 'CBS'),
    'centraal bureau voor de statistiek': ('ZH', 'DHA', 'O', 'CBS'),
    'rijksdienst voor het cultureel erfgoed': ('GE', 'AME', 'O', 'RCE'),
    'rijksdienst voor ondernemend nederland': ('ZH', 'DHA', 'O', 'RVO'),
    'raad van state': ('ZH', 'DHA', 'O', 'RVS'),
    'raad voor cultuur': ('ZH', 'DHA', 'O', 'RVC'),

    # Education - Universities
    'reinwardt academie': ('NH', 'AMS', 'E', 'RA'),
    'academie minerva': ('GR', 'GRO', 'E', 'AM'),
    'university of humanistic studies': ('UT', 'UTR', 'E', 'UHS'),
    'erasmus university': ('ZH', 'ROT', 'E', 'EUR'),
    'erasmus universiteit': ('ZH', 'ROT', 'E', 'EUR'),
    'universiteit van amsterdam': ('NH', 'AMS', 'E', 'UVA'),
    'vrije universiteit amsterdam': ('NH', 'AMS', 'E', 'VU'),
    'universiteit leiden': ('ZH', 'LEI', 'E', 'UL'),
    'leiden university': ('ZH', 'LEI', 'E', 'UL'),
    'universiteit utrecht': ('UT', 'UTR', 'E', 'UU'),
    'utrecht university': ('UT', 'UTR', 'E', 'UU'),
    'rijksuniversiteit groningen': ('GR', 'GRO', 'E', 'RUG'),
    'university of groningen': ('GR', 'GRO', 'E', 'RUG'),
    'technische universiteit delft': ('ZH', 'DEL', 'E', 'TUD'),
    'tu delft': ('ZH', 'DEL', 'E', 'TUD'),
    'delft university': ('ZH', 'DEL', 'E', 'TUD'),
    'technische universiteit eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tu eindhoven': ('NB', 'EIN', 'E', 'TUE'),
    'tue': ('NB', 'EIN', 'E', 'TUE'),
    'wageningen university': ('GE', 'WAG', 'E', 'WUR'),
    'wageningen universiteit': ('GE', 'WAG', 'E', 'WUR'),
    'radboud university': ('GE', 'NIJ', 'E', 'RU'),
    'radboud universiteit': ('GE', 'NIJ', 'E', 'RU'),
    'tilburg university': ('NB', 'TIL', 'E', 'TIU'),
    'universiteit tilburg': ('NB', 'TIL', 'E', 'TIU'),
    'maastricht university': ('LI', 'MAA', 'E', 'UM'),
    'universiteit maastricht': ('LI', 'MAA', 'E', 'UM'),
    'open universiteit': ('LI', 'HEE', 'E', 'OU'),
    'nyenrode': ('NH', 'BRE', 'E', 'NYE'),
    'royal academy of art': ('ZH', 'DHA', 'E', 'KABK'),
    'koninklijke academie van beeldende kunsten': ('ZH', 'DHA', 'E', 'KABK'),

    # Education - Hogescholen
    'hogeschool van amsterdam': ('NH', 'AMS', 'E', 'HVA'),
    'hogeschool rotterdam': ('ZH', 'ROT', 'E', 'HR'),
    'hogeschool utrecht': ('UT', 'UTR', 'E', 'HU'),
    'hogeschool inholland': ('NH', 'AMS', 'E', 'INH'),
    'inholland': ('NH', 'AMS', 'E', 'INH'),
    'hogeschool leiden': ('ZH', 'LEI', 'E', 'HL'),
    'haagse hogeschool': ('ZH', 'DHA', 'E', 'HH'),
    'saxion': ('OV', 'ENS', 'E', 'SAX'),
    'avans': ('NB', 'BRE', 'E', 'AVA'),
    'fontys': ('NB', 'EIN', 'E', 'FON'),
    'zuyd hogeschool': ('LI', 'MAA', 'E', 'ZH'),
    'hanzehogeschool': ('GR', 'GRO', 'E', 'HAN'),
    'hku': ('UT', 'UTR', 'E', 'HKU'),
    'design academy eindhoven': ('NB', 'EIN', 'E', 'DAE'),
    'gerrit rietveld academie': ('NH', 'AMS', 'E', 'GRA'),
    'koninklijk conservatorium': ('ZH', 'DHA', 'E', 'KC'),
    'conservatorium van amsterdam': ('NH', 'AMS', 'E', 'CVA'),
    'codarts': ('ZH', 'ROT', 'E', 'COD'),
    'artez': ('GE', 'ARN', 'E', 'ARZ'),

    # Museums - Amsterdam
    'anne frank stichting': ('NH', 'AMS', 'M', 'AFS'),
    'allard pierson': ('NH', 'AMS', 'M', 'AP'),
    'van gogh museum': ('NH', 'AMS', 'M', 'VGM'),
    'tropenmuseum': ('NH', 'AMS', 'M', 'TM'),
    'joods historisch museum': ('NH', 'AMS', 'M', 'JHM'),
    'koninklijk paleis amsterdam': ('NH', 'AMS', 'M', 'KPA'),
    'stedelijk museum amsterdam': ('NH', 'AMS', 'M', 'SMA'),
    'rijksmuseum': ('NH', 'AMS', 'M', 'RM'),
    'ons lieve heer op solder': ('NH', 'AMS', 'M', 'OLHOS'),
    'rembrandthuis': ('NH', 'AMS', 'M', 'RH'),
    'amsterdam museum': ('NH', 'AMS', 'M', 'AM'),
    'artis': ('NH', 'AMS', 'M', 'ART'),
    'nemo science museum': ('NH', 'AMS', 'M', 'NEMO'),
    'eye filmmuseum': ('NH', 'AMS', 'M', 'EYE'),
    'moco museum': ('NH', 'AMS', 'M', 'MOCO'),
    'hermitage amsterdam': ('NH', 'AMS', 'M', 'HA'),
    'tassenmuseum hendrikje': ('NH', 'AMS', 'M', 'TMH'),
    'willet-holthuysen': ('NH', 'AMS', 'M', 'WH'),
    'geelvinck hinlopen huis': ('NH', 'AMS', 'M', 'GHH'),
    'hortus botanicus': ('NH', 'AMS', 'M', 'HB'),
    'multatuli museum': ('NH', 'AMS', 'M', 'MM'),
    'beurs van berlage': ('NH', 'AMS', 'M', 'BVB'),
    'de brakke grond': ('NH', 'AMS', 'M', 'DBG'),

    # Museums - Den Haag
    'mauritshuis': ('ZH', 'DHA', 'M', 'MH'),
    'gemeentemuseum': ('ZH', 'DHA', 'M', 'GMH'),
    'kunstmuseum den haag': ('ZH', 'DHA', 'M', 'KDH'),
    'escher in het paleis': ('ZH', 'DHA', 'M', 'EHP'),
    'museon': ('ZH', 'DHA', 'M', 'MUS'),
    'omniversum': ('ZH', 'DHA', 'M', 'OMN'),
    'louwman museum': ('ZH', 'DHA', 'M', 'LM'),
    'museum de gevangenpoort': ('ZH', 'DHA', 'M', 'MDG'),
    'museum meermanno': ('ZH', 'DHA', 'M', 'MMO'),
    'bredius museum': ('ZH', 'DHA', 'M', 'BM'),
    'panorama mesdag': ('ZH', 'DHA', 'M', 'PM'),
    'madurodam': ('ZH', 'DHA', 'M', 'MAD'),
    'haags historisch museum': ('ZH', 'DHA', 'M', 'HHM'),
    'beelden aan zee': ('ZH', 'DHA', 'M', 'BAZ'),
    'museum het paleis': ('ZH', 'DHA', 'M', 'MHP'),

    # Museums - Rotterdam
    'museum boijmans van beuningen': ('ZH', 'ROT', 'M', 'MBVB'),
    'boijmans': ('ZH', 'ROT', 'M', 'MBVB'),
    'maritiem museum': ('ZH', 'ROT', 'M', 'MM'),
    'het nieuwe instituut': ('ZH', 'ROT', 'M', 'HNI'),
    'chabot museum': ('ZH', 'ROT', 'M', 'CM'),
    'kunsthal rotterdam': ('ZH', 'ROT', 'M', 'KR'),
    'wereldmuseum': ('ZH', 'ROT', 'M', 'WM'),
    'museum rotterdam': ('ZH', 'ROT', 'M', 'MR'),
    'fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'nederlands fotomuseum': ('ZH', 'ROT', 'M', 'NFM'),
    'ss rotterdam': ('ZH', 'ROT', 'M', 'SSR'),
    'fenixloods': ('ZH', 'ROT', 'M', 'FL'),

    # Museums - Other cities
    'airborne museum': ('GE', 'ARN', 'M', 'ABM'),
    'kroller muller museum': ('GE', 'OTT', 'M', 'KMM'),
    'naturalis': ('ZH', 'LEI', 'M', 'NAT'),
    'museum catharijneconvent': ('UT', 'UTR', 'M', 'MC'),
    'centraal museum': ('UT', 'UTR', 'M', 'CMU'),
    'spoorwegmuseum': ('UT', 'UTR', 'M', 'SPW'),
    'het utrechts archief': ('UT', 'UTR', 'A', 'HUA'),
    'museum speelklok': ('UT', 'UTR', 'M', 'MS'),
    'museum van oudheden': ('ZH', 'LEI', 'M', 'MVO'),
    'molenmuseum de valk': ('ZH', 'LEI', 'M', 'MDV'),
    'stedelijk museum schiedam': ('ZH', 'SCH', 'M', 'SMS'),
    'bonnefantenmuseum': ('LI', 'MAA', 'M', 'BFM'),
    'marres': ('LI', 'MAA', 'M', 'MAR'),
    'museum aan het vrijthof': ('LI', 'MAA', 'M', 'MAV'),
    'drents museum': ('DR', 'ASS', 'M', 'DM'),
    'groninger museum': ('GR', 'GRO', 'M', 'GM'),
    'fries museum': ('FR', 'LEE', 'M', 'FM'),  # Requires word boundary (in SHORT_PATTERNS)
    'westfries museum': ('NH', 'HOO', 'M', 'WFM'),  # In Hoorn
    'museum belvédère': ('FR', 'ORN', 'M', 'MB'),
    'princessehof': ('FR', 'LEE', 'M', 'PH'),
    'zuiderzeemuseum': ('NH', 'ENK', 'M', 'ZZM'),
    'rijksmuseum muiderslot': ('NH', 'MUI', 'M', 'RMM'),
    'teylers museum': ('NH', 'HAA', 'M', 'TYM'),
    'frans hals museum': ('NH', 'HAA', 'M', 'FHM'),
    'museum de fundatie': ('OV', 'ZWO', 'M', 'MDF'),
    'museum twentse welle': ('OV', 'ENS', 'M', 'MTW'),
    'rijksmuseum van oudheden': ('ZH', 'LEI', 'M', 'RMO'),
    'museum volkenkunde': ('ZH', 'LEI', 'M', 'MVK'),
    'museon-omniversum': ('ZH', 'DHA', 'M', 'MO'),
    'literatuurmuseum': ('ZH', 'DHA', 'M', 'LM'),
    'kinderboekenmuseum': ('ZH', 'DHA', 'M', 'KBM'),
    'van abbemuseum': ('NB', 'EIN', 'M', 'VAM'),
    'philips museum': ('NB', 'EIN', 'M', 'PHM'),
    'textielmuseum': ('NB', 'TIL', 'M', 'TXM'),
    'de pont': ('NB', 'TIL', 'M', 'DP'),
    'noordbrabants museum': ('NB', 'DBO', 'M', 'NBM'),
    'musis sacrum': ('GE', 'ARN', 'M', 'MUS'),
    'museum arnhem': ('GE', 'ARN', 'M', 'MA'),
    'afrika museum': ('GE', 'BER', 'M', 'AFM'),
    'museum het valkhof': ('GE', 'NIJ', 'M', 'MHV'),
    'hunebedcentrum': ('DR', 'BOR', 'M', 'HC'),
    'museum drachten': ('FR', 'DRA', 'M', 'MDR'),
    'openluchtmuseum': ('GE', 'ARN', 'M', 'OLM'),
    'nederlands openluchtmuseum': ('GE', 'ARN', 'M', 'NOLM'),

    # Archives
    'stadsarchief amsterdam': ('NH', 'AMS', 'A', 'SAA'),
    'nationaal archief': ('ZH', 'DHA', 'A', 'NA'),
    # NOTE(review): this entry carries 'BE' in the province slot, so a match
    # yields the id "NL-BE-GEN-A-AMS" - confirm whether that is intended.
    'amsab': ('BE', 'GEN', 'A', 'AMS'),  # Belgian
    'noord-hollands archief': ('NH', 'HAA', 'A', 'NHA'),
    'brabants historisch informatie centrum': ('NB', 'DBO', 'A', 'BHIC'),
    'gelders archief': ('GE', 'ARN', 'A', 'GA'),
    'zeeuws archief': ('ZE', 'MID', 'A', 'ZA'),
    'tresoar': ('FR', 'LEE', 'A', 'TRE'),
    'drents archief': ('DR', 'ASS', 'A', 'DA'),
    'groninger archieven': ('GR', 'GRO', 'A', 'GRA'),
    'historisch centrum overijssel': ('OV', 'ZWO', 'A', 'HCO'),
    'regionaal archief tilburg': ('NB', 'TIL', 'A', 'RAT'),
    'erfgoed brabant': ('NB', 'TIL', 'A', 'EB'),
    'stadsarchief rotterdam': ('ZH', 'ROT', 'A', 'SAR'),
    'stadsarchief delft': ('ZH', 'DEL', 'A', 'SAD'),
    'regionaal historisch centrum limburg': ('LI', 'MAA', 'A', 'RHCL'),

    # Libraries
    'koninklijke bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'nationale bibliotheek': ('ZH', 'DHA', 'L', 'KB'),
    'openbare bibliotheek amsterdam': ('NH', 'AMS', 'L', 'OBA'),
    'oba': ('NH', 'AMS', 'L', 'OBA'),
    'universiteitbibliotheek': ('NH', 'AMS', 'L', 'UBA'),
    'atria': ('NH', 'AMS', 'L', 'ATR'),
    'bibliotheek rotterdam': ('ZH', 'ROT', 'L', 'BR'),
    'bibliotheek den haag': ('ZH', 'DHA', 'L', 'BDH'),
    'bibliotheek utrecht': ('UT', 'UTR', 'L', 'BU'),

    # Research
    'african studies centre leiden': ('ZH', 'LEI', 'R', 'ASCL'),
    'niod': ('NH', 'AMS', 'R', 'NIOD'),
    'knaw': ('NH', 'AMS', 'R', 'KNAW'),
    'koninklijke nederlandse akademie van wetenschappen': ('NH', 'AMS', 'R', 'KNAW'),
    'nwo': ('ZH', 'DHA', 'R', 'NWO'),
    'rivm': ('UT', 'BIL', 'R', 'RIVM'),
    'tno': ('ZH', 'DHA', 'R', 'TNO'),
    'meertens instituut': ('NH', 'AMS', 'R', 'MI'),
    'huygens instituut': ('NH', 'AMS', 'R', 'HI'),
    'internationaal instituut voor sociale geschiedenis': ('NH', 'AMS', 'R', 'IISG'),
    'iisg': ('NH', 'AMS', 'R', 'IISG'),
    'rathenau instituut': ('ZH', 'DHA', 'R', 'RAT'),
    'planbureau voor de leefomgeving': ('ZH', 'DHA', 'R', 'PBL'),
    'sociaal en cultureel planbureau': ('ZH', 'DHA', 'R', 'SCP'),
    'cpb': ('ZH', 'DHA', 'R', 'CPB'),
    'centraal planbureau': ('ZH', 'DHA', 'R', 'CPB'),
    'knmi': ('UT', 'DEV', 'R', 'KNMI'),
    'nivel': ('UT', 'UTR', 'R', 'NIV'),
    'deltaresearch': ('ZH', 'DEL', 'R', 'DEL'),
    'deltares': ('ZH', 'DEL', 'R', 'DEL'),
    'nidi': ('ZH', 'DHA', 'R', 'NIDI'),
    'dans': ('ZH', 'DHA', 'R', 'DANS'),
    'surf': ('UT', 'UTR', 'R', 'SURF'),

    # NGOs/Foundations
    'amsterdams fonds voor de kunst': ('NH', 'AMS', 'N', 'AFK'),
    'mondriaan fonds': ('NH', 'AMS', 'N', 'MF'),
    'stimuleringsfonds': ('ZH', 'ROT', 'N', 'SF'),
    'fonds voor cultuurparticipatie': ('UT', 'UTR', 'N', 'FCP'),
    'fonds podiumkunsten': ('ZH', 'DHA', 'N', 'FPK'),
    'letterenfonds': ('NH', 'AMS', 'N', 'LF'),
    'filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'nederlands filmfonds': ('NH', 'AMS', 'N', 'NFF'),
    'bng cultuurfonds': ('ZH', 'DHA', 'N', 'BNG'),
    'prins bernhard cultuurfonds': ('NH', 'AMS', 'N', 'PBC'),
    'vsc': ('NH', 'AMS', 'N', 'VSC'),
    'cultuur + ondernemen': ('NH', 'AMS', 'N', 'CO'),
    'erfgoedvereniging heemschut': ('NH', 'AMS', 'N', 'EH'),
    'heemschut': ('NH', 'AMS', 'N', 'EH'),
    'boekmanstichting': ('NH', 'AMS', 'N', 'BS'),
    'lira': ('NH', 'AMS', 'N', 'LIRA'),
    'pictoright': ('NH', 'AMS', 'N', 'PR'),
    'buma stemra': ('NH', 'AMS', 'N', 'BS'),
    'senafonds': ('NH', 'AMS', 'N', 'SEN'),

    # Performing Arts
    'nederlands dans theater': ('ZH', 'DHA', 'M', 'NDT'),
    'ndt': ('ZH', 'DHA', 'M', 'NDT'),
    'het nationale ballet': ('NH', 'AMS', 'M', 'HNB'),
    'nationale opera': ('NH', 'AMS', 'M', 'DNO'),
    'de nationale opera & ballet': ('NH', 'AMS', 'M', 'NOB'),
    'concertgebouw': ('NH', 'AMS', 'M', 'CG'),
    'koninklijk concertgebouworkest': ('NH', 'AMS', 'M', 'KCO'),
    'residentie orkest': ('ZH', 'DHA', 'M', 'RO'),
    'rotterdams philharmonisch': ('ZH', 'ROT', 'M', 'RPO'),
    'nederlands kamerorkest': ('NH', 'AMS', 'M', 'NKO'),
    'holland festival': ('NH', 'AMS', 'M', 'HF'),
    'internationaal theater amsterdam': ('NH', 'AMS', 'M', 'ITA'),
    'ita': ('NH', 'AMS', 'M', 'ITA'),
    'stadsschouwburg': ('NH', 'AMS', 'M', 'SSB'),
    'theater carré': ('NH', 'AMS', 'M', 'TC'),
    'de la mar theater': ('NH', 'AMS', 'M', 'DLM'),
    'schouwburg': ('NH', 'AMS', 'M', 'SCH'),
    'muziekgebouw aan t ij': ('NH', 'AMS', 'M', 'MATI'),
    'bimhuis': ('NH', 'AMS', 'M', 'BH'),
    'paradiso': ('NH', 'AMS', 'M', 'PAR'),
    'melkweg': ('NH', 'AMS', 'M', 'MW'),
    'doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'de doelen': ('ZH', 'ROT', 'M', 'DOE'),
    'ahoy': ('ZH', 'ROT', 'M', 'AH'),
    'tivoli vredenburg': ('UT', 'UTR', 'M', 'TV'),
    'theater aan het spui': ('ZH', 'DHA', 'M', 'TAS'),
    'zuiderstrandtheater': ('ZH', 'DHA', 'M', 'ZST'),
    'lucent danstheater': ('ZH', 'DHA', 'M', 'LDT'),
    'chassé theater': ('NB', 'BRE', 'M', 'CT'),
    'parktheater': ('NB', 'EIN', 'M', 'PT'),

    # Media/Broadcasting
    'npo': ('NH', 'HIL', 'M', 'NPO'),
    'nos': ('NH', 'HIL', 'M', 'NOS'),
    'ntr': ('NH', 'HIL', 'M', 'NTR'),
    'avro': ('NH', 'HIL', 'M', 'AVRO'),
    'avrotros': ('NH', 'HIL', 'M', 'AT'),
    'vara': ('NH', 'HIL', 'M', 'VARA'),
    'bnnvara': ('NH', 'HIL', 'M', 'BV'),
    'eo': ('NH', 'HIL', 'M', 'EO'),
    'kro': ('NH', 'HIL', 'M', 'KRO'),
    'kro-ncrv': ('NH', 'HIL', 'M', 'KN'),
    'vpro': ('NH', 'HIL', 'M', 'VPRO'),
    'max': ('NH', 'HIL', 'M', 'MAX'),
    'omroep max': ('NH', 'HIL', 'M', 'MAX'),
    'beeld en geluid': ('NH', 'HIL', 'M', 'BEG'),
    'beelden en geluid': ('NH', 'HIL', 'M', 'BEG'),

    # Religious/Holy Sites
    'protestantse kerk in nederland': ('UT', 'UTR', 'H', 'PKN'),
    'pkn': ('UT', 'UTR', 'H', 'PKN'),
    'bisdom utrecht': ('UT', 'UTR', 'H', 'BU'),
    'aartsbisdom utrecht': ('UT', 'UTR', 'H', 'ABU'),
    'bisdom haarlem': ('NH', 'HAA', 'H', 'BH'),
    'bisdom rotterdam': ('ZH', 'ROT', 'H', 'BR'),
    'bisdom breda': ('NB', 'BRE', 'H', 'BB'),
    'bisdom den bosch': ('NB', 'DBO', 'H', 'BDB'),

    # Provincial/Regional
    'rijnbrink': ('GE', 'ARN', 'N', 'RB'),
    'erfgoed zeeland': ('ZE', 'MID', 'N', 'EZ'),
    # NOTE: duplicate key - this value replaces the 'A' (archive) entry for
    # 'erfgoed brabant' listed under Archives above.
    'erfgoed brabant': ('NB', 'TIL', 'N', 'EB'),
    'erfgoed gelderland': ('GE', 'ARN', 'N', 'EG'),
    'erfgoed overijssel': ('OV', 'ZWO', 'N', 'EO'),
    'monumentenwacht': ('NH', 'AMS', 'N', 'MW'),
    'erfgoedcentrum': ('UT', 'UTR', 'N', 'EC'),
}

# Additional city patterns to detect (Dutch cities)
# Format: regex (applied to the lower-cased name): ('province', 'city_code')
# Table order is the priority order when a name mentions several cities.
CITY_PATTERNS = {
    # Major cities
    r'\bamsterdam\b': ('NH', 'AMS'),
    r'\brotterdam\b': ('ZH', 'ROT'),
    r'\bden haag\b': ('ZH', 'DHA'),
    r'\bthe hague\b': ('ZH', 'DHA'),
    # A leading \b can never match in front of an apostrophe at the start of
    # a word, so these names use a "no word character before" lookbehind.
    r"(?<!\w)['’]s-gravenhage\b": ('ZH', 'DHA'),
    r'\butrecht\b': ('UT', 'UTR'),
    r'\beindhoven\b': ('NB', 'EIN'),
    r'\bgroningen\b': ('GR', 'GRO'),

    # Zuid-Holland
    r'\bleiden\b': ('ZH', 'LEI'),
    r'\bdelft\b': ('ZH', 'DEL'),
    r'\bdordrecht\b': ('ZH', 'DOR'),
    r'\bgouda\b': ('ZH', 'GOU'),
    r'\bschiedam\b': ('ZH', 'SCH'),
    r'\bzoetermeer\b': ('ZH', 'ZOE'),
    r'\bwestland\b': ('ZH', 'WES'),
    r'\balphen aan den rijn\b': ('ZH', 'ALP'),
    r'\bvlaardingen\b': ('ZH', 'VLA'),
    r'\bcapelle\b': ('ZH', 'CAP'),
    r'\bvoorburg\b': ('ZH', 'VOO'),
    r'\brijswijk\b': ('ZH', 'RIJ'),

    # Noord-Holland
    r'\bhaarlem\b': ('NH', 'HAA'),
    r'\balkmaar\b': ('NH', 'ALK'),
    r'\bhilversum\b': ('NH', 'HIL'),
    r'\bzaandam\b': ('NH', 'ZAA'),
    r'\bzaanstad\b': ('NH', 'ZAA'),
    r'\bhoorn\b': ('NH', 'HOO'),
    r'\benkhuizen\b': ('NH', 'ENK'),
    r'\bedam\b': ('NH', 'EDA'),
    r'\bvolendam\b': ('NH', 'VOL'),
    r'\bhaarlemmermeer\b': ('NH', 'HLM'),
    r'\bpurmerend\b': ('NH', 'PUR'),
    r'\bmuiden\b': ('NH', 'MUI'),
    r'\bnaarden\b': ('NH', 'NAA'),
    r'\bbussum\b': ('NH', 'BUS'),
    r'\bbloemendaal\b': ('NH', 'BLO'),
    r'\bheemstede\b': ('NH', 'HEE'),
    r'\blaren\b': ('NH', 'LAR'),
    r'\bbergen\b': ('NH', 'BER'),

    # Gelderland
    r'\barnhem\b': ('GE', 'ARN'),
    r'\bnijmegen\b': ('GE', 'NIJ'),
    r'\bapeldoorn\b': ('GE', 'APE'),
    r'\bede\b': ('GE', 'EDE'),
    r'\bwageningen\b': ('GE', 'WAG'),
    r'\bhattem\b': ('GE', 'HAT'),
    r'\belburg\b': ('GE', 'ELB'),
    r'\bharderwijk\b': ('GE', 'HAR'),
    r'\bdoetinchem\b': ('GE', 'DOE'),
    r'\bzutphen\b': ('GE', 'ZUT'),
    r'\bzevenaar\b': ('GE', 'ZEV'),
    r'\btiel\b': ('GE', 'TIE'),
    r'\botterlo\b': ('GE', 'OTT'),
    r'\bburen\b': ('GE', 'BUR'),
    r'\bbarneveld\b': ('GE', 'BAR'),
    r'\bepe\b': ('GE', 'EPE'),
    r'\beerde\b': ('GE', 'EER'),
    r'\bberkum\b': ('GE', 'BRK'),

    # Noord-Brabant
    r'\btilburg\b': ('NB', 'TIL'),
    r'\bbreda\b': ('NB', 'BRE'),
    r"(?<!\w)['’]s-hertogenbosch\b": ('NB', 'DBO'),
    r'\bden bosch\b': ('NB', 'DBO'),
    r'\bhelmond\b': ('NB', 'HEL'),
    r'\boss\b': ('NB', 'OSS'),
    r'\broosendaal\b': ('NB', 'ROO'),
    r'\bbergen op zoom\b': ('NB', 'BOZ'),
    r'\bvught\b': ('NB', 'VUG'),
    r'\bwaalwijk\b': ('NB', 'WAA'),
    r'\bboxtel\b': ('NB', 'BOX'),
    r'\bveldhoven\b': ('NB', 'VEL'),
    r'\bbest\b': ('NB', 'BST'),
    r'\bgeertruidenberg\b': ('NB', 'GEE'),
    r'\bheusden\b': ('NB', 'HEU'),

    # Limburg
    r'\bmaastricht\b': ('LI', 'MAA'),
    r'\bvenlo\b': ('LI', 'VEN'),
    r'\broermond\b': ('LI', 'ROE'),
    r'\bheerlen\b': ('LI', 'HEE'),
    r'\bsittard\b': ('LI', 'SIT'),
    r'\bgeleen\b': ('LI', 'GEL'),
    r'\bweert\b': ('LI', 'WEE'),
    r'\bvalkenburg\b': ('LI', 'VAL'),
    r'\bkerkrade\b': ('LI', 'KER'),
    r'\bbrunssum\b': ('LI', 'BRU'),

    # Overijssel
    r'\bzwolle\b': ('OV', 'ZWO'),
    r'\bdeventer\b': ('OV', 'DEV'),
    r'\bkampen\b': ('OV', 'KAM'),
    r'\benschede\b': ('OV', 'ENS'),
    r'\bhengelo\b': ('OV', 'HEN'),
    r'\balmelo\b': ('OV', 'ALM'),
    r'\boldenzaal\b': ('OV', 'OLD'),
    r'\bsteenwijk\b': ('OV', 'STE'),
    r'\bhasselt\b': ('OV', 'HAS'),
    r'\bgiethoorn\b': ('OV', 'GIE'),
    r'\braalte\b': ('OV', 'RAA'),
    r'\bijsselmuiden\b': ('OV', 'IJS'),

    # Friesland
    r'\bleeuwarden\b': ('FR', 'LEE'),
    r'\bljouwert\b': ('FR', 'LEE'),
    r'\bdrachten\b': ('FR', 'DRA'),
    r'\bheerenveen\b': ('FR', 'HVE'),
    r'\bsneek\b': ('FR', 'SNE'),
    r'\bfraneker\b': ('FR', 'FRA'),
    r'\bharlingen\b': ('FR', 'HAR'),
    r'\bbolsward\b': ('FR', 'BOL'),
    r'\bworkum\b': ('FR', 'WOR'),
    r'\bterschelling\b': ('FR', 'TER'),
    r'\bameland\b': ('FR', 'AME'),
    r'\boranjewoud\b': ('FR', 'ORN'),

    # Drenthe
    r'\bassen\b': ('DR', 'ASS'),
    r'\bemmen\b': ('DR', 'EMM'),
    r'\bmeppel\b': ('DR', 'MEP'),
    r'\bhoogeveen\b': ('DR', 'HOO'),
    r'\bcoevorden\b': ('DR', 'COE'),
    r'\bborger\b': ('DR', 'BOR'),

    # Groningen (the city itself is listed under "Major cities")
    r'\bveendam\b': ('GR', 'VEE'),
    r'\bwinschoten\b': ('GR', 'WIN'),
    r'\bdelfzijl\b': ('GR', 'DEL'),
    r'\bappingedam\b': ('GR', 'APP'),
    r'\bhoogezand\b': ('GR', 'HOO'),

    # Zeeland
    r'\bmiddelburg\b': ('ZE', 'MID'),
    r'\bvlissingen\b': ('ZE', 'VLI'),
    r'\bgoes\b': ('ZE', 'GOE'),
    r'\bterneuzen\b': ('ZE', 'TER'),
    r'\bzierikzee\b': ('ZE', 'ZIE'),
    r'\bveere\b': ('ZE', 'VEE'),
    r'\bwestkapelle\b': ('ZE', 'WKA'),

    # Flevoland
    r'\balmere\b': ('FL', 'ALM'),
    r'\blelystad\b': ('FL', 'LEL'),
    r'\bdronten\b': ('FL', 'DRO'),
    r'\burk\b': ('FL', 'URK'),
    r'\bzeewolde\b': ('FL', 'ZEE'),
}

# Non-Dutch indicators - files with these should be reclassified
# Format: regex (applied to the lower-cased name): country code
NON_DUTCH_PATTERNS = {
    # Saudi Arabia / Arabic
    r'\bsaudi\b': 'SA',
    r'\bمشاريع\b': 'SA',
    r'\bوزارة\b': 'SA',
    r'\bالسعودية\b': 'SA',

    # France
    r'\bsainte-m[eè]re\b': 'FR',
    r'\bnouvelle-aquitaine\b': 'FR',
    r'\bparis\b': 'FR',
    r'\bfrance\b': 'FR',
    r'\bfran[çc]ais\b': 'FR',
    r'\bbnf\b': 'FR',
    r'\bircam\b': 'FR',
    r'\bfondation du patrimoine\b': 'FR',
    r'\bministère\b': 'FR',
    r'\blyon\b': 'FR',
    r'\bmarseille\b': 'FR',
    r'\bbordeaux\b': 'FR',
    r'\bnantes\b': 'FR',
    r'\bstrasbourg\b': 'FR',
    r'\blille\b': 'FR',
    r'\btoulouse\b': 'FR',
    r'\bnice\b': 'FR',

    # Germany
    r'\bberlin\b': 'DE',
    r'\bweimar\b': 'DE',
    r'\bstiftung\b': 'DE',
    r'\bpreu[ßs]ischer\b': 'DE',
    r'\bmunich\b': 'DE',
    r'\bm[üu]nchen\b': 'DE',
    r'\bfrankfurt\b': 'DE',
    r'\bhamburg\b': 'DE',
    r'\bk[öo]ln\b': 'DE',
    r'\bd[üu]sseldorf\b': 'DE',
    r'\bstuttgart\b': 'DE',
    r'\bheidelberg\b': 'DE',
    r'\bdresden\b': 'DE',
    r'\bleipzig\b': 'DE',
    r'\bgerman\b': 'DE',
    r'\bgermany\b': 'DE',
    r'\bzentral\b': 'DE',
    r'\bzentrum\b': 'DE',
    r'\bkunstgeschichte\b': 'DE',
    r'\bteilhabe\b': 'DE',
    r'\bkulturelle\b': 'DE',

    # Belgium
    r'\bbelgium\b': 'BE',
    r'\bgent\b': 'BE',
    r'\bantwerp\b': 'BE',
    r'\bghent\b': 'BE',
    r'\bbrussels\b': 'BE',
    r'\bbrussel\b': 'BE',
    r'\bbruxelles\b': 'BE',
    r'\bleuven\b': 'BE',
    r'\bliege\b': 'BE',
    r'\bbelgique\b': 'BE',
    r'\bflemish\b': 'BE',
    r'\bvlaams\b': 'BE',

    # United Kingdom
    r'\bbritish\b': 'GB',
    r'\blondon\b': 'GB',
    r'\benglish\b': 'GB',
    r'\bengland\b': 'GB',
    r'\bscotland\b': 'GB',
    r'\bwales\b': 'GB',
    r'\bmanchester\b': 'GB',
    r'\bbirmingham\b': 'GB',
    r'\bedinburgh\b': 'GB',
    r'\bcardiff\b': 'GB',
    r'\boxford\b': 'GB',
    r'\bcambridge\b': 'GB',
    r'\bdurham\b': 'GB',
    r'\broyal armouries\b': 'GB',
    r'\broyal parks\b': 'GB',
    r'\bthe british academy\b': 'GB',

    # Italy
    r'\broma\b': 'IT',
    r'\brome\b': 'IT',
    r'\bmilano\b': 'IT',
    r'\bmilan\b': 'IT',
    r'\bitalian\b': 'IT',
    r'\bitaly\b': 'IT',
    r'\bfirenze\b': 'IT',
    r'\bflorence\b': 'IT',
    r'\bvenice\b': 'IT',
    r'\bvenezia\b': 'IT',
    r'\bnaples\b': 'IT',
    r'\bnapoli\b': 'IT',
    r'\bartribune\b': 'IT',

    # Denmark
    r'\baalborg\b': 'DK',
    r'\bcopenhagen\b': 'DK',
    r'\bkøbenhavn\b': 'DK',
    r'\bdanish\b': 'DK',
    r'\bdenmark\b': 'DK',
    r'\baarhus\b': 'DK',

    # USA
    r'\bwashington d\.?c\.?\b': 'US',
    r'\bnew york\b': 'US',
    r'\blos angeles\b': 'US',
    r'\bchicago\b': 'US',
    r'\bamerican\b': 'US',
    r'\bstand ?with ?us\b': 'US',

    # Indonesia
    r'\bindonesia\b': 'ID',
    r'\bjakarta\b': 'ID',
    r'\btaman safari\b': 'ID',

    # Other countries
    r'\bafrican wildlife\b': 'KE',
    r'\bkenya\b': 'KE',
    r'\bisrael\b': 'IL',
    r'\bjerusalem\b': 'IL',
    r'\btel aviv\b': 'IL',
    r'\bindia\b': 'IN',
    r'\bindian\b': 'IN',
    r'\bmumbai\b': 'IN',
    r'\bdelhi\b': 'IN',
}

# Institution type inference
# Checked in table order as plain substrings; first hit wins.
TYPE_KEYWORDS = {
    'museum': 'M',
    'musea': 'M',
    'archief': 'A',
    'archive': 'A',
    'bibliotheek': 'L',
    'library': 'L',
    'universiteit': 'E',
    'university': 'E',
    'hogeschool': 'E',
    'academie': 'E',
    'academy': 'E',
    'school': 'E',
    'ministerie': 'O',
    'ministry': 'O',
    'gemeente': 'O',
    'politie': 'O',
    'rijks': 'O',
    'dienst': 'O',
    'stichting': 'N',
    'foundation': 'N',
    'fonds': 'N',
    'fund': 'N',
    'vereniging': 'S',
    'society': 'S',
    'association': 'S',
}


def normalize_name(name: str) -> str:
    """Normalize name for matching (lower-case, surrounding whitespace removed)."""
    return name.lower().strip()


# Short patterns that need word boundary matching
SHORT_PATTERNS = {'ind', 'eo', 'ntr', 'npo', 'nos', 'coa', 'cbs', 'tno', 'nwo',
                  'hku', 'tue', 'oba', 'svb', 'uwv', 'dnb', 'afm', 'fries museum'}

# Any pattern this short is treated as an acronym and matched on word
# boundaries even if it was not listed in SHORT_PATTERNS; as a plain substring
# 'ita' would match "digitaal" and 'max' would match "maxima".
_ACRONYM_MAX_LEN = 3


def _needs_word_boundary(pattern: str) -> bool:
    """Return True when *pattern* must match as a whole word, not a substring."""
    return pattern in SHORT_PATTERNS or len(pattern) <= _ACRONYM_MAX_LEN


def lookup_known_org(name: str) -> Optional[Tuple[str, str, str, str]]:
    """Look up organization in known list.

    Every pattern in KNOWN_ORGANIZATIONS is tested against the normalized
    name; the longest matching pattern wins (ties go to the earlier table
    entry). Preferring the most specific pattern keeps generic entries such
    as 'rijksmuseum' from shadowing 'rijksmuseum van oudheden'.

    Returns:
        (province, city_code, type, abbreviation) or None when nothing matches.
    """
    name_lower = normalize_name(name)
    best_info = None
    best_len = 0
    for pattern, info in KNOWN_ORGANIZATIONS.items():
        if len(pattern) <= best_len:
            # Cannot beat the current best; skip the match work.
            continue
        if _needs_word_boundary(pattern):
            matched = re.search(r'\b' + re.escape(pattern) + r'\b', name_lower) is not None
        else:
            # For longer patterns, substring matching is fine
            matched = pattern in name_lower
        if matched:
            best_info = info
            best_len = len(pattern)
    return best_info


def detect_city(name: str) -> Optional[Tuple[str, str]]:
    """Detect city from name.

    Returns the (province, city_code) of the first CITY_PATTERNS entry, in
    table order, that matches the normalized name. A hit whose matched text
    lies inside another hit's matched text is ignored, so "Bergen op Zoom" is
    not reported as "Bergen".
    """
    name_lower = normalize_name(name)
    hits = []
    for pattern, location in CITY_PATTERNS.items():
        match = re.search(pattern, name_lower)
        if match:
            hits.append((match.span(), location))

    for span, location in hits:
        shadowed = any(
            other != span and other[0] <= span[0] and span[1] <= other[1]
            for other, _ in hits
        )
        if not shadowed:
            return location
    return None


def detect_non_dutch(name: str) -> Optional[str]:
    """Detect if organization is not Dutch.

    Returns the country code of the first NON_DUTCH_PATTERNS entry that
    matches the normalized name, or None when no pattern matches.
    """
    name_lower = normalize_name(name)
    for pattern, country in NON_DUTCH_PATTERNS.items():
        if re.search(pattern, name_lower):
            return country
    return None


def infer_type(name: str) -> str:
    """Infer institution type from name.

    Returns the type code of the first TYPE_KEYWORDS keyword contained in the
    normalized name, or 'M' when none is found.
    """
    name_lower = normalize_name(name)
    for keyword, type_code in TYPE_KEYWORDS.items():
        if keyword in name_lower:
            return type_code
    return 'M'  # Default to Museum


def generate_abbreviation(name: str) -> str:
    """Generate abbreviation from name.

    Takes the upper-cased first letter of every word that is not a stop word
    and starts with a letter; the result is capped at 8 characters. Returns
    'UNK' when no word contributes a letter.
    """
    skip = {'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'en', 'of',
            'the', 'a', 'an', 'for', 'and', 'or', 'at', 'on',
            'stichting', 'museum', 'archief', 'bibliotheek'}
    # Split on whitespace, hyphens, quotes and parentheses.
    words = re.split(r'[\s\-\'\"\(\)]+', name)
    abbrev = ''.join(
        w[0].upper() for w in words
        if w and w.lower() not in skip and w[0].isalpha()
    )
    return abbrev[:8] if abbrev else 'UNK'


def load_yaml(filepath: Path) -> Optional[Dict]:
    """Load YAML file.

    Returns the parsed document, or None when the file cannot be read,
    decoded or parsed. Only those expected failures are swallowed, so
    KeyboardInterrupt and programming errors still propagate.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        return None


def save_yaml(filepath: Path, data: Dict):
    """Save YAML file (block style, key order preserved, unicode kept as-is)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def _extract_emic_name(data) -> str:
    """Return data['custodian_name']['emic_name'], or '' if absent or malformed."""
    if not isinstance(data, dict):
        return ''
    custodian = data.get('custodian_name')
    if not isinstance(custodian, dict):
        return ''
    name = custodian.get('emic_name')
    return name if isinstance(name, str) else ''


def _append_provenance_note(data: Dict, note: str) -> None:
    """Append *note* to data['provenance']['notes'], creating the containers as needed."""
    provenance = data.get('provenance')
    if provenance is None:
        # Covers both a missing key and an explicit YAML null.
        provenance = {}
        data['provenance'] = provenance
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif not isinstance(notes, list):
        # A single scalar note (typically a string) becomes a one-item list.
        notes = [notes]
    notes.append(note)
    provenance['notes'] = notes


def _resolve_loaded(filepath: Path, data: Dict, name: str, custodian_dir: Path,
                    dry_run: bool) -> Tuple[str, Optional[Path]]:
    """Apply the resolution strategies to an already-loaded PENDING record."""
    # Strategy 1: Check if non-Dutch
    new_country = detect_non_dutch(name)
    if new_country and new_country != 'NL':
        # Reclassify to different country
        new_name = filepath.stem.replace('NL-XX-XXX-PENDING-', f'{new_country}-XX-XXX-PENDING-')
        new_filepath = custodian_dir / f"{new_name}.yaml"
        if new_filepath.exists():
            return ('collision', new_filepath)
        if not dry_run:
            data['ghcid_current'] = new_name
            save_yaml(new_filepath, data)
            filepath.unlink()
        return ('reclassified', new_filepath)

    # Strategy 2: Known organization lookup
    known = lookup_known_org(name)
    if known:
        prov, city, inst_type, abbrev = known
        method = 'known org lookup'
    else:
        # Strategy 3: City name extraction
        city_info = detect_city(name)
        if not city_info:
            return ('failed', None)
        prov, city = city_info
        inst_type = infer_type(name)
        abbrev = generate_abbreviation(name)
        method = 'city extraction'

    new_ghcid = f"NL-{prov}-{city}-{inst_type}-{abbrev}"
    new_filepath = custodian_dir / f"{new_ghcid}.yaml"
    if new_filepath.exists():
        return ('collision', new_filepath)

    if not dry_run:
        data['ghcid_current'] = new_ghcid
        _append_provenance_note(
            data,
            f"GHCID resolved via {method} on {datetime.now(timezone.utc).isoformat()}",
        )
        # Write the new file first so the record is never lost if saving fails.
        save_yaml(new_filepath, data)
        filepath.unlink()
    return ('resolved', new_filepath)


def resolve_pending_file(filepath: Path, custodian_dir: Path, dry_run: bool = True) -> Tuple[str, Optional[Path]]:
    """
    Resolve a PENDING file.

    With dry_run=False the record is written under its new name in
    custodian_dir and the PENDING file is deleted; with dry_run=True nothing
    on disk is changed.

    Returns: (status, new_filepath)
    Status: 'resolved', 'reclassified', 'collision', 'failed', 'error'
      - 'error': the file could not be loaded or has no usable emic name
        (new_filepath is None)
      - 'collision': the target file already exists (new_filepath is that file)
      - 'failed': no strategy could place the record (new_filepath is None)
    """
    data = load_yaml(filepath)
    name = _extract_emic_name(data)
    if not name:
        return ('error', None)
    return _resolve_loaded(filepath, data, name, custodian_dir, dry_run)


def main(argv=None):
    """Command-line entry point; *argv* defaults to sys.argv[1:]."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'))
    args = parser.parse_args(argv)

    custodian_dir = args.custodian_dir

    print("=" * 80)
    print("COMPREHENSIVE PENDING FILE RESOLVER")
    print("=" * 80)
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print()

    # Find NL PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    if args.limit:
        pending_files = pending_files[:args.limit]

    print(f"Processing {len(pending_files)} files...")
    print()

    stats = {'resolved': 0, 'reclassified': 0, 'collision': 0, 'failed': 0, 'error': 0}

    for filepath in pending_files:
        # Load once here and hand the parsed record to the resolver, rather
        # than parsing every file twice.
        data = load_yaml(filepath)
        name = _extract_emic_name(data)
        if not name:
            stats['error'] += 1
            continue

        status, new_path = _resolve_loaded(filepath, data, name, custodian_dir, args.dry_run)
        stats[status] += 1

        if status in ('resolved', 'reclassified'):
            action = 'DRY RUN' if args.dry_run else status.upper()
            print(f"[{action}] {name[:45]}")
            if new_path:
                print(f" -> {new_path.name}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for status, count in stats.items():
        if count > 0:
            print(f" {status}: {count}")
    print(f" TOTAL: {sum(stats.values())}")


if __name__ == '__main__':
    main()