#!/usr/bin/env python3
"""
Final precise matching of LinkedIn custodians to existing YAML files.
Expanded manual mapping based on research.

The script reads a JSON list of custodians, sorts each one into a result
bucket using the hand-maintained tables below, prints a console report and
writes the full result to a JSON file.
"""

import json
import os
from pathlib import Path
from datetime import datetime

BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches_final.json"

# Comprehensive manual mapping (LinkedIn slug -> YAML filename).
# A value of None marks an organization that is known but is not a custodian.
KNOWN_MAPPINGS = {
    # Major Dutch institutions
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB-kb_nationale_bibliotheek.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "nederlands-instituut-voor-beeld-en-geluid": "NL-NH-HIL-M-NIBG.yaml",
    "nederlands-openluchtmuseum": "NL-GE-ARN-M-NOM.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "paleis-het-loo": "NL-GE-APE-M-NMPL.yaml",
    "zuiderzeemuseum": "NL-NH-ENK-M-ZM.yaml",
    "huygens-institute": "NL-NH-AMS-R-HI.yaml",
    "niod-institute-for-war-holocaust-and-genocide-studies": "NL-NH-AMS-A-NIWHGS.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-UA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-A-NI.yaml",

    # More Dutch institutions
    "museum-boijmans-van-beuningen": "NL-ZH-ROT-M-MBVB.yaml",
    "mauritshuis": "NL-ZH-DHA-M-MH.yaml",
    "museum-het-valkhof": "NL-GE-NIJ-M-MHV.yaml",
    "museum-rembrandthuis": "NL-NH-AMS-M-MRH.yaml",

    # Regional archives
    "regionaal-archief-alkmaar": "NL-NH-ALK-A-RAA.yaml",
    "regionaal-archief-tilburg": "NL-NB-TIL-A-RAT.yaml",
    "regionaal-archief-rivierenland": "NL-GE-TIE-A-RAR.yaml",
    "regionaal-archief-nijmegen": "NL-GE-NIJ-A-RAN.yaml",
    "regionaal-archief-zuid-utrecht": "NL-UT-WOU-A-RAZU.yaml",
    "waterlands-archief": "NL-NH-PUR-A-WA.yaml",
    "west-brabants-archief": "NL-NB-BER-A-WBA.yaml",
    "het-flevolands-archief": "NL-FL-LEL-A-FA.yaml",
    "drents-archief": "NL-DR-ASS-A-DA.yaml",
    "westfries-archief": "NL-NH-HOO-A-WFA.yaml",

    # Other Dutch museums/institutions
    "zeeuws-museum": "NL-ZE-MID-M-ZM.yaml",
    "h-art-museum": "NL-NH-AMS-M-HM.yaml",  # formerly Hermitage Amsterdam
    "nationaal-militair-museum": "NL-UT-SOE-M-NMM.yaml",
    "museum-bronbeek": "NL-GE-ARN-M-MB.yaml",
    "airborne-museum": "NL-GE-OOS-M-AM.yaml",
    "nemo-science-museum": "NL-NH-AMS-M-NEMO.yaml",
    "naturalis-biodiversity-center": "NL-ZH-LEI-M-NBC.yaml",
    "museum-arnhem": "NL-GE-ARN-M-MA.yaml",
    "coda-apeldoorn": "NL-GE-APE-M-C.yaml",

    # Libraries
    "tu-delft-library": "NL-ZH-DEL-L-TUD.yaml",
    "de-bblthk": "NL-GE-WAG-L-BBL.yaml",

    # Other organizations
    "raad-voor-cultuur": None,  # Advisory council, not a custodian
    "fonds-voor-cultuurparticipatie": None,  # Fund, not a custodian
    "stichting-kunst-cultuur": None,  # Provincial support org
    "cultuur-oost": None,  # Provincial support org
    "european-cultural-foundation": None,  # Foundation, not a custodian
}

# International institutions (outside NL scope for now)
INTERNATIONAL = {
    "national-library-of-australia",
    "internet-archive",
    "yad-vashem-the-world-holocaust-remembrance-center",
    "caen-memorial",
    "the-art-loss-register",
    "arolsen-archives",
    "centre-for-media-communication-and-information-sciences",
    "culture-action-europe",
    "advn-archief-voor-nationale-bewegingen",
}

# Distinguishes "slug absent from KNOWN_MAPPINGS" from "slug mapped to None".
_UNMAPPED = object()


def _classify(custodian, custodian_dir):
    """Decide which result bucket a single custodian record belongs to.

    Args:
        custodian: Custodian record; only its ``'slug'`` key is read here.
        custodian_dir: Directory (``Path``) in which mapped YAML files are
            looked up.

    Returns:
        A ``(bucket, entry)`` tuple where ``bucket`` is one of the list keys
        of the results dict (``'matched'``, ``'international'``,
        ``'not_heritage_custodian'``, ``'needs_yaml_creation'``) and
        ``entry`` is the dict to append to that list.

    Raises:
        KeyError: If the record has no ``'slug'`` key.
    """
    slug = custodian['slug']

    # The international set is consulted first, so it wins over any mapping.
    if slug in INTERNATIONAL:
        return 'international', {
            'custodian': custodian,
            'note': 'International institution - outside NL scope'
        }

    yaml_file = KNOWN_MAPPINGS.get(slug, _UNMAPPED)

    if yaml_file is _UNMAPPED:
        # Not in mapping - needs investigation or creation
        return 'needs_yaml_creation', {
            'custodian': custodian,
            'note': 'No mapping exists - needs YAML creation or research'
        }

    if yaml_file is None:
        return 'not_heritage_custodian', {
            'custodian': custodian,
            'note': 'Organization exists but is not a heritage custodian (fund, council, etc.)'
        }

    yaml_path = custodian_dir / yaml_file
    if yaml_path.exists():
        return 'matched', {
            'custodian': custodian,
            'yaml_file': str(yaml_path),
            'yaml_filename': yaml_file,
            'match_method': 'manual_mapping'
        }

    # A mapping exists but the file it names is missing on disk.
    return 'needs_yaml_creation', {
        'custodian': custodian,
        'expected_file': yaml_file,
        'note': 'Expected YAML file does not exist'
    }


def _print_report(results):
    """Print the summary counts and the per-bucket listings to stdout."""
    summary = results['summary']
    rule = "=" * 70

    print(rule)
    print("FINAL MATCHING RESULTS")
    print(rule)
    print(f"Total custodians: {summary['total']}")
    print(f"Matched to existing YAML: {summary['matched_to_yaml']}")
    print(f"International (out of scope): {summary['international']}")
    print(f"Not heritage custodians: {summary['not_heritage_custodian']}")
    print(f"Needs YAML creation: {summary['needs_yaml_creation']}")

    print("\n" + rule)
    print("MATCHED TO EXISTING YAML FILES")
    print(rule)
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        print(f"{i:2}. {c['name'][:45]:<45} | {c['heritage_count']:>3} staff | {m['yaml_filename']}")

    print("\n" + rule)
    print("NEEDS YAML CREATION (Dutch heritage institutions)")
    print(rule)
    # Only entries with a positive heritage_count are listed, capped at 40.
    with_staff = [
        x for x in results['needs_yaml_creation']
        if x['custodian']['heritage_count'] > 0
    ][:40]
    for i, m in enumerate(with_staff, 1):
        c = m['custodian']
        loc = c.get('location', {})
        # 'location' is optional and is only trusted when it is a dict.
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        print(f"{i:2}. {c['name'][:45]:<45} | {c['heritage_count']:>3} staff | {city}")

    print("\n" + rule)
    print("INTERNATIONAL (outside current scope)")
    print(rule)
    for i, m in enumerate(results['international'], 1):
        c = m['custodian']
        print(f"{i:2}. {c['name'][:50]:<50} | {c['heritage_count']:>3} staff")


def main(to_profile_file=TO_PROFILE_FILE, output_file=OUTPUT_FILE, custodian_dir=CUSTODIAN_DIR):
    """Match custodians against the manual tables, report and save the result.

    Args:
        to_profile_file: JSON file with a top-level ``'custodians'`` list.
            Defaults to ``TO_PROFILE_FILE``.
        output_file: Path the result JSON is written to. Defaults to
            ``OUTPUT_FILE``.
        custodian_dir: Directory checked for the mapped YAML files. Defaults
            to ``CUSTODIAN_DIR``.

    Returns:
        The results dict: four bucket lists (``'matched'``,
        ``'international'``, ``'not_heritage_custodian'``,
        ``'needs_yaml_creation'``), a ``'summary'`` dict of counts and a
        ``'generated_at'`` ISO timestamp.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        KeyError: If the input lacks ``'custodians'`` or a record lacks a key
            that classification or the report reads.
    """
    print("Loading custodians to profile...")
    # Explicit UTF-8: without it the locale encoding is used, which can
    # mis-decode non-ASCII text in the input on non-UTF-8 systems.
    with open(to_profile_file, encoding="utf-8") as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    # Accept str as well as Path so the '/' join in _classify always works.
    custodian_dir = Path(custodian_dir)

    results = {
        'matched': [],
        'international': [],
        'not_heritage_custodian': [],
        'needs_yaml_creation': [],
        'summary': {},
        'generated_at': datetime.now().isoformat()
    }

    for custodian in custodians:
        bucket, entry = _classify(custodian, custodian_dir)
        results[bucket].append(entry)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched_to_yaml': len(results['matched']),
        'international': len(results['international']),
        'not_heritage_custodian': len(results['not_heritage_custodian']),
        'needs_yaml_creation': len(results['needs_yaml_creation'])
    }

    # Print results
    _print_report(results)

    # Save results
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)

    print(f"\nResults saved to: {output_file}")

    return results


if __name__ == "__main__":
    main()