glam/scripts/match_custodians_to_yaml_final.py
2025-12-15 22:31:41 +01:00

208 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Final precise matching of LinkedIn custodians to existing YAML files.
Expanded manual mapping based on research.
"""
import json
import os
from pathlib import Path
from datetime import datetime
# Root of the GLAM checkout. The default is the original hard-coded location;
# set the GLAM_BASE_DIR environment variable to run the script against a
# checkout that lives somewhere else without editing this file.
BASE_DIR = Path(os.environ.get("GLAM_BASE_DIR", "/Users/kempersc/apps/glam"))

# Directory that is joined with the YAML filenames from KNOWN_MAPPINGS.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"

# Shared parent of the input and output JSON files below.
_PARSED_DIR = CUSTODIAN_DIR / "person" / "affiliated" / "parsed"

# Input: JSON document with a top-level "custodians" list (read by main()).
TO_PROFILE_FILE = _PARSED_DIR / "custodians_to_profile.json"

# Output: JSON report of the matching results (written by main()).
OUTPUT_FILE = _PARSED_DIR / "custodian_yaml_matches_final.json"
# Slugs of organizations that are deliberately mapped to None in
# KNOWN_MAPPINGS: they are known, but are not heritage custodians.
_NOT_CUSTODIAN_SLUGS = (
    "raad-voor-cultuur",  # Advisory council, not a custodian
    "fonds-voor-cultuurparticipatie",  # Fund, not a custodian
    "stichting-kunst-cultuur",  # Provincial support org
    "cultuur-oost",  # Provincial support org
    "european-cultural-foundation",  # Foundation, not a custodian
)

# Hand-maintained lookup table: LinkedIn slug -> YAML filename.
# A value of None marks an organization that has no custodian YAML on purpose.
KNOWN_MAPPINGS = {
    # Major Dutch institutions
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB-kb_nationale_bibliotheek.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "nederlands-instituut-voor-beeld-en-geluid": "NL-NH-HIL-M-NIBG.yaml",
    "nederlands-openluchtmuseum": "NL-GE-ARN-M-NOM.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "paleis-het-loo": "NL-GE-APE-M-NMPL.yaml",
    "zuiderzeemuseum": "NL-NH-ENK-M-ZM.yaml",
    "huygens-institute": "NL-NH-AMS-R-HI.yaml",
    "niod-institute-for-war-holocaust-and-genocide-studies": "NL-NH-AMS-A-NIWHGS.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-UA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-A-NI.yaml",

    # More Dutch institutions
    "museum-boijmans-van-beuningen": "NL-ZH-ROT-M-MBVB.yaml",
    "mauritshuis": "NL-ZH-DHA-M-MH.yaml",
    "museum-het-valkhof": "NL-GE-NIJ-M-MHV.yaml",
    "museum-rembrandthuis": "NL-NH-AMS-M-MRH.yaml",

    # Regional archives
    "regionaal-archief-alkmaar": "NL-NH-ALK-A-RAA.yaml",
    "regionaal-archief-tilburg": "NL-NB-TIL-A-RAT.yaml",
    "regionaal-archief-rivierenland": "NL-GE-TIE-A-RAR.yaml",
    "regionaal-archief-nijmegen": "NL-GE-NIJ-A-RAN.yaml",
    "regionaal-archief-zuid-utrecht": "NL-UT-WOU-A-RAZU.yaml",
    "waterlands-archief": "NL-NH-PUR-A-WA.yaml",
    "west-brabants-archief": "NL-NB-BER-A-WBA.yaml",
    "het-flevolands-archief": "NL-FL-LEL-A-FA.yaml",
    "drents-archief": "NL-DR-ASS-A-DA.yaml",
    "westfries-archief": "NL-NH-HOO-A-WFA.yaml",

    # Other Dutch museums/institutions
    "zeeuws-museum": "NL-ZE-MID-M-ZM.yaml",
    "h-art-museum": "NL-NH-AMS-M-HM.yaml",  # formerly Hermitage Amsterdam
    "nationaal-militair-museum": "NL-UT-SOE-M-NMM.yaml",
    "museum-bronbeek": "NL-GE-ARN-M-MB.yaml",
    "airborne-museum": "NL-GE-OOS-M-AM.yaml",
    "nemo-science-museum": "NL-NH-AMS-M-NEMO.yaml",
    "naturalis-biodiversity-center": "NL-ZH-LEI-M-NBC.yaml",
    "museum-arnhem": "NL-GE-ARN-M-MA.yaml",
    "coda-apeldoorn": "NL-GE-APE-M-C.yaml",

    # Libraries
    "tu-delft-library": "NL-ZH-DEL-L-TUD.yaml",
    "de-bblthk": "NL-GE-WAG-L-BBL.yaml",

    # Other organizations (all mapped to None)
    **dict.fromkeys(_NOT_CUSTODIAN_SLUGS),
}
# LinkedIn slugs of institutions outside the Netherlands. They are out of
# scope for now, so they are reported separately instead of being matched.
# Kept alphabetical to make additions and duplicate checks easy.
INTERNATIONAL = {
    "advn-archief-voor-nationale-bewegingen",
    "arolsen-archives",
    "caen-memorial",
    "centre-for-media-communication-and-information-sciences",
    "culture-action-europe",
    "internet-archive",
    "national-library-of-australia",
    "the-art-loss-register",
    "yad-vashem-the-world-holocaust-remembrance-center",
}
def _classify_custodian(custodian, known_mappings, international, custodian_dir):
    """Decide which result bucket one custodian record belongs to.

    Args:
        custodian: Dict for one custodian; must contain the key ``'slug'``.
        known_mappings: Mapping of slug -> YAML filename, or ``None`` for
            organizations that are not heritage custodians.
        international: Container of slugs treated as out of scope.
        custodian_dir: Directory the YAML filenames are resolved against.

    Returns:
        A ``(bucket, entry)`` tuple. ``bucket`` is one of ``'international'``,
        ``'not_heritage_custodian'``, ``'matched'`` or
        ``'needs_yaml_creation'``; ``entry`` is the dict to append to it.

    Raises:
        KeyError: If ``custodian`` has no ``'slug'`` key.
    """
    slug = custodian['slug']

    # The international list wins over the manual mapping.
    if slug in international:
        return 'international', {
            'custodian': custodian,
            'note': 'International institution - outside NL scope'
        }

    if slug not in known_mappings:
        # Not in mapping - needs investigation or creation
        return 'needs_yaml_creation', {
            'custodian': custodian,
            'note': 'No mapping exists - needs YAML creation or research'
        }

    yaml_file = known_mappings[slug]
    if yaml_file is None:
        return 'not_heritage_custodian', {
            'custodian': custodian,
            'note': 'Organization exists but is not a heritage custodian (fund, council, etc.)'
        }

    yaml_path = Path(custodian_dir) / yaml_file
    if yaml_path.exists():
        return 'matched', {
            'custodian': custodian,
            'yaml_file': str(yaml_path),
            'yaml_filename': yaml_file,
            'match_method': 'manual_mapping'
        }

    # A mapping exists but the file it points at is missing on disk.
    return 'needs_yaml_creation', {
        'custodian': custodian,
        'expected_file': yaml_file,
        'note': 'Expected YAML file does not exist'
    }


def _print_report(results):
    """Print the summary and the per-bucket listings to stdout."""
    summary = results['summary']
    rule = "=" * 70

    print(rule)
    print("FINAL MATCHING RESULTS")
    print(rule)
    print(f"Total custodians: {summary['total']}")
    print(f"Matched to existing YAML: {summary['matched_to_yaml']}")
    print(f"International (out of scope): {summary['international']}")
    print(f"Not heritage custodians: {summary['not_heritage_custodian']}")
    print(f"Needs YAML creation: {summary['needs_yaml_creation']}")

    print("\n" + rule)
    print("MATCHED TO EXISTING YAML FILES")
    print(rule)
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        print(f"{i:2}. {c['name'][:45]:<45} | {c['heritage_count']:>3} staff | {m['yaml_filename']}")

    print("\n" + rule)
    print("NEEDS YAML CREATION (Dutch heritage institutions)")
    print(rule)
    # Only entries with at least one heritage staff member are listed, and
    # the listing is capped at 40 rows; the saved JSON contains all of them.
    with_staff = [
        x for x in results['needs_yaml_creation']
        if x['custodian']['heritage_count'] > 0
    ]
    for i, m in enumerate(with_staff[:40], 1):
        c = m['custodian']
        loc = c.get('location', {})
        # 'location' is optional and is only used when it is a dict.
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        print(f"{i:2}. {c['name'][:45]:<45} | {c['heritage_count']:>3} staff | {city}")

    print("\n" + rule)
    print("INTERNATIONAL (outside current scope)")
    print(rule)
    for i, m in enumerate(results['international'], 1):
        c = m['custodian']
        print(f"{i:2}. {c['name'][:50]:<50} | {c['heritage_count']:>3} staff")


def main(*, to_profile_file=None, output_file=None, custodian_dir=None,
         known_mappings=None, international=None):
    """Match custodians to YAML files, print a report and save it as JSON.

    Every argument is optional and keyword-only; when omitted (``None``) the
    corresponding module-level constant is used, so ``main()`` behaves as
    before.

    Args:
        to_profile_file: Path of the input JSON, which must contain a
            top-level ``'custodians'`` list. Defaults to ``TO_PROFILE_FILE``.
        output_file: Path the JSON report is written to. Defaults to
            ``OUTPUT_FILE``.
        custodian_dir: Directory containing the custodian YAML files.
            Defaults to ``CUSTODIAN_DIR``.
        known_mappings: Slug -> YAML filename (or ``None``) mapping.
            Defaults to ``KNOWN_MAPPINGS``.
        international: Container of out-of-scope slugs. Defaults to
            ``INTERNATIONAL``.

    Returns:
        The results dict that was also written to ``output_file``. It has the
        list keys ``'matched'``, ``'international'``,
        ``'not_heritage_custodian'`` and ``'needs_yaml_creation'`` plus
        ``'summary'`` and ``'generated_at'``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        KeyError: If the input lacks ``'custodians'`` or a record lacks a key
            the matching or the report needs.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for a plain ``main()`` call.
    to_profile_file = TO_PROFILE_FILE if to_profile_file is None else to_profile_file
    output_file = OUTPUT_FILE if output_file is None else output_file
    custodian_dir = CUSTODIAN_DIR if custodian_dir is None else custodian_dir
    known_mappings = KNOWN_MAPPINGS if known_mappings is None else known_mappings
    international = INTERNATIONAL if international is None else international

    print("Loading custodians to profile...")
    # Explicit UTF-8 so that reading does not depend on the platform locale.
    with open(to_profile_file, encoding='utf-8') as f:
        to_profile_data = json.load(f)
    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    results = {
        'matched': [],
        'international': [],
        'not_heritage_custodian': [],
        'needs_yaml_creation': [],
        'summary': {},
        'generated_at': datetime.now().isoformat()
    }

    for custodian in custodians:
        bucket, entry = _classify_custodian(
            custodian, known_mappings, international, custodian_dir
        )
        results[bucket].append(entry)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched_to_yaml': len(results['matched']),
        'international': len(results['international']),
        'not_heritage_custodian': len(results['not_heritage_custodian']),
        'needs_yaml_creation': len(results['needs_yaml_creation'])
    }

    # Print results
    _print_report(results)

    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to: {output_file}")

    return results


if __name__ == "__main__":
    main()