#!/usr/bin/env python3
"""
Add alternative_names to Category 2 entries (legal name vs trade name variations).

These are VALID entries where custodian_name is correct but we need to capture
the legal name as an alternative.
"""

import os
from datetime import datetime
from typing import Optional

import yaml

# Define entries to update with their alternative names.
# Keys are entry filenames inside ENTRIES_DIR; each value carries the list of
# alternative names to merge in plus a free-text note explaining the variation.
CATEGORY2_UPDATES = {
    "0040_museum_contemporary_tibetan_art.yaml": {
        "alternative_names": [
            {"name": "Stichting Tibet House Holland", "source": "original_entry", "note": "Legal foundation name (KvK registration)"},
            {"name": "Tibet House", "source": "inferred", "note": "Short name variant"}
        ],
        "note": "Museum of Contemporary Tibetan Art is the public name; Stichting Tibet House Holland is the legal entity"
    },
    "0138_Q13447121.yaml": {
        "alternative_names": [
            {"name": "Stichting CODA", "source": "original_entry", "note": "Legal foundation name"},
            {"name": "CODA Museum", "source": "google_maps", "note": "Google Maps listing name"}
        ],
        "note": "CODA Apeldoorn is the museum name; Stichting CODA is the legal foundation"
    },
    "0399_heemkundevereniging_roggel.yaml": {
        "alternative_names": [
            {"name": "Museumboerderij De Leimskoel", "source": "google_maps", "note": "Museum farm operated by the society"}
        ],
        "note": "Heemkundevereniging Roggel operates Museumboerderij De Leimskoel"
    },
    # NOTE(review): the second alternative name below is the same NLR foundation
    # name recorded as the legal name for 0770_unknown.yaml, and its own note
    # says "(if applicable)" - confirm it really belongs to the Océ entry.
    "0424_oce_museum.yaml": {
        "alternative_names": [
            {"name": "Océ Museum", "source": "inferred", "note": "Common short name"},
            {"name": "Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR", "source": "inferred", "note": "Related heritage foundation (if applicable)"}
        ],
        "note": "Heritage organization preserving Océ (now Canon) history, located at Canon Customer Experience Center"
    },
    "0432_Q56460988.yaml": {
        "alternative_names": [
            {"name": "Stichting De Domijnen", "source": "original_entry", "note": "Legal foundation name"},
            {"name": "De Domijnen", "source": "inferred", "note": "Short name variant"},
            {"name": "Filmhuis ZICHT", "source": "google_maps", "note": "Cinema venue in same building"}
        ],
        "note": "Museum De Domijnen is the public name; Stichting De Domijnen is the legal foundation"
    },
    "0697_Q2218933.yaml": {
        "alternative_names": [
            {"name": "Museumhuis Barnaart", "source": "google_maps", "note": "Public museum name"},
            {"name": "Museumhuis Hendrick de Keyser - Huis Barnaart", "source": "inferred", "note": "Full official name"}
        ],
        "note": "Historic house museum, part of Vereniging Hendrick de Keyser museum houses"
    },
    "0752_unknown.yaml": {
        "alternative_names": [
            {"name": "Oudheidkundige Vereniging Medenblick", "source": "original_entry", "note": "Parent historical society"},
            {"name": "OVM", "source": "inferred", "note": "Abbreviation"}
        ],
        "note": "Historisch Museum Medemblik is operated by Oudheidkundige Vereniging Medenblick"
    },
    "0770_unknown.yaml": {
        "alternative_names": [
            {"name": "Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR", "source": "original_entry", "note": "Full legal name"},
            {"name": "NLR Heritage Museum", "source": "inferred", "note": "English name variant"}
        ],
        "note": "Abbreviated name vs full legal foundation name"
    },
    "0843_Q85311353.yaml": {
        "alternative_names": [
            {"name": "Museum De Waag", "source": "google_maps", "note": "Museum location in De Waag building"},
            {"name": "Speelgoedmuseum Deventer", "source": "original_entry", "note": "Toy museum location"},
            {"name": "Deventer Verhaal (locaties: Speelgoedmuseum & Museum De Waag)", "source": "original_entry", "note": "Full name with locations"}
        ],
        "note": "Deventer Verhaal is the umbrella organization operating multiple museum locations"
    },
    "0893_unknown.yaml": {
        "alternative_names": [
            {"name": "Historische Kring Ommen", "source": "original_entry", "note": "Parent historical society"}
        ],
        "note": "Museum-Ommen is operated by Historische Kring Ommen"
    },
    "1517_rijksmuseum_amsterdam.yaml": {
        "alternative_names": [
            {"name": "Rijksmuseum Amsterdam", "source": "original_entry", "note": "Full name with city"},
            {"name": "Rijks", "source": "inferred", "note": "Colloquial short name"}
        ],
        "note": "Rijksmuseum is the common public name"
    },
    "1610_eye_film_instituut_nederland.yaml": {
        "alternative_names": [
            {"name": "EYE Film Instituut Nederland", "source": "original_entry", "note": "Full legal name"},
            {"name": "EYE", "source": "inferred", "note": "Common abbreviation"},
            {"name": "Eye Film Institute Netherlands", "source": "inferred", "note": "English name"}
        ],
        "note": "Eye Filmmuseum is the public museum name; EYE Film Instituut Nederland is the legal entity"
    }
}

# Default directory holding the entry YAML files.
ENTRIES_DIR = "/Users/kempersc/apps/glam/data/nde/enriched/entries"


def update_entry(filename: str, updates: dict, entries_dir: Optional[str] = None) -> bool:
    """Merge alternative names into one entry file and rewrite it in place.

    Args:
        filename: Name of the YAML file inside the entries directory.
        updates: Mapping with an ``alternative_names`` list (dicts that each
            have a ``name`` key) and, optionally, a ``note`` string.
        entries_dir: Directory containing the entry files. Defaults to the
            module-level ``ENTRIES_DIR`` when omitted.

    Returns:
        True if the file was rewritten; False if it was missing or its
        content did not have the expected structure (a warning is printed).
    """
    base_dir = ENTRIES_DIR if entries_dir is None else entries_dir
    filepath = os.path.join(base_dir, filename)

    if not os.path.exists(filepath):
        print(f" ⚠ File not found: {filename}")
        return False

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty file and may yield a list or scalar;
    # only a mapping can carry the keys this script edits.
    if not isinstance(data, dict):
        print(f" ⚠ Skipped (YAML root is not a mapping): {filename}")
        return False

    # Add alternative_names if not present; a null value counts as not present.
    alternative_names = data.get('alternative_names')
    if alternative_names is None:
        alternative_names = []
    elif not isinstance(alternative_names, list):
        print(f" ⚠ Skipped (alternative_names is not a list): {filename}")
        return False
    data['alternative_names'] = alternative_names

    # Collect the names already recorded; tolerate plain-string items.
    existing_names = set()
    for alt in alternative_names:
        if isinstance(alt, dict):
            existing_names.add(alt.get('name', ''))
        elif isinstance(alt, str):
            existing_names.add(alt)

    # Add new alternative names (avoid duplicates)
    for alt in updates['alternative_names']:
        if alt['name'] not in existing_names:
            # Copy so the loaded document never aliases the module constant.
            alternative_names.append(dict(alt))
            existing_names.add(alt['name'])

    # Update custodian_name with note if present
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        if 'note' in updates:
            custodian['name_variation_note'] = updates['note']

        # Boost confidence for validated entries; a missing or null
        # confidence is treated as below the threshold.
        confidence = custodian.get('confidence')
        if confidence is None or confidence < 0.8:
            custodian['confidence'] = 0.85
            # NOTE(review): the original indentation was lost, so it is unclear
            # whether the manual_review fields were set only when confidence
            # was boosted (as here) or for every entry - confirm.
            custodian['manual_review'] = True
            custodian['manual_review_timestamp'] = datetime.now().isoformat()
            custodian['manual_review_note'] = "Category 2: Legal name vs trade name variation - both names valid"
    elif 'custodian_name' in data:
        print(f" ⚠ custodian_name is not a mapping; metadata not updated: {filename}")

    # Serialize before opening for writing so a dump failure cannot leave a
    # truncated file behind.
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    return True


def main() -> None:
    """Apply every CATEGORY2_UPDATES item and print a per-file and total summary."""
    print("Adding alternative_names to Category 2 entries...")
    print("=" * 60)

    updated = 0
    for filename, updates in CATEGORY2_UPDATES.items():
        print(f"Processing: {filename}")
        if update_entry(filename, updates):
            print(f" ✓ Updated with {len(updates['alternative_names'])} alternative names")
            updated += 1

    print("=" * 60)
    print(f"Updated {updated}/{len(CATEGORY2_UPDATES)} entries")


if __name__ == "__main__":
    main()