glam/scripts/add_alternative_names_category2.py
2025-12-01 23:55:55 +01:00

155 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Add alternative_names to Category 2 entries (legal name vs trade name variations).
These are VALID entries where custodian_name is correct but we need to capture
the legal name as an alternative.
"""
import yaml
import os
from datetime import datetime
# Per-entry additions, keyed by the entry's YAML filename.
#
# Each value carries:
#   "alternative_names": list of {"name", "source", "note"} dicts to merge
#                        into the entry's own alternative_names list;
#   "note":              free-text explanation of how the names relate.
CATEGORY2_UPDATES = {
    "0040_museum_contemporary_tibetan_art.yaml": {
        "alternative_names": [
            {
                "name": "Stichting Tibet House Holland",
                "source": "original_entry",
                "note": "Legal foundation name (KvK registration)",
            },
            {
                "name": "Tibet House",
                "source": "inferred",
                "note": "Short name variant",
            },
        ],
        "note": "Museum of Contemporary Tibetan Art is the public name; Stichting Tibet House Holland is the legal entity",
    },
    "0138_Q13447121.yaml": {
        "alternative_names": [
            {
                "name": "Stichting CODA",
                "source": "original_entry",
                "note": "Legal foundation name",
            },
            {
                "name": "CODA Museum",
                "source": "google_maps",
                "note": "Google Maps listing name",
            },
        ],
        "note": "CODA Apeldoorn is the museum name; Stichting CODA is the legal foundation",
    },
    "0399_heemkundevereniging_roggel.yaml": {
        "alternative_names": [
            {
                "name": "Museumboerderij De Leimskoel",
                "source": "google_maps",
                "note": "Museum farm operated by the society",
            },
        ],
        "note": "Heemkundevereniging Roggel operates Museumboerderij De Leimskoel",
    },
    "0424_oce_museum.yaml": {
        "alternative_names": [
            {
                "name": "Océ Museum",
                "source": "inferred",
                "note": "Common short name",
            },
            # NOTE(review): this is the same foundation name that is listed as
            # the full legal name for 0770_unknown.yaml, and its own note says
            # "(if applicable)" -- confirm it really belongs to this entry.
            {
                "name": "Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR",
                "source": "inferred",
                "note": "Related heritage foundation (if applicable)",
            },
        ],
        "note": "Heritage organization preserving Océ (now Canon) history, located at Canon Customer Experience Center",
    },
    "0432_Q56460988.yaml": {
        "alternative_names": [
            {
                "name": "Stichting De Domijnen",
                "source": "original_entry",
                "note": "Legal foundation name",
            },
            {
                "name": "De Domijnen",
                "source": "inferred",
                "note": "Short name variant",
            },
            {
                "name": "Filmhuis ZICHT",
                "source": "google_maps",
                "note": "Cinema venue in same building",
            },
        ],
        "note": "Museum De Domijnen is the public name; Stichting De Domijnen is the legal foundation",
    },
    "0697_Q2218933.yaml": {
        "alternative_names": [
            {
                "name": "Museumhuis Barnaart",
                "source": "google_maps",
                "note": "Public museum name",
            },
            {
                "name": "Museumhuis Hendrick de Keyser - Huis Barnaart",
                "source": "inferred",
                "note": "Full official name",
            },
        ],
        "note": "Historic house museum, part of Vereniging Hendrick de Keyser museum houses",
    },
    "0752_unknown.yaml": {
        "alternative_names": [
            {
                "name": "Oudheidkundige Vereniging Medenblick",
                "source": "original_entry",
                "note": "Parent historical society",
            },
            {
                "name": "OVM",
                "source": "inferred",
                "note": "Abbreviation",
            },
        ],
        "note": "Historisch Museum Medemblik is operated by Oudheidkundige Vereniging Medenblick",
    },
    "0770_unknown.yaml": {
        "alternative_names": [
            {
                "name": "Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR",
                "source": "original_entry",
                "note": "Full legal name",
            },
            {
                "name": "NLR Heritage Museum",
                "source": "inferred",
                "note": "English name variant",
            },
        ],
        "note": "Abbreviated name vs full legal foundation name",
    },
    "0843_Q85311353.yaml": {
        "alternative_names": [
            {
                "name": "Museum De Waag",
                "source": "google_maps",
                "note": "Museum location in De Waag building",
            },
            {
                "name": "Speelgoedmuseum Deventer",
                "source": "original_entry",
                "note": "Toy museum location",
            },
            {
                "name": "Deventer Verhaal (locaties: Speelgoedmuseum & Museum De Waag)",
                "source": "original_entry",
                "note": "Full name with locations",
            },
        ],
        "note": "Deventer Verhaal is the umbrella organization operating multiple museum locations",
    },
    "0893_unknown.yaml": {
        "alternative_names": [
            {
                "name": "Historische Kring Ommen",
                "source": "original_entry",
                "note": "Parent historical society",
            },
        ],
        "note": "Museum-Ommen is operated by Historische Kring Ommen",
    },
    "1517_rijksmuseum_amsterdam.yaml": {
        "alternative_names": [
            {
                "name": "Rijksmuseum Amsterdam",
                "source": "original_entry",
                "note": "Full name with city",
            },
            {
                "name": "Rijks",
                "source": "inferred",
                "note": "Colloquial short name",
            },
        ],
        "note": "Rijksmuseum is the common public name",
    },
    "1610_eye_film_instituut_nederland.yaml": {
        "alternative_names": [
            {
                "name": "EYE Film Instituut Nederland",
                "source": "original_entry",
                "note": "Full legal name",
            },
            {
                "name": "EYE",
                "source": "inferred",
                "note": "Common abbreviation",
            },
            {
                "name": "Eye Film Institute Netherlands",
                "source": "inferred",
                "note": "English name",
            },
        ],
        "note": "Eye Filmmuseum is the public museum name; EYE Film Instituut Nederland is the legal entity",
    },
}
# Directory containing the entry YAML files that update_entry() rewrites.
# The default is the original machine-specific location; set the
# GLAM_ENTRIES_DIR environment variable to run the script against a checkout
# that lives elsewhere. An empty value falls back to the default.
ENTRIES_DIR = (
    os.environ.get("GLAM_ENTRIES_DIR")
    or "/Users/kempersc/apps/glam/data/nde/enriched/entries"
)
def update_entry(filename: str, updates: dict) -> bool:
    """Merge alternative names and review metadata into one entry YAML file.

    Args:
        filename: Name of the entry file inside ``ENTRIES_DIR``.
        updates: Mapping with an ``alternative_names`` list of dicts (each
            with at least a ``name`` key) and, optionally, a ``note`` string
            stored on the entry's ``custodian_name`` block.

    Returns:
        ``True`` when the file was rewritten. ``False`` (with a printed
        warning) when the file does not exist, its top-level YAML document
        is not a mapping, or its ``alternative_names`` value is neither a
        list nor null.
    """
    filepath = os.path.join(ENTRIES_DIR, filename)

    # Open directly instead of checking existence first: one filesystem
    # access and no window between the check and the read.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        print(f" ⚠ File not found: {filename}")
        return False

    # An empty file loads as None and a list/scalar document is not an
    # entry record; refuse to touch it rather than crash or overwrite it.
    if not isinstance(data, dict):
        print(f" ⚠ Unexpected YAML structure (expected a mapping): {filename}")
        return False

    # Treat a missing key and an explicit null the same way: start a list.
    alternatives = data.get('alternative_names')
    if alternatives is None:
        alternatives = data['alternative_names'] = []
    elif not isinstance(alternatives, list):
        print(f" ⚠ alternative_names is not a list: {filename}")
        return False

    # Names already on the entry. Plain-string items are accepted as names
    # too, in case an entry stores them that way.
    existing_names = set()
    for item in alternatives:
        if isinstance(item, dict):
            existing_names.add(item.get('name', ''))
        elif isinstance(item, str):
            existing_names.add(item)

    # Add new alternative names (avoid duplicates). Append a copy so the
    # document never shares dict objects with the caller's ``updates``.
    for alt in updates['alternative_names']:
        if alt['name'] not in existing_names:
            alternatives.append(dict(alt))
            existing_names.add(alt['name'])

    # Update custodian_name with the note and manual-review metadata.
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        if 'note' in updates:
            custodian['name_variation_note'] = updates['note']
        # Boost confidence for validated entries; a missing or null
        # confidence counts as 0.
        if (custodian.get('confidence') or 0) < 0.8:
            custodian['confidence'] = 0.85
        custodian['manual_review'] = True
        custodian['manual_review_timestamp'] = datetime.now().isoformat()
        custodian['manual_review_note'] = "Category 2: Legal name vs trade name variation - both names valid"
    elif 'custodian_name' in data:
        print(f" ⚠ custodian_name is not a mapping; review metadata skipped: {filename}")

    # Serialise before opening the file for writing: if dumping fails, the
    # existing file has not been truncated yet.
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)
    return True
def main():
    """Apply every CATEGORY2_UPDATES item and print a per-file and total report."""
    separator = "=" * 60

    print("Adding alternative_names to Category 2 entries...")
    print(separator)

    updated = 0
    for filename in CATEGORY2_UPDATES:
        updates = CATEGORY2_UPDATES[filename]
        print(f"Processing: {filename}")
        # update_entry reports its own failure; just move on to the next file.
        if not update_entry(filename, updates):
            continue
        print(f" ✓ Updated with {len(updates['alternative_names'])} alternative names")
        updated += 1

    print(separator)
    print(f"Updated {updated}/{len(CATEGORY2_UPDATES)} entries")


if __name__ == "__main__":
    main()