155 lines
7.2 KiB
Python
155 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Add alternative_names to Category 2 entries (legal name vs trade name variations).
|
|
These are VALID entries where custodian_name is correct but we need to capture
|
|
the legal name as an alternative.
|
|
"""
|
|
import yaml
|
|
import os
|
|
from datetime import datetime
|
|
|
|
def _alt_name(name, source, note):
    """Build one alternative-name record with keys in name/source/note order."""
    return {"name": name, "source": source, "note": note}


# Per-file additions for Category 2 entries, keyed by the entry's YAML
# filename.  Each value carries the alternative-name records to merge in and a
# free-text note explaining how the names relate to each other.
CATEGORY2_UPDATES = {
    "0040_museum_contemporary_tibetan_art.yaml": {
        "alternative_names": [
            _alt_name("Stichting Tibet House Holland", "original_entry", "Legal foundation name (KvK registration)"),
            _alt_name("Tibet House", "inferred", "Short name variant"),
        ],
        "note": "Museum of Contemporary Tibetan Art is the public name; Stichting Tibet House Holland is the legal entity",
    },
    "0138_Q13447121.yaml": {
        "alternative_names": [
            _alt_name("Stichting CODA", "original_entry", "Legal foundation name"),
            _alt_name("CODA Museum", "google_maps", "Google Maps listing name"),
        ],
        "note": "CODA Apeldoorn is the museum name; Stichting CODA is the legal foundation",
    },
    "0399_heemkundevereniging_roggel.yaml": {
        "alternative_names": [
            _alt_name("Museumboerderij De Leimskoel", "google_maps", "Museum farm operated by the society"),
        ],
        "note": "Heemkundevereniging Roggel operates Museumboerderij De Leimskoel",
    },
    "0424_oce_museum.yaml": {
        "alternative_names": [
            _alt_name("Océ Museum", "inferred", "Common short name"),
            # NOTE(review): this is the same NLR foundation name listed for
            # 0770_unknown.yaml and its own note says "(if applicable)" --
            # confirm it really belongs to this entry.
            _alt_name("Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR", "inferred", "Related heritage foundation (if applicable)"),
        ],
        "note": "Heritage organization preserving Océ (now Canon) history, located at Canon Customer Experience Center",
    },
    "0432_Q56460988.yaml": {
        "alternative_names": [
            _alt_name("Stichting De Domijnen", "original_entry", "Legal foundation name"),
            _alt_name("De Domijnen", "inferred", "Short name variant"),
            _alt_name("Filmhuis ZICHT", "google_maps", "Cinema venue in same building"),
        ],
        "note": "Museum De Domijnen is the public name; Stichting De Domijnen is the legal foundation",
    },
    "0697_Q2218933.yaml": {
        "alternative_names": [
            _alt_name("Museumhuis Barnaart", "google_maps", "Public museum name"),
            _alt_name("Museumhuis Hendrick de Keyser - Huis Barnaart", "inferred", "Full official name"),
        ],
        "note": "Historic house museum, part of Vereniging Hendrick de Keyser museum houses",
    },
    "0752_unknown.yaml": {
        "alternative_names": [
            _alt_name("Oudheidkundige Vereniging Medenblick", "original_entry", "Parent historical society"),
            _alt_name("OVM", "inferred", "Abbreviation"),
        ],
        "note": "Historisch Museum Medemblik is operated by Oudheidkundige Vereniging Medenblick",
    },
    "0770_unknown.yaml": {
        "alternative_names": [
            _alt_name("Stichting Behoud Erfgoed Nederlands Lucht- en Ruimtevaartcentrum NLR", "original_entry", "Full legal name"),
            _alt_name("NLR Heritage Museum", "inferred", "English name variant"),
        ],
        "note": "Abbreviated name vs full legal foundation name",
    },
    "0843_Q85311353.yaml": {
        "alternative_names": [
            _alt_name("Museum De Waag", "google_maps", "Museum location in De Waag building"),
            _alt_name("Speelgoedmuseum Deventer", "original_entry", "Toy museum location"),
            _alt_name("Deventer Verhaal (locaties: Speelgoedmuseum & Museum De Waag)", "original_entry", "Full name with locations"),
        ],
        "note": "Deventer Verhaal is the umbrella organization operating multiple museum locations",
    },
    "0893_unknown.yaml": {
        "alternative_names": [
            _alt_name("Historische Kring Ommen", "original_entry", "Parent historical society"),
        ],
        "note": "Museum-Ommen is operated by Historische Kring Ommen",
    },
    "1517_rijksmuseum_amsterdam.yaml": {
        "alternative_names": [
            _alt_name("Rijksmuseum Amsterdam", "original_entry", "Full name with city"),
            _alt_name("Rijks", "inferred", "Colloquial short name"),
        ],
        "note": "Rijksmuseum is the common public name",
    },
    "1610_eye_film_instituut_nederland.yaml": {
        "alternative_names": [
            _alt_name("EYE Film Instituut Nederland", "original_entry", "Full legal name"),
            _alt_name("EYE", "inferred", "Common abbreviation"),
            _alt_name("Eye Film Institute Netherlands", "inferred", "English name"),
        ],
        "note": "Eye Filmmuseum is the public museum name; EYE Film Instituut Nederland is the legal entity",
    },
}
|
|
|
|
# Directory holding the enriched entry YAML files.  The default is the path
# this script was written against; set the GLAM_ENTRIES_DIR environment
# variable to point at a different checkout.  An empty value falls back to the
# default.
ENTRIES_DIR = os.environ.get("GLAM_ENTRIES_DIR") or "/Users/kempersc/apps/glam/data/nde/enriched/entries"
|
|
|
|
def update_entry(filename: str, updates: dict) -> bool:
    """Merge alternative names and a review note into one entry YAML file.

    The file is parsed with ``yaml.safe_load`` and written back with
    ``yaml.dump`` (key order kept via ``sort_keys=False``); PyYAML does not
    round-trip comments, so any comments in the file are lost on rewrite.

    Args:
        filename: Name of the entry file inside ``ENTRIES_DIR``.
        updates: Mapping with an ``alternative_names`` list of dicts (each
            having at least a ``name`` key) and an optional ``note`` string.

    Returns:
        True if the file was rewritten; False if the file does not exist or
        its top-level YAML value is not a mapping.
    """
    filepath = os.path.join(ENTRIES_DIR, filename)

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        print(f" ⚠ File not found: {filename}")
        return False

    # An empty file loads as None and a list/scalar document cannot hold keys;
    # leave such files untouched instead of crashing on the membership test.
    if not isinstance(data, dict):
        print(f" ⚠ Unexpected YAML structure (expected a mapping): {filename}")
        return False

    # Treat a missing key and an explicit null alike; wrap a stray single
    # value in a list so nothing already recorded is dropped.
    alternatives = data.get('alternative_names')
    if alternatives is None:
        alternatives = []
    elif not isinstance(alternatives, list):
        alternatives = [alternatives]
    data['alternative_names'] = alternatives

    # Add new alternative names (avoid duplicates).  Existing items may be
    # plain strings rather than dicts, so compare on their text in that case.
    existing_names = {
        alt.get('name', '') if isinstance(alt, dict) else str(alt)
        for alt in alternatives
    }
    for alt in updates['alternative_names']:
        if alt['name'] not in existing_names:
            # Copy so the entry's data never aliases the caller's dict.
            alternatives.append(dict(alt))
            existing_names.add(alt['name'])

    # Annotate custodian_name only when it is a mapping that can take keys.
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        if 'note' in updates:
            custodian['name_variation_note'] = updates['note']

        # Boost confidence for validated entries.  A missing, null or
        # non-numeric confidence is treated as "below threshold".
        confidence = custodian.get('confidence', 0)
        if not isinstance(confidence, (int, float)) or confidence < 0.8:
            custodian['confidence'] = 0.85
            custodian['manual_review'] = True
            custodian['manual_review_timestamp'] = datetime.now().isoformat()
            custodian['manual_review_note'] = "Category 2: Legal name vs trade name variation - both names valid"

    # Serialize before opening for write: opening with 'w' truncates the
    # file, so a failure inside yaml.dump must not be able to wipe the entry.
    rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(rendered)

    return True
|
|
|
|
def main():
    """Apply every CATEGORY2_UPDATES item and print a per-file and total report."""
    rule = "=" * 60
    print("Adding alternative_names to Category 2 entries...")
    print(rule)

    succeeded = 0
    for entry_file in CATEGORY2_UPDATES:
        spec = CATEGORY2_UPDATES[entry_file]
        print(f"Processing: {entry_file}")
        # Files that could not be updated are reported by update_entry itself.
        if not update_entry(entry_file, spec):
            continue
        added = len(spec['alternative_names'])
        print(f" ✓ Updated with {added} alternative names")
        succeeded += 1

    print(rule)
    total = len(CATEGORY2_UPDATES)
    print(f"Updated {succeeded}/{total} entries")
|
|
|
|
# Run the batch update only when this file is executed as a script, so that
# importing the module does not rewrite any entry files.
if __name__ == "__main__":
    main()
|