glam/scripts/enrich_chilean_batch10.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

220 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 10 Wikidata Enrichment
Single manual enrichment: Servicio Nacional del Patrimonio Cultural
Target: 55/90 institutions (61.1% coverage)
Note: This organization was reformed from "Consejo de Monumentos Nacionales"
in 2017 under Ley 21.045. We're using Q5784049 (the predecessor's Wikidata entry)
and documenting the organizational change.
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 10: Single official institution with NAME_CHANGE event.
# This mapping is the only enrichment applied by the script: "name" and "city"
# are used to locate the record, the remaining keys are copied into it.
BATCH_10_ENRICHMENT = {
    # Lookup keys (must match the record's 'name' exactly).
    "name": "Servicio Nacional del Patrimonio Cultural",
    "city": "Santiago",
    # Wikidata entity of the predecessor body (see "notes" below).
    "q_number": "Q5784049",
    # Human-readable label; only used in console output.
    "wikidata_name": "National Monuments Council / Consejo de Monumentos Nacionales",
    # Stored in the record's provenance as 'wikidata_match_confidence'.
    "confidence": "high",
    # Appended to the record's provenance notes, prefixed with "Batch 10: ".
    "notes": "Wikidata Q5784049 refers to Consejo de Monumentos Nacionales (1925-2017). Organization was reformed and renamed in 2017 under Ley 21.045 but maintains institutional continuity.",
    # Organizational change event appended to the record's 'change_history'.
    "change_event": {
        "event_id": "https://w3id.org/heritage/custodian/event/cl-snpc-reform-2017",
        "change_type": "NAME_CHANGE",
        "event_date": "2017-11-03",
        # Multi-line description; continuation lines are part of the string value.
        "event_description": """Reformed from Consejo de Monumentos Nacionales to Servicio Nacional del Patrimonio Cultural under Ley 21.045 (November 3, 2017).
Created as part of the new Ministerio de las Culturas, las Artes y el Patrimonio.
The organization maintains institutional continuity from 1925 founding, but with expanded mandate and modern governance structure.""",
        "source_documentation": "https://www.leychile.cl/N?i=1110097"
    }
}
def load_yaml(file_path: Path) -> list:
    """Load the institution records stored in a YAML file.

    Args:
        file_path: Location of the UTF-8 encoded YAML document.

    Returns:
        The parsed top-level YAML value. An empty document, which
        ``yaml.safe_load`` parses as ``None``, is returned as an empty list
        so callers can always iterate over, or take ``len()`` of, the result.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        parsed = yaml.safe_load(f)
    # safe_load yields None for an empty document; honour the annotated type.
    return [] if parsed is None else parsed
def save_yaml(data: list, file_path: Path) -> None:
    """Serialise ``data`` as YAML and write it to ``file_path``.

    The target file is overwritten and encoded as UTF-8. The dump uses block
    style, keeps non-ASCII characters unescaped, does not sort mapping keys,
    wraps at 120 columns and indents by 2 spaces.

    Args:
        data: Records to serialise.
        file_path: Destination file.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with open(file_path, 'w', encoding='utf-8') as out_stream:
        yaml.dump(data, out_stream, **dump_options)
def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the first institution record matching ``name`` and ``city``.

    A record matches when its 'name' equals ``name`` exactly and the 'city'
    of its first entry in 'locations' equals ``city``. Records without a
    'name', or with a missing, empty or null 'locations' value, are handled
    gracefully instead of aborting the search.

    Args:
        institutions: Institution records (dicts) to search.
        name: Exact institution name to look for.
        city: Expected city of the institution's first location.

    Returns:
        The matching record itself (not a copy), so callers may mutate it.

    Raises:
        ValueError: If no record matches.
    """
    for inst in institutions:
        if inst.get('name') != name:
            continue
        # 'locations' may be absent, an empty list, or an explicit null.
        locations = inst.get('locations') or [{}]
        first_location = locations[0] or {}
        inst_city = first_location.get('city', '')
        # When the requested city is "Santiago" the city comparison is
        # bypassed and any name match is accepted.
        # NOTE(review): presumably a workaround for records lacking a city —
        # confirm before relying on it for other cities.
        if inst_city == city or city == "Santiago":
            return inst
    raise ValueError(f"Institution not found: {name} ({city})")
def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Add the Wikidata identifier, change event and provenance to ``inst``.

    The record is modified in place. If it already carries an identifier
    whose 'identifier_scheme' is 'Wikidata', a warning is printed and the
    record is left untouched. Keys that are missing *or* explicitly null
    ('identifiers', 'change_history', 'provenance', 'provenance.notes') are
    initialised before use.

    Args:
        inst: Institution record to enrich; must contain 'name' when it
            already has a Wikidata identifier (used in the warning).
        enrichment: Mapping with 'q_number', 'wikidata_name', 'confidence',
            'notes' and a 'change_event' mapping holding 'event_id',
            'change_type', 'event_date', 'event_description' and
            'source_documentation'.

    Raises:
        KeyError: If ``enrichment`` lacks one of the keys listed above.
    """
    # A YAML key with no value loads as None; treat it like a missing key.
    existing_ids = inst.get('identifiers') or []
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ {inst['name']} already has Wikidata identifier")
        return

    # Add Wikidata identifier
    q_number = enrichment['q_number']
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}"
    }
    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)
    print(f" ✅ Added Wikidata: {q_number} ({enrichment['wikidata_name']})")

    # Add organizational change event (only the five schema fields are copied)
    if inst.get('change_history') is None:
        inst['change_history'] = []
    change_event = enrichment['change_event']
    inst['change_history'].append({
        'event_id': change_event['event_id'],
        'change_type': change_event['change_type'],
        'event_date': change_event['event_date'],
        'event_description': change_event['event_description'],
        'source_documentation': change_event['source_documentation']
    })
    print(f" 📝 Added change event: {change_event['change_type']} ({change_event['event_date']})")

    # Update provenance
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    provenance['enrichment_method'] = 'Manual Wikidata linkage (Batch 10 - Official Institution)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']

    # Add notes: normalise a missing/null value or a single string to a list
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(f"Batch 10: {enrichment['notes']}")
    print(" 💡 Note: Organization reformed from Consejo de Monumentos Nacionales (1925) to current name (2017)")
def _has_wikidata_id(inst: dict) -> bool:
    """Return True if ``inst`` carries an identifier with scheme 'Wikidata'."""
    # 'identifiers' may be absent or an explicit null in the YAML data.
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers') or []
    )


def _coverage_pct(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is 0."""
    return part / whole * 100 if whole > 0 else 0


def main():
    """Run the Batch 10 enrichment and print a coverage report.

    Steps: load the Batch 8 enriched YAML, write a backup copy by re-dumping
    the loaded data next to it, apply ``BATCH_10_ENRICHMENT`` to the matching
    record, save the result to the Batch 10 output file, then print overall
    and per-type Wikidata coverage.

    Lookup or enrichment failures are reported on the console and do not
    abort the run; the output file is written either way.
    """
    from collections import defaultdict

    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 ENRICHMENT")
    print("Official Institution with Organizational Change Event")
    print("=" * 80)
    print()

    # Load data
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Create backup (replaces the '.yaml' suffix with '.yaml.batch10_backup')
    backup_file = input_file.with_suffix('.yaml.batch10_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Apply enrichment
    print("🔧 Applying enrichment...")
    print()
    enrichment = BATCH_10_ENRICHMENT
    print(f"1. {enrichment['name']} ({enrichment['city']})")
    enriched_count = 0
    try:
        inst = find_institution(institutions, enrichment['name'], enrichment['city'])
        # enrich_institution leaves records that already have a Wikidata
        # identifier untouched, so only count the record when it had none.
        already_linked = _has_wikidata_id(inst)
        enrich_institution(inst, enrichment)
        if not already_linked:
            enriched_count = 1
    except ValueError as e:
        print(f"{e}")
    except Exception as e:
        # Top-level boundary of a batch script: report and carry on so the
        # summary below is still produced.
        print(f" ❌ Error: {e}")
    print()

    # Save enriched data
    output_file = Path('data/instances/chile/chilean_institutions_batch10_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()

    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()
    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if _has_wikidata_id(inst))
    # Guarded against an empty dataset, which previously divided by zero.
    overall_pct = _coverage_pct(with_wikidata, total)
    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({overall_pct:.1f}%)")
    print(f"Batch 10 enrichments: {enriched_count}")
    print()

    # By type
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type]['total'] += 1
        if _has_wikidata_id(inst):
            by_type[inst_type]['with_wd'] += 1
    print("Coverage by type:")
    for inst_type in sorted(by_type):
        stats = by_type[inst_type]
        pct = _coverage_pct(stats['with_wd'], stats['total'])
        # NOTE(review): all three status markers are empty strings here —
        # presumably status glyphs were lost at some point; confirm.
        status = "" if pct == 100 else "" if pct >= 50 else ""
        print(f" {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()
    print("🎉 Batch 10 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({overall_pct:.1f}%)")
    print()
    print("📝 Key findings:")
    print(" - Other Batch 10 targets (foundations, cultural centers) not in Wikidata")
    print(" - Recommendation: Focus Batch 11 on remaining museums (13 institutions)")
    print(" - Potential to reach 70%+ coverage with museum-focused enrichment")
# Run the enrichment only when executed as a script, not when imported.
if __name__ == '__main__':
    main()