- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
220 lines
7.9 KiB
Python
220 lines
7.9 KiB
Python
#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 10 Wikidata Enrichment
Single manual enrichment: Servicio Nacional del Patrimonio Cultural

Target: 55/90 institutions (61.1% coverage)

Note: This organization was reformed from "Consejo de Monumentos Nacionales"
in 2017 under Ley 21.045. We're using Q5784049 (the predecessor's Wikidata entry)
and documenting the organizational change.
"""

from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Batch 10 payload: a single official institution, linked to Wikidata together
# with the NAME_CHANGE event that records its 2017 reform.

# Organizational change event attached to the institution's change history.
_SNPC_REFORM_EVENT = {
    "event_id": "https://w3id.org/heritage/custodian/event/cl-snpc-reform-2017",
    "change_type": "NAME_CHANGE",
    "event_date": "2017-11-03",
    "event_description": (
        "Reformed from Consejo de Monumentos Nacionales to Servicio Nacional del Patrimonio Cultural under Ley 21.045 (November 3, 2017).\n"
        "Created as part of the new Ministerio de las Culturas, las Artes y el Patrimonio.\n"
        "The organization maintains institutional continuity from 1925 founding, but with expanded mandate and modern governance structure."
    ),
    "source_documentation": "https://www.leychile.cl/N?i=1110097",
}

# `name` and `city` locate the record; `q_number` is the Wikidata entity that
# gets attached (the predecessor organization's entry, per `notes`).
BATCH_10_ENRICHMENT = {
    "name": "Servicio Nacional del Patrimonio Cultural",
    "city": "Santiago",
    "q_number": "Q5784049",
    "wikidata_name": "National Monuments Council / Consejo de Monumentos Nacionales",
    "confidence": "high",
    "notes": "Wikidata Q5784049 refers to Consejo de Monumentos Nacionales (1925-2017). Organization was reformed and renamed in 2017 under Ley 21.045 but maintains institutional continuity.",
    "change_event": _SNPC_REFORM_EVENT,
}
def load_yaml(file_path: Path) -> list:
    """Read the UTF-8 encoded YAML file at ``file_path`` and return its parsed content.

    Parsing goes through ``yaml.safe_load``. The returned value is whatever the
    document's top-level node parses to; callers in this script treat it as a
    list of institution records.
    """
    with open(file_path, encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def save_yaml(data: list, file_path: Path) -> None:
    """Serialize ``data`` as YAML into ``file_path``, overwriting any existing file.

    The file is written as UTF-8 in block style, with non-ASCII characters kept
    as-is, keys left in their existing order, a line width of 120 and a
    two-space indent.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with open(file_path, 'w', encoding='utf-8') as stream:
        yaml.dump(data, stream, **dump_options)
def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the first institution record matching ``name`` and ``city``.

    A record matches when its ``name`` equals ``name`` exactly and the ``city``
    of its first ``locations`` entry equals ``city``. Records without a name,
    or with missing, empty or null ``locations``, are tolerated: the city is
    then treated as the empty string.

    Args:
        institutions: Institution records (dicts). ``None`` is treated as empty.
        name: Exact institution name to look for.
        city: Expected city of the institution's first location.

    Returns:
        The matching record itself (not a copy), so callers can mutate it.

    Raises:
        ValueError: If no record matches.
    """
    for inst in institutions or []:
        if inst.get('name') != name:
            continue

        # `or` fallbacks cover an empty list and YAML nulls, which the
        # `.get(..., default)` form alone would not.
        locations = inst.get('locations') or [{}]
        first_location = locations[0] or {}
        inst_city = first_location.get('city', '')

        # NOTE(review): when the requested city is "Santiago" the record's own
        # city is not checked at all, so a name match alone is enough. This
        # looks like a deliberate fallback for records without a city; confirm.
        if inst_city == city or city == "Santiago":
            return inst

    raise ValueError(f"Institution not found: {name} ({city})")
def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Add a Wikidata identifier and an organizational change event to ``inst``.

    If ``inst`` already carries an identifier whose scheme is ``'Wikidata'``,
    a warning is printed and the record is left untouched. Otherwise the
    record is updated in place:

    * a Wikidata entry is appended to ``inst['identifiers']``;
    * a copy of the change event is appended to ``inst['change_history']``;
    * ``inst['provenance']`` gets the enrichment method, a UTC timestamp, the
      match confidence and an extra entry in its ``notes`` list (an existing
      string note is converted to a one-element list first).

    Keys that are missing or hold ``None`` (e.g. an empty ``identifiers:`` key
    in the YAML) are initialised to an empty container.

    Args:
        inst: Institution record to modify in place.
        enrichment: Payload with ``q_number``, ``wikidata_name``,
            ``confidence``, ``notes`` and a ``change_event`` mapping holding
            ``event_id``, ``change_type``, ``event_date``,
            ``event_description`` and ``source_documentation``.

    Raises:
        KeyError: If ``enrichment`` lacks one of the keys above. All payload
            fields are read before ``inst`` is modified, so the record is not
            left half-updated.
    """
    # Check if already has Wikidata (`or []` also covers a null value).
    existing_ids = inst.get('identifiers') or []
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ {inst['name']} already has Wikidata identifier")
        return

    # Read everything from the payload up front so a malformed payload fails
    # before the record has been touched.
    q_number = enrichment['q_number']
    wikidata_name = enrichment['wikidata_name']
    confidence = enrichment['confidence']
    note_text = f"Batch 10: {enrichment['notes']}"
    change_event = enrichment['change_event']
    event_record = {
        'event_id': change_event['event_id'],
        'change_type': change_event['change_type'],
        'event_date': change_event['event_date'],
        'event_description': change_event['event_description'],
        'source_documentation': change_event['source_documentation'],
    }
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}",
    }

    # Add Wikidata identifier
    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)
    print(f" ✅ Added Wikidata: {q_number} ({wikidata_name})")

    # Add organizational change event
    if inst.get('change_history') is None:
        inst['change_history'] = []
    inst['change_history'].append(event_record)
    print(f" 📝 Added change event: {event_record['change_type']} ({event_record['event_date']})")

    # Update provenance
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    provenance['enrichment_method'] = 'Manual Wikidata linkage (Batch 10 - Official Institution)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = confidence

    # Add notes: normalise a missing/null value or a single string to a list.
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    notes.append(note_text)
    provenance['notes'] = notes

    print(" 💡 Note: Organization reformed from Consejo de Monumentos Nacionales (1925) to current name (2017)")
def main():
    """Run the Batch 10 enrichment end to end and print a coverage report.

    Steps:
      1. Load the institution list from the input YAML file.
      2. Write a backup copy of the loaded data next to the input file.
      3. Locate the Batch 10 institution and apply ``BATCH_10_ENRICHMENT``.
      4. Save the (possibly modified) list to the Batch 10 output file.
      5. Print overall and per-type Wikidata coverage statistics.

    Lookup or enrichment failures are reported on stdout and do not stop the
    run; the output file and the statistics are still produced.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 ENRICHMENT")
    print("Official Institution with Organizational Change Event")
    print("=" * 80)
    print()

    # Load data
    # NOTE(review): Batch 10 reads the Batch 8 output file - confirm that no
    # Batch 9 file is meant to be the input here.
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Create backup (a re-serialization of the loaded data, not a byte copy)
    backup_file = input_file.with_suffix('.yaml.batch10_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Apply enrichment
    print("🔧 Applying enrichment...")
    print()

    enrichment = BATCH_10_ENRICHMENT
    print(f"1. {enrichment['name']} ({enrichment['city']})")

    enriched_count = 0
    try:
        inst = find_institution(institutions, enrichment['name'], enrichment['city'])
        # enrich_institution() leaves records that already have a Wikidata
        # identifier untouched, so those must not be counted as enriched.
        already_linked = _has_wikidata_id(inst)
        enrich_institution(inst, enrichment)
    except ValueError as e:
        print(f" ❌ {e}")
    except Exception as e:
        # Best-effort script boundary: report the problem and still write
        # the output file and the statistics below.
        print(f" ❌ Error: {e}")
    else:
        if not already_linked:
            enriched_count = 1

    print()

    # Save enriched data
    output_file = Path('data/instances/chile/chilean_institutions_batch10_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()

    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()

    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if _has_wikidata_id(inst))
    coverage = _coverage_pct(with_wikidata, total)

    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({coverage:.1f}%)")
    print(f"Batch 10 enrichments: {enriched_count}")
    print()

    # By type
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        stats = by_type[inst.get('institution_type', 'UNKNOWN')]
        stats['total'] += 1
        if _has_wikidata_id(inst):
            stats['with_wd'] += 1

    print("Coverage by type:")
    for inst_type in sorted(by_type):
        stats = by_type[inst_type]
        pct = _coverage_pct(stats['with_wd'], stats['total'])
        status = "✅" if pct == 100 else "⭐" if pct >= 50 else ""
        print(f" {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()

    print("🎉 Batch 10 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({coverage:.1f}%)")
    print()
    print("📝 Key findings:")
    print(" - Other Batch 10 targets (foundations, cultural centers) not in Wikidata")
    print(" - Recommendation: Focus Batch 11 on remaining museums (13 institutions)")
    print(" - Potential to reach 70%+ coverage with museum-focused enrichment")


def _has_wikidata_id(inst: dict) -> bool:
    """Return True when ``inst`` lists an identifier whose scheme is 'Wikidata'."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers') or []
    )


def _coverage_pct(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``, or 0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0
# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()