glam/scripts/enrich_chilean_batch2_corrected.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

276 lines
9.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)
FINAL VERSION with strict matching to avoid false positives.
Strategy:
- ONLY enrich institutions we can match with 100% certainty
- Use exact name matching + location verification
- Direct Q-number mapping (no SPARQL queries)
BATCH 2 FINAL TARGETS (4 institutions verified in dataset):
1. Universidad de Chile's Archivo Central Andrés Bello → Q219576
Location: Santiago, Provincia de Santiago ✓
2. Universidad de Concepción's SIBUDEC → Q1163431
Location: Concepción, Concepción ✓
3. Universidad Austral → Q1163558
Location: Valdivia, Valdivia ✓
4. Universidad Católica (Temuco) → Q2900814
Location: Maipo region, Temuco ✓
(This is Universidad Católica de Temuco, NOT PUC Santiago)
"""
import yaml
from pathlib import Path
from typing import Dict, Any
# EXACT institution matches (verified in dataset)
# Hard-coded enrichment targets: each entry pins one dataset institution to a
# known Wikidata Q-number. Matching is exact on name, type, city AND region
# (see exact_match below), so no fuzzy lookup can introduce a false positive.
BATCH_2_EXACT_MATCHES = [
    {
        # Must match the dataset's 'name' field character-for-character.
        'exact_name': "Universidad de Chile's Archivo Central Andrés Bello",
        'inst_type': 'ARCHIVE',
        # Expected city/region as stored in the dataset's first location entry.
        'expected_city': 'Provincia de Santiago',
        'expected_region': 'Santiago',
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'notes': 'Central archive of Universidad de Chile, Chile\'s oldest university (founded 1842)'
    },
    {
        'exact_name': "Universidad de Concepción's SIBUDEC",
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Concepción',
        'expected_region': 'Concepción',
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'notes': 'Library system (SIBUDEC) of Universidad de Concepción, third oldest university in Chile (founded 1919)'
    },
    {
        'exact_name': 'Universidad Austral',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Valdivia',
        'expected_region': 'Valdivia',
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'notes': 'Universidad Austral de Chile in Valdivia, southern Chile (founded 1954)'
    },
    {
        # NOTE(review): expected_region 'Maipo' for Temuco mirrors the dataset
        # as documented in the module docstring — confirm against the YAML if
        # this target fails to match.
        'exact_name': 'Universidad Católica',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Temuco',
        'expected_region': 'Maipo',
        'parent_university': 'Universidad Católica de Temuco',
        'wikidata_q': 'Q2900814',
        'notes': 'Universidad Católica de Temuco (founded 1991, previously sede of PUC Valparaíso)'
    }
]
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True when *inst* already carries a Wikidata identifier.

    Scans the institution's 'identifiers' list (treated as empty when
    absent) for an entry whose scheme is exactly 'Wikidata'.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def exact_match(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Verify institution exactly matches target (name + type + location).

    All three criteria must hold: exact name equality, exact institution
    type equality, and the institution's FIRST location matching both the
    expected city and the expected region. Missing locations fail the match.
    """
    # Guard: name and type must both be exact — no normalization, no fuzz.
    if inst.get('name') != target['exact_name']:
        return False
    if inst.get('institution_type') != target['inst_type']:
        return False
    # Only the first recorded location is consulted.
    loc_list = inst.get('locations', [])
    if not loc_list:
        return False
    primary = loc_list[0]
    return (primary.get('city', '') == target['expected_city']
            and primary.get('region', '') == target['expected_region'])
def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    notes: str
) -> Dict[str, Any]:
    """Add Wikidata identifier with provenance tracking.

    Mutates *inst* in place and also returns it: appends a Wikidata
    identifier record, extends the description with parent/notes text
    (unless the parent is already mentioned), and appends an enrichment
    marker to the provenance extraction_method when provenance exists.
    """
    # Append the identifier record, creating the list on first use.
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })
    # Update description: append when the parent isn't already mentioned;
    # replace entirely when the description is missing or empty.
    existing_desc = inst['description'] if 'description' in inst and inst['description'] else None
    if existing_desc is not None:
        if parent_university not in existing_desc:
            inst['description'] = f"{existing_desc} Part of {parent_university}. {notes}"
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"
    # Update provenance: record that this batch touched the record.
    if 'provenance' in inst:
        prov = inst['provenance']
        prov['extraction_method'] = (
            prov.get('extraction_method', '')
            + f" + Wikidata enrichment (Batch 2, parent: {parent_university}, exact match)"
        )
    return inst
def main():
    """Run the Batch 2 enrichment end to end.

    Loads the Batch 1 YAML, writes a backup, applies exact-match Wikidata
    enrichment for the BATCH_2_EXACT_MATCHES targets, prints coverage
    statistics, and saves the enriched data to a new sibling file only
    when at least one institution was actually enriched.
    """
    # Paths resolve relative to this script; input is the Batch 1 output and
    # Batch 2 writes a new file, leaving the input untouched.
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    # NOTE(review): with_suffix replaces '.yaml', so the backup filename ends
    # in '.batch2_backup' with no yaml extension — confirm that's intended.
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')
    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)")
    print("Exact Matching Only - Zero False Positives")
    print("Session: November 9, 2025")
    print("=" * 80)
    print()
    # Load data (expects a YAML list of institution dicts)
    print(f"📂 Loading: {data_file.name}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f" Total institutions: {len(institutions)}")
    # Check existing Wikidata coverage
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()
    # Create backup before any mutation
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()
    print("Batch 2 Enrichment (Exact Matching):")
    print("-" * 80)
    # Process each target; counters feed the summary below.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    for i, target in enumerate(BATCH_2_EXACT_MATCHES, 1):
        print(f"\n[{i}/{len(BATCH_2_EXACT_MATCHES)}] 🎓 {target['exact_name']}")
        print(f" Parent: {target['parent_university']}")
        print(f" Wikidata: {target['wikidata_q']}")
        print(f" Expected location: {target['expected_city']}, {target['expected_region']}")
        # Find exact match (first match wins; exact_match requires
        # name + type + city + region to all agree)
        matched = None
        for inst in institutions:
            if exact_match(inst, target):
                matched = inst
                break
        if not matched:
            print(f" ❌ NOT FOUND (exact match failed)")
            not_found_count += 1
            continue
        print(f" ✅ EXACT MATCH CONFIRMED")
        # Check if already has Wikidata — never overwrite an existing Q-number
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue
        # Add Wikidata identifier (mutates the matched dict in place, so
        # the change is reflected in `institutions` when saved)
        print(f" Adding Wikidata: {target['wikidata_q']} ({target['parent_university']})")
        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            target['notes']
        )
        enriched_count += 1
    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")
    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")
    if with_wikidata_after > 0:
        coverage_pct = with_wikidata_after / len(institutions) * 100
        print(f" Progress toward 22% goal: {coverage_pct:.1f}% / 22.0%")
    # Save if any enrichments (otherwise no output file is written at all)
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS - Batch 3 Options:")
        print()
        print("Option A: Major Museums (5 institutions):")
        print(" - Museo Nacional de Historia Natural (Santiago)")
        print(" - Museo de Arte Precolombino (Santiago)")
        print(" - Museo Histórico Nacional (Santiago)")
        print(" - Museo de Bellas Artes (Santiago)")
        print(" - Museo Regional de Ancud (Chiloé)")
        print()
        print("Option B: More University Departments (5 institutions):")
        print(" - Universidad del Bío-Bío's [department]")
        print(" - Universidad de Talca's Centro [department]")
        print(" - Universidad de la Frontera [department]")
        print(" - Universidad de Magallanes [department]")
        print(" - Universidad de Playa Ancha's [department]")
        print()
        print("Recommendation: Try Option B first (universities have better Wikidata coverage)")
    else:
        print()
        print("⚠️ No enrichments - all targets already enriched or not found")
# Script entry point: run enrichment only when executed directly, not on import.
if __name__ == '__main__':
    main()