- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
276 lines
9.8 KiB
Python
276 lines
9.8 KiB
Python
#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)

FINAL VERSION with strict matching to avoid false positives.

Strategy:
- ONLY enrich institutions we can match with 100% certainty
- Use exact name matching + location verification
- Direct Q-number mapping (no SPARQL queries)

BATCH 2 FINAL TARGETS (4 institutions verified in dataset):
1. Universidad de Chile's Archivo Central Andrés Bello → Q219576
   Location: Santiago, Provincia de Santiago ✓

2. Universidad de Concepción's SIBUDEC → Q1163431
   Location: Concepción, Concepción ✓

3. Universidad Austral → Q1163558
   Location: Valdivia, Valdivia ✓

4. Universidad Católica (Temuco) → Q2900814
   Location: Maipo region, Temuco ✓
   (This is Universidad Católica de Temuco, NOT PUC Santiago)
"""
from pathlib import Path
from typing import Dict, Any

import yaml
# Hard-coded Batch 2 targets, each verified by hand against the dataset.
# Every record pairs the dataset's exact institution name/type/location
# with the Wikidata Q-number to attach, plus human-readable provenance notes.
BATCH_2_EXACT_MATCHES = [
    {
        'exact_name': "Universidad de Chile's Archivo Central Andrés Bello",
        'inst_type': 'ARCHIVE',
        'expected_city': 'Provincia de Santiago',
        'expected_region': 'Santiago',
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'notes': 'Central archive of Universidad de Chile, Chile\'s oldest university (founded 1842)'
    },
    {
        'exact_name': "Universidad de Concepción's SIBUDEC",
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Concepción',
        'expected_region': 'Concepción',
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'notes': 'Library system (SIBUDEC) of Universidad de Concepción, third oldest university in Chile (founded 1919)'
    },
    {
        'exact_name': 'Universidad Austral',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Valdivia',
        'expected_region': 'Valdivia',
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'notes': 'Universidad Austral de Chile in Valdivia, southern Chile (founded 1954)'
    },
    {
        # NOTE: deliberately matched to Temuco; this is NOT PUC Santiago.
        'exact_name': 'Universidad Católica',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Temuco',
        'expected_region': 'Maipo',
        'parent_university': 'Universidad Católica de Temuco',
        'wikidata_q': 'Q2900814',
        'notes': 'Universidad Católica de Temuco (founded 1991, previously sede of PUC Valparaíso)'
    }
]
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if this institution record already carries a Wikidata ID.

    Scans the record's 'identifiers' list (missing key == no identifiers)
    for an entry whose scheme is exactly 'Wikidata'.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def exact_match(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Return True only when *inst* matches *target* on name, type, and location.

    All three checks must hold (zero-false-positive policy):
      1. 'name' equals target['exact_name'] exactly.
      2. 'institution_type' equals target['inst_type'] exactly.
      3. The FIRST entry in 'locations' matches both expected city and region.
    """
    # Guard: wrong name → no match.
    if inst.get('name') != target['exact_name']:
        return False

    # Guard: wrong institution type → no match.
    if inst.get('institution_type') != target['inst_type']:
        return False

    # Guard: a record with no locations can never be verified.
    known_locations = inst.get('locations', [])
    if not known_locations:
        return False

    # Only the first location is consulted; BOTH fields must agree.
    primary = known_locations[0]
    return (primary.get('city', '') == target['expected_city']
            and primary.get('region', '') == target['expected_region'])
def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    notes: str
) -> Dict[str, Any]:
    """Attach a Wikidata identifier to *inst*, mutating it in place.

    Side effects on the record:
      - Appends a Wikidata identifier dict (creating 'identifiers' if absent).
      - Extends 'description' with the parent university and notes, unless
        the parent is already mentioned (keeps repeated runs idempotent-ish).
      - Appends an enrichment note to provenance 'extraction_method' when a
        'provenance' section exists.

    Returns the same (mutated) record for call-chaining convenience.
    """
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # Update description
    current_desc = inst['description'] if 'description' in inst and inst['description'] else None
    if current_desc is not None:
        # Skip the append when the parent university is already mentioned.
        if parent_university not in current_desc:
            inst['description'] = f"{current_desc} Part of {parent_university}. {notes}"
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"

    # Update provenance (only when the record tracks it)
    if 'provenance' in inst:
        provenance = inst['provenance']
        enrichment_note = f" + Wikidata enrichment (Batch 2, parent: {parent_university}, exact match)"
        provenance['extraction_method'] = f"{provenance.get('extraction_method', '')}{enrichment_note}"

    return inst
def main():
    """Run the Batch 2 Wikidata enrichment over the batch-1-enriched dataset.

    Workflow:
      1. Load the batch-1 YAML institution list.
      2. Write a backup copy before touching anything.
      3. For each hard-coded Batch 2 target, find an exact match
         (name + type + city + region) and attach its Wikidata Q-number.
      4. Report coverage statistics and, if anything was enriched,
         save the result to a new batch-2 output file.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)")
    print("Exact Matching Only - Zero False Positives")
    print("Session: November 9, 2025")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file.name}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # FIX: yaml.safe_load returns None for a blank file, and an empty list
    # would make the coverage percentages below raise ZeroDivisionError
    # (or TypeError on len(None)). Bail out early with a clear message.
    if not institutions:
        print("⚠️ Input file is empty - nothing to enrich")
        return

    print(f" Total institutions: {len(institutions)}")

    # Check existing Wikidata coverage
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()

    # Create backup before any mutation
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Batch 2 Enrichment (Exact Matching):")
    print("-" * 80)

    # Process each target
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for i, target in enumerate(BATCH_2_EXACT_MATCHES, 1):
        print(f"\n[{i}/{len(BATCH_2_EXACT_MATCHES)}] 🎓 {target['exact_name']}")
        print(f" Parent: {target['parent_university']}")
        print(f" Wikidata: {target['wikidata_q']}")
        print(f" Expected location: {target['expected_city']}, {target['expected_region']}")

        # Find exact match (first hit wins; dataset names are unique)
        matched = None
        for inst in institutions:
            if exact_match(inst, target):
                matched = inst
                break

        if not matched:
            print(" ❌ NOT FOUND (exact match failed)")
            not_found_count += 1
            continue

        print(" ✅ EXACT MATCH CONFIRMED")

        # Skip records that were already enriched in an earlier batch
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier (mutates the matched record in place)
        print(f" ➕ Adding Wikidata: {target['wikidata_q']} ({target['parent_university']})")

        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            target['notes']
        )
        enriched_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))

    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")

    if with_wikidata_after > 0:
        coverage_pct = with_wikidata_after / len(institutions) * 100
        print(f" Progress toward 22% goal: {coverage_pct:.1f}% / 22.0%")

    # Save only when something actually changed
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS - Batch 3 Options:")
        print()
        print("Option A: Major Museums (5 institutions):")
        print(" - Museo Nacional de Historia Natural (Santiago)")
        print(" - Museo de Arte Precolombino (Santiago)")
        print(" - Museo Histórico Nacional (Santiago)")
        print(" - Museo de Bellas Artes (Santiago)")
        print(" - Museo Regional de Ancud (Chiloé)")
        print()
        print("Option B: More University Departments (5 institutions):")
        print(" - Universidad del Bío-Bío's [department]")
        print(" - Universidad de Talca's Centro [department]")
        print(" - Universidad de la Frontera [department]")
        print(" - Universidad de Magallanes [department]")
        print(" - Universidad de Playa Ancha's [department]")
        print()
        print("Recommendation: Try Option B first (universities have better Wikidata coverage)")
    else:
        print()
        print("⚠️ No enrichments - all targets already enriched or not found")


if __name__ == '__main__':
    main()