glam/scripts/enrich_chilean_batch3.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

230 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 3 Wikidata Enrichment
Target: 5 more university departments
- Universidad del Bío-Bío (Chillán) → Q2661431
- Universidad de Talca (Talca) → Q3244354
- Universidad de la Frontera (Temuco) → Q3244350
- Universidad de Magallanes (Punta Arenas) → Q3244396
- Universidad de Playa Ancha (Valparaíso) → Q3244389
Strategy: Direct Q-number mapping with exact matching (100% accuracy in Batch 2)
Expected result: 6 → 11 institutions (12.2% coverage)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
import shutil
# File paths
# Input: the dataset produced by the Batch 2 enrichment run.
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch2_enriched.yaml")
# Output: written as a new file; the input is never modified in place.
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch3_enriched.yaml")
# Suffix appended to INPUT_FILE's path for the pre-run backup copy.
BACKUP_SUFFIX = ".batch3_backup"
# Batch 3 enrichment mappings (hardcoded Q-numbers)
# Keyed by canonical university name; exact_match() tests each key as a
# (case-insensitive) substring of an institution's name. Per-entry fields:
#   q_number:     Wikidata item ID attached to the record on a match
#   city:         expected city, matched against the record's city/region
#   region:       informational only - not read by the matching code
#   verification: human-readable note backing the manual Q-number lookup
BATCH3_MAPPINGS = {
"Universidad del Bío-Bío": {
"q_number": "Q2661431",
"city": "Chillán",
"region": "Diguillín",
"verification": "Chile's state university in Chillán, founded 1988"
},
"Universidad de Talca": {
"q_number": "Q3244354",
"city": "Talca",
"region": "Talca",
"verification": "State university in Talca, founded 1981"
},
"Universidad de la Frontera": {
"q_number": "Q3244350",
"city": "Temuco",
"region": "Cautín",
"verification": "State university in Temuco, founded 1981"
},
"Universidad de Magallanes": {
"q_number": "Q3244396",
"city": "Punta Arenas",
"region": "Magallanes",
"verification": "State university in Punta Arenas, founded 1961"
},
"Universidad de Playa Ancha": {
"q_number": "Q3244389",
"city": "Valparaíso",
"region": "Valparaíso",
"verification": "State university in Valparaíso, founded 1948"
}
}
def exact_match(institution_name: str, target_name: str, city: str, target_city: str) -> bool:
    """
    Exact matching strategy (zero false positives).

    A candidate matches when BOTH hold (case-insensitive substring tests):
      1. The target university name appears within the institution name.
      2. The target city appears within the institution's city/region value.

    NOTE: restricting matches to EDUCATION_PROVIDER institutions is the
    caller's responsibility - this function does not inspect institution
    type (the old docstring incorrectly listed it as a criterion here).

    Args:
        institution_name: Name of the institution record being checked.
        target_name: Canonical university name from BATCH3_MAPPINGS.
        city: City or region recorded for the institution (may be empty or None).
        target_city: Expected city from the mapping.

    Returns:
        True only when both the name and the city criteria are satisfied.
    """
    name_lower = institution_name.lower()
    # Guard: an empty/missing city can never satisfy the city criterion.
    city_lower = city.lower() if city else ""
    # Substring containment lets department-level records such as
    # "Biblioteca, Universidad de Talca" match their parent university.
    name_match = target_name.lower() in name_lower
    city_match = target_city.lower() in city_lower
    return name_match and city_match
def _has_wikidata(institution: dict) -> bool:
    """Return True if the record already carries a Wikidata identifier."""
    # `or []` also tolerates an explicit `identifiers: null` in the YAML,
    # which would otherwise raise TypeError when iterated.
    return any(i.get('identifier_scheme') == 'Wikidata'
               for i in institution.get('identifiers') or [])


def _wikidata_coverage(institutions: list) -> int:
    """Count institution records that carry a Wikidata identifier."""
    return sum(1 for inst in institutions if _has_wikidata(inst))


def enrich_institutions():
    """Run the Batch 3 enrichment end to end.

    Steps: load INPUT_FILE, back it up, match EDUCATION_PROVIDER records
    against BATCH3_MAPPINGS via exact_match(), attach Wikidata identifiers
    and provenance notes, save the result to OUTPUT_FILE and print a
    summary report.

    Side effects:
        - Reads INPUT_FILE (YAML list of institution dicts).
        - Writes a backup copy of INPUT_FILE (BACKUP_SUFFIX appended).
        - Writes the enriched dataset to OUTPUT_FILE.
        - Prints progress and statistics to stdout.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 3 WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()
    # Load institutions
    print(f"📖 Loading institutions from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # safe_load returns None for an empty file; normalize so len() works
    # and the coverage percentages below cannot divide by zero.
    institutions = institutions or []
    total_institutions = len(institutions)
    print(f" Loaded {total_institutions} institutions")
    print()
    if not institutions:
        print("⚠️ Input file is empty - nothing to enrich.")
        return
    # Count current Wikidata coverage
    enriched_before = _wikidata_coverage(institutions)
    print(f"📊 Current Wikidata coverage: {enriched_before}/{total_institutions} ({enriched_before/total_institutions*100:.1f}%)")
    print()
    # Create backup
    backup_file = str(INPUT_FILE) + BACKUP_SUFFIX
    print(f"💾 Creating backup: {backup_file}")
    shutil.copy2(INPUT_FILE, backup_file)
    print()
    # Enrich institutions
    print("🔍 Starting Batch 3 enrichment...")
    print()
    enriched_count = 0
    skipped_count = 0
    for institution in institutions:
        name = institution.get('name', '')
        # Only process EDUCATION_PROVIDER institutions
        if institution.get('institution_type', '') != 'EDUCATION_PROVIDER':
            continue
        # Need at least one location to match on; silently skip otherwise.
        locations = institution.get('locations', [])
        if not locations:
            continue
        location = locations[0]
        # Prefer the city, fall back to the region for matching.
        city_or_region = location.get('city', '') or location.get('region', '')
        # Skip records already enriched in earlier batches.
        if _has_wikidata(institution):
            continue
        # Try to match with Batch 3 mappings
        matched = False
        for target_name, mapping in BATCH3_MAPPINGS.items():
            if exact_match(name, target_name, city_or_region, mapping['city']):
                q_number = mapping['q_number']
                print(f"✅ MATCH: {name}")
                print(f" Location: {city_or_region}")
                print(f" Q-number: {q_number}")
                print(f" Verification: {mapping['verification']}")
                # Add Wikidata identifier
                wikidata_id = {
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': q_number,
                    'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
                }
                # Handles a missing key AND an explicit `identifiers: null`.
                institution['identifiers'] = institution.get('identifiers') or []
                institution['identifiers'].append(wikidata_id)
                # Update extraction method in provenance
                if 'provenance' in institution:
                    current_method = institution['provenance'].get('extraction_method', '')
                    institution['provenance']['extraction_method'] = (
                        f"{current_method} + Wikidata enrichment "
                        f"(Batch 3, parent: {target_name}, exact match)"
                    )
                enriched_count += 1
                matched = True
                print()
                break
        if not matched:
            # Only un-enriched EDUCATION_PROVIDER records with a location
            # reach this point, so no extra type check is needed here.
            skipped_count += 1
    print("=" * 80)
    print(f"📊 Batch 3 Enrichment Summary")
    print("=" * 80)
    print(f"✅ Enriched: {enriched_count} institutions")
    print(f"⏭️ Skipped: {skipped_count} institutions (no match)")
    print()
    # Count final Wikidata coverage
    enriched_after = _wikidata_coverage(institutions)
    print(f"📈 New Wikidata coverage: {enriched_after}/{total_institutions} ({enriched_after/total_institutions*100:.1f}%)")
    print(f" Improvement: +{enriched_after - enriched_before} institutions")
    print()
    # Save enriched dataset
    print(f"💾 Saving enriched dataset to: {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()
    print("✅ Batch 3 enrichment complete!")
    print()
    print("📁 Files:")
    print(f" Input: {INPUT_FILE}")
    print(f" Output: {OUTPUT_FILE}")
    print(f" Backup: {backup_file}")
    print()
    # Next steps
    print("🎯 Next Steps:")
    if enriched_after < 20:
        remaining = 20 - enriched_after
        print(f" - Need {remaining} more institutions to reach 22.2% coverage goal (20 institutions)")
        print(f" - Consider Batch 4: Major Santiago museums or regional universities")
    else:
        print(f" - 🎉 GOAL ACHIEVED! 22.2% coverage reached ({enriched_after} institutions)")
    print()
# Script entry point: run the full Batch 3 enrichment pipeline.
if __name__ == "__main__":
    enrich_institutions()