- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
230 lines · 7.9 KiB · Python
#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 3 Wikidata Enrichment

Target: 5 more university departments
- Universidad del Bío-Bío (Chillán) → Q2661431
- Universidad de Talca (Talca) → Q3244354
- Universidad de la Frontera (Temuco) → Q3244350
- Universidad de Magallanes (Punta Arenas) → Q3244396
- Universidad de Playa Ancha (Valparaíso) → Q3244389

Strategy: Direct Q-number mapping with exact matching (100% accuracy in Batch 2)
Expected result: 6 → 11 institutions (12.2% coverage)
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# File paths
|
|
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch2_enriched.yaml")
|
|
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch3_enriched.yaml")
|
|
BACKUP_SUFFIX = ".batch3_backup"
|
|
|
|
# Batch 3 enrichment mappings (hardcoded Q-numbers)
|
|
BATCH3_MAPPINGS = {
|
|
"Universidad del Bío-Bío": {
|
|
"q_number": "Q2661431",
|
|
"city": "Chillán",
|
|
"region": "Diguillín",
|
|
"verification": "Chile's state university in Chillán, founded 1988"
|
|
},
|
|
"Universidad de Talca": {
|
|
"q_number": "Q3244354",
|
|
"city": "Talca",
|
|
"region": "Talca",
|
|
"verification": "State university in Talca, founded 1981"
|
|
},
|
|
"Universidad de la Frontera": {
|
|
"q_number": "Q3244350",
|
|
"city": "Temuco",
|
|
"region": "Cautín",
|
|
"verification": "State university in Temuco, founded 1981"
|
|
},
|
|
"Universidad de Magallanes": {
|
|
"q_number": "Q3244396",
|
|
"city": "Punta Arenas",
|
|
"region": "Magallanes",
|
|
"verification": "State university in Punta Arenas, founded 1961"
|
|
},
|
|
"Universidad de Playa Ancha": {
|
|
"q_number": "Q3244389",
|
|
"city": "Valparaíso",
|
|
"region": "Valparaíso",
|
|
"verification": "State university in Valparaíso, founded 1948"
|
|
}
|
|
}
|
|
|
|
|
|
def exact_match(institution_name: str, target_name: str, city: str, target_city: str) -> bool:
|
|
"""
|
|
Exact matching strategy (zero false positives).
|
|
|
|
Criteria:
|
|
1. Institution name contains target university name
|
|
2. City/region matches
|
|
3. Institution type is EDUCATION_PROVIDER
|
|
"""
|
|
name_lower = institution_name.lower()
|
|
target_lower = target_name.lower()
|
|
city_lower = city.lower() if city else ""
|
|
target_city_lower = target_city.lower()
|
|
|
|
# Check if target university name is in institution name
|
|
name_match = target_lower in name_lower
|
|
|
|
# Check if city matches (either in city or region field)
|
|
city_match = target_city_lower in city_lower
|
|
|
|
return name_match and city_match
|
|
|
|
|
|
def enrich_institutions():
|
|
"""Main enrichment function."""
|
|
|
|
print("=" * 80)
|
|
print("CHILEAN GLAM INSTITUTIONS - BATCH 3 WIKIDATA ENRICHMENT")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
# Load institutions
|
|
print(f"📖 Loading institutions from: {INPUT_FILE}")
|
|
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
|
|
institutions = yaml.safe_load(f)
|
|
|
|
total_institutions = len(institutions)
|
|
print(f" Loaded {total_institutions} institutions")
|
|
print()
|
|
|
|
# Count current Wikidata coverage
|
|
enriched_before = sum(1 for inst in institutions
|
|
if inst.get('identifiers')
|
|
and any(i.get('identifier_scheme') == 'Wikidata'
|
|
for i in inst['identifiers']))
|
|
|
|
print(f"📊 Current Wikidata coverage: {enriched_before}/{total_institutions} ({enriched_before/total_institutions*100:.1f}%)")
|
|
print()
|
|
|
|
# Create backup
|
|
backup_file = str(INPUT_FILE) + BACKUP_SUFFIX
|
|
print(f"💾 Creating backup: {backup_file}")
|
|
shutil.copy2(INPUT_FILE, backup_file)
|
|
print()
|
|
|
|
# Enrich institutions
|
|
print("🔍 Starting Batch 3 enrichment...")
|
|
print()
|
|
|
|
enriched_count = 0
|
|
skipped_count = 0
|
|
|
|
for institution in institutions:
|
|
name = institution.get('name', '')
|
|
institution_type = institution.get('institution_type', '')
|
|
|
|
# Only process EDUCATION_PROVIDER institutions
|
|
if institution_type != 'EDUCATION_PROVIDER':
|
|
continue
|
|
|
|
# Get location info
|
|
locations = institution.get('locations', [])
|
|
if not locations:
|
|
continue
|
|
|
|
location = locations[0]
|
|
city = location.get('city', '')
|
|
region = location.get('region', '')
|
|
city_or_region = city or region
|
|
|
|
# Check if already enriched
|
|
identifiers = institution.get('identifiers', [])
|
|
has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers)
|
|
|
|
if has_wikidata:
|
|
continue
|
|
|
|
# Try to match with Batch 3 mappings
|
|
matched = False
|
|
for target_name, mapping in BATCH3_MAPPINGS.items():
|
|
if exact_match(name, target_name, city_or_region, mapping['city']):
|
|
q_number = mapping['q_number']
|
|
|
|
print(f"✅ MATCH: {name}")
|
|
print(f" Location: {city_or_region}")
|
|
print(f" Q-number: {q_number}")
|
|
print(f" Verification: {mapping['verification']}")
|
|
|
|
# Add Wikidata identifier
|
|
wikidata_id = {
|
|
'identifier_scheme': 'Wikidata',
|
|
'identifier_value': q_number,
|
|
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
|
|
}
|
|
|
|
if not identifiers:
|
|
institution['identifiers'] = []
|
|
|
|
institution['identifiers'].append(wikidata_id)
|
|
|
|
# Update extraction method in provenance
|
|
if 'provenance' in institution:
|
|
current_method = institution['provenance'].get('extraction_method', '')
|
|
institution['provenance']['extraction_method'] = (
|
|
f"{current_method} + Wikidata enrichment "
|
|
f"(Batch 3, parent: {target_name}, exact match)"
|
|
)
|
|
|
|
enriched_count += 1
|
|
matched = True
|
|
print()
|
|
break
|
|
|
|
if not matched and institution_type == 'EDUCATION_PROVIDER':
|
|
skipped_count += 1
|
|
|
|
print("=" * 80)
|
|
print(f"📊 Batch 3 Enrichment Summary")
|
|
print("=" * 80)
|
|
print(f"✅ Enriched: {enriched_count} institutions")
|
|
print(f"⏭️ Skipped: {skipped_count} institutions (no match)")
|
|
print()
|
|
|
|
# Count final Wikidata coverage
|
|
enriched_after = sum(1 for inst in institutions
|
|
if inst.get('identifiers')
|
|
and any(i.get('identifier_scheme') == 'Wikidata'
|
|
for i in inst['identifiers']))
|
|
|
|
print(f"📈 New Wikidata coverage: {enriched_after}/{total_institutions} ({enriched_after/total_institutions*100:.1f}%)")
|
|
print(f" Improvement: +{enriched_after - enriched_before} institutions")
|
|
print()
|
|
|
|
# Save enriched dataset
|
|
print(f"💾 Saving enriched dataset to: {OUTPUT_FILE}")
|
|
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
|
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
|
|
|
|
print()
|
|
print("✅ Batch 3 enrichment complete!")
|
|
print()
|
|
print("📁 Files:")
|
|
print(f" Input: {INPUT_FILE}")
|
|
print(f" Output: {OUTPUT_FILE}")
|
|
print(f" Backup: {backup_file}")
|
|
print()
|
|
|
|
# Next steps
|
|
print("🎯 Next Steps:")
|
|
if enriched_after < 20:
|
|
remaining = 20 - enriched_after
|
|
print(f" - Need {remaining} more institutions to reach 22.2% coverage goal (20 institutions)")
|
|
print(f" - Consider Batch 4: Major Santiago museums or regional universities")
|
|
else:
|
|
print(f" - 🎉 GOAL ACHIEVED! 22.2% coverage reached ({enriched_after} institutions)")
|
|
print()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
enrich_institutions()
|