glam/scripts/enrich_chilean_batch7.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

459 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 7 Wikidata Enrichment
Uses bulk SPARQL matches from query_wikidata_chilean_museums.py
32 museums with verified Q-numbers from Wikidata Query Service
Target: 52/90 institutions (57.8% coverage)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 7: 32 museums from SPARQL bulk query (all regions)
#
# Each entry is a dict with the following keys:
#   name          - institution name; main() matches it by exact string
#                   equality against the 'name' of the loaded records
#   city          - only used in the "NOT FOUND" report line in this file
#   q_number      - Wikidata Q-identifier, stored as the identifier value and
#                   used as the suffix of the identifier URL
#   wikidata_name - label on the Wikidata side; not read by any code visible
#                   in this file (kept for human review)
#   confidence    - "exact" or "partial"; copied verbatim into
#                   provenance['enrichment_confidence']
#   notes         - free text appended to provenance['notes'] with a
#                   "Batch 7: " prefix
#
# The "(n)" after each region heading is the number of entries listed under
# that heading; the headings sum to the 32 entries in this list.
BATCH_7_ENRICHMENTS = [
# ARICA Y PARINACOTA (1)
{
"name": "Museo Universidad de Tarapacá San Miguel de Azapa (MASMA)",
"city": "Arica",
"q_number": "Q9046776",
"wikidata_name": "Museo Arqueológico y Antropológico de San Miguel de Azapa",
"confidence": "partial", # Name variation but city + type match
"notes": "SPARQL match - partial name (includes full institutional title)",
},
# ANTOFAGASTA (2)
{
"name": "Museo de Historia Natural y Cultural del Desierto de Atacama (MUHNCAL)",
"city": "Calama",
"q_number": "Q86276638",
"wikidata_name": "Museo de Historia Natural y Cultural del Desierto de Atacama",
# NOTE(review): confidence is "partial" (local name carries the "(MUHNCAL)"
# acronym) while the notes text says "exact name match" - confirm which is intended.
"confidence": "partial",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Indígena Atacameño",
"city": "Calama",
"q_number": "Q86276595",
"wikidata_name": "Museo Indígena Atacameño de Arqueología y Etnografía",
"confidence": "partial",
"notes": "SPARQL match - partial name (full title in Wikidata)",
},
# ATACAMA (1)
{
"name": "Museo Mineralógico Universidad de Atacama",
"city": "Copiapó",
"q_number": "Q28501803",
"wikidata_name": "Museo Mineralógico de la Universidad de Atacama",
"confidence": "partial",
"notes": "SPARQL match - partial name match",
},
# VALPARAÍSO (5)
{
"name": "Museo de Historia Natural",
"city": "Valparaíso",
"q_number": "Q19950374",
"wikidata_name": "Museo de Historia Natural de Valparaíso",
"confidence": "partial",
"notes": "SPARQL match - partial name (city-specific), founded 1878",
},
{
"name": "Casa Museo La Sebastiana",
"city": "Valparaíso",
"q_number": "Q86278008",
"wikidata_name": "Casa Museo La Sebastiana",
"confidence": "exact",
"notes": "SPARQL match - exact name match (Pablo Neruda house)",
},
{
"name": "Casa Museo Isla Negra",
"city": "El Quisco",
"q_number": "Q86277516",
"wikidata_name": "Casa Museo Isla Negra",
"confidence": "exact",
"notes": "SPARQL match - exact name match (Pablo Neruda house)",
},
{
"name": "Museo Antropológico Padre Sebastián Englert (MAPSE)",
"city": "Isla de Pascua",
"q_number": "Q5437650",
"wikidata_name": "Museo Antropológico Padre Sebastián Englert",
"confidence": "partial",
"notes": "SPARQL match - partial name (excludes acronym), founded 1973",
},
{
"name": "Museo Arqueológico",
"city": "Los Andes",
"q_number": "Q86277234",
"wikidata_name": "Museo Arqueológico de Los Andes",
"confidence": "partial",
"notes": "SPARQL match - partial name (city-specific), founded 1905",
},
# REGIÓN METROPOLITANA (3)
# NOTE(review): San Felipe and La Ligua look like Valparaíso Region cities
# rather than Región Metropolitana - confirm the grouping (headings are
# informational only; no code reads them).
{
"name": "Museo Histórico",
"city": "San Felipe",
"q_number": "Q86277658",
"wikidata_name": "Museo Histórico de San Felipe",
"confidence": "partial",
"notes": "SPARQL match - partial name (city-specific)",
},
{
"name": "Museo de La Ligua",
"city": "La Ligua",
"q_number": "Q6034082",
"wikidata_name": "Museo de La Ligua",
"confidence": "exact",
"notes": "SPARQL match - exact name match, founded 1985",
},
{
"name": "Museo de Talagante",
"city": "Talagante",
"q_number": "Q86280216",
"wikidata_name": "Museo de Talagante",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
# O'HIGGINS (2)
{
"name": "Museo Histórico de Pichilemu",
"city": "Pichilemu",
"q_number": "Q112044338",
"wikidata_name": "Museo Histórico de Pichilemu",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Lircunlauta",
"city": "San Fernando",
"q_number": "Q86280637",
"wikidata_name": "Museo Lircunlauta",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
# MAULE (2)
{
"name": "Museo de Arte y Artesanía",
"city": "Linares",
"q_number": "Q6033923",
"wikidata_name": "Museo Arte y Artesanía de Linares",
"confidence": "partial",
"notes": "SPARQL match - partial name match, founded 1962",
},
{
"name": "Museo Histórico de Yerbas Buenas",
"city": "Yerbas Buenas",
"q_number": "Q20022173",
"wikidata_name": "Museo Histórico de Yerbas Buenas",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
# ÑUBLE (3)
{
"name": "Museo Marta Colvin",
"city": "Chillán",
"q_number": "Q112044588",
"wikidata_name": "Museo Marta Colvin",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Municipal de Ciencias Naturales",
"city": "Chillán",
"q_number": "Q112044585",
"wikidata_name": "Museo Municipal de Ciencias Naturales y Arqueológico Profesor Pedro Ramírez Fuentes",
"confidence": "partial",
"notes": "SPARQL match - partial name (full title in Wikidata)",
},
{
"name": "Itata Museo Antropológico",
"city": "Quirihue",
"q_number": "Q112044584",
"wikidata_name": "Itata Museo Antropológico",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
# BIOBÍO (1)
{
"name": "Museo Mapuche de Cañete",
"city": "Cañete",
"q_number": "Q16609804",
"wikidata_name": "Museo Mapuche de Cañete",
"confidence": "exact",
"notes": "SPARQL match - exact name match, founded 1977",
},
# LOS RÍOS (4)
{
"name": "Museo Histórico y Antropológico",
"city": "Valdivia",
"q_number": "Q6940480",
"wikidata_name": "Museo Histórico y Antropológico de Valdivia Mauricio Van de Maele",
"confidence": "partial",
"notes": "SPARQL match - partial name (full title in Wikidata), founded 1994",
},
{
"name": "Museo de la Catedral",
"city": "Valdivia",
"q_number": "Q86283115",
"wikidata_name": "Museo de la Catedral de Valdivia",
"confidence": "partial",
"notes": "SPARQL match - partial name (city-specific)",
},
{
"name": "Museo de sitio Castillo de Niebla",
"city": "Valdivia",
"q_number": "Q20022172",
"wikidata_name": "Museo de Sitio Castillo de Niebla",
"confidence": "exact",
"notes": "SPARQL match - exact name match (case variation)",
},
{
"name": "Museo Tringlo",
"city": "Lago Ranco",
"q_number": "Q86282868",
"wikidata_name": "Museo Tringlo",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
# LOS LAGOS (3)
{
"name": "Museo Colonial Alemán de Frutillar",
"city": "Frutillar",
"q_number": "Q20010979",
"wikidata_name": "Museo Colonial Alemán de Frutillar",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Antonio Felmer",
"city": "Puerto Varas",
"q_number": "Q20022171",
"wikidata_name": "Museo Antonio Felmer",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo y Archivo Histórico Municipal",
"city": "Osorno",
"q_number": "Q16609772",
"wikidata_name": "Museo y Archivo Histórico Municipal de Osorno",
"confidence": "partial",
"notes": "SPARQL match - partial name (city-specific)",
},
# AYSÉN (3)
# NOTE(review): Chaitén looks like a Los Lagos Region city rather than
# Aysén - confirm the grouping (informational only).
{
"name": "Museo de Sitio de Chaitén",
"city": "Chaitén",
"q_number": "Q112044386",
"wikidata_name": "Museo de Sitio de Chaitén",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Municipal de Cochrane",
"city": "Cochrane",
"q_number": "Q86284188",
"wikidata_name": "Museo Municipal de Cochrane",
"confidence": "exact",
"notes": "SPARQL match - exact name match",
},
{
"name": "Museo Pioneros del Baker",
"city": "Cochrane",
"q_number": "Q86284160",
"wikidata_name": "Museo Rural Pioneros del Baker",
"confidence": "partial",
"notes": "SPARQL match - partial name (includes 'Rural')",
},
# MAGALLANES (2)
{
"name": "Museo Salesiano",
"city": "Punta Arenas",
"q_number": "Q86284641",
"wikidata_name": "Museo Salesiano Maggiorino Borgatello",
"confidence": "partial",
"notes": "SPARQL match - partial name (full title in Wikidata)",
},
{
"name": "Museo Municipal Fernando Cordero Rusque",
"city": "Porvenir",
"q_number": "Q83551041",
"wikidata_name": "Museo Municipal Fernando Cordero Rusque",
"confidence": "exact",
"notes": "SPARQL match - exact name match, founded 1980",
},
]
def load_institutions(yaml_path: Path) -> list:
    """Read the institution records stored in a YAML file.

    Args:
        yaml_path: Location of the YAML document to parse (opened as UTF-8).

    Returns:
        The parsed top-level value when it is a list; otherwise an empty
        list (for example when the document is empty or is a mapping).
    """
    with open(yaml_path, mode='r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)

    # Anything other than a top-level sequence is treated as "no records".
    if not isinstance(parsed, list):
        return []
    return parsed
def enrich_institution(institution: dict, enrichment: dict) -> dict:
    """Attach a Wikidata identifier and provenance metadata to an institution.

    The record is modified in place and also returned. If it already lists an
    identifier whose ``identifier_scheme`` is ``'Wikidata'``, a warning is
    printed and the record is returned untouched.

    Args:
        institution: Institution record. ``identifiers`` and ``provenance``
            may be missing or ``None`` (an empty YAML value); both are
            created on demand.
        enrichment: Enrichment entry providing ``q_number`` and
            ``confidence`` (required) and ``notes`` (optional).

    Returns:
        The same ``institution`` object that was passed in.

    Raises:
        KeyError: If ``enrichment`` lacks ``q_number`` or ``confidence``.
    """
    # Resolve everything that can fail up front so a bad input never leaves
    # the record half-updated.
    name = institution.get('name', '<unnamed>')
    q_number = enrichment['q_number']
    confidence = enrichment['confidence']

    # A YAML key with an empty value loads as None, so treat "missing" and
    # "None" alike and install a real list on the record.
    identifiers = institution.get('identifiers')
    if identifiers is None:
        identifiers = institution['identifiers'] = []

    # Never overwrite or duplicate an existing Wikidata identifier.
    existing = next(
        (i for i in identifiers if i.get('identifier_scheme') == 'Wikidata'),
        None,
    )
    if existing is not None:
        print(f"⚠️ {name} already has Wikidata: {existing.get('identifier_value')}")
        return institution

    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}",
    })

    # Record how and when the identifier was added.
    provenance = institution.get('provenance')
    if provenance is None:
        provenance = institution['provenance'] = {}
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 7
    provenance['enrichment_method'] = 'SPARQL_BULK_QUERY'
    provenance['enrichment_confidence'] = confidence
    provenance['wikidata_verified'] = True

    note = enrichment.get('notes')
    if note:
        notes = provenance.get('notes')
        if notes is None:
            notes = []
        elif isinstance(notes, str):
            # Older records store a single note as a plain string; promote
            # it to a list so notes from several batches can accumulate.
            notes = [notes]
        notes.append(f"Batch 7: {note}")
        provenance['notes'] = notes

    # Separator added: name and Q-number were previously printed fused.
    print(f"✅ Enriched: {name} → {q_number}")
    return institution
def _has_wikidata(institution: dict) -> bool:
    """Return True if the record lists an identifier with scheme 'Wikidata'."""
    # `or []` covers both a missing key and an explicit null in the YAML.
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in institution.get('identifiers') or []
    )


def main():
    """Run the batch 7 enrichment: load, match by name, enrich, report, save.

    Loads the batch 6 YAML, applies each entry of ``BATCH_7_ENRICHMENTS``
    whose ``name`` equals the ``name`` of exactly one loaded institution,
    prints a summary, and writes all institutions to the batch 7 YAML. Both
    paths are relative to the current working directory.

    Raises:
        SystemExit: If no institutions were loaded; nothing is written in
            that case.
    """
    # File paths
    input_file = Path("data/instances/chile/chilean_institutions_batch6_enriched.yaml")
    output_file = Path("data/instances/chile/chilean_institutions_batch7_enriched.yaml")

    print("=" * 80)
    print("CHILEAN INSTITUTIONS - BATCH 7 WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()
    print(f"📂 Input: {input_file}")
    print(f"📝 Output: {output_file}")
    print(f"🎯 Target: Add {len(BATCH_7_ENRICHMENTS)} Wikidata Q-numbers (SPARQL bulk)")
    print()

    # Load institutions
    institutions = load_institutions(input_file)
    print(f"📖 Loaded {len(institutions)} institutions")
    print()

    # Fail fast with a clear message: an empty load would otherwise end in a
    # ZeroDivisionError in the coverage computation below, and there is
    # nothing meaningful to write to the output file.
    if not institutions:
        raise SystemExit(f"❌ No institutions loaded from {input_file}; nothing to enrich.")

    # Track statistics
    enriched_count = 0
    already_enriched_count = 0
    not_found_count = 0
    ambiguous_count = 0

    # Process each enrichment
    for enrichment in BATCH_7_ENRICHMENTS:
        # Matching is by exact name only; 'city' is not compared.
        matches = [
            inst for inst in institutions
            if inst.get('name') == enrichment['name']
        ]

        if not matches:
            print(f"❌ NOT FOUND: {enrichment['name']} ({enrichment['city']})")
            not_found_count += 1
            continue

        if len(matches) > 1:
            # Refuse to guess which record is meant, but count the skip so
            # the summary accounts for every enrichment entry.
            print(f"⚠️ MULTIPLE MATCHES: {enrichment['name']}")
            ambiguous_count += 1
            continue

        institution = matches[0]

        # Check if already enriched
        if _has_wikidata(institution):
            already_enriched_count += 1
            continue

        # Enrich institution
        enrich_institution(institution, enrichment)
        enriched_count += 1

    print()
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {enriched_count}")
    print(f"⚠️ Already enriched: {already_enriched_count}")
    print(f"❌ Not found: {not_found_count}")
    print(f"⚠️ Ambiguous (skipped): {ambiguous_count}")
    print()

    # Count total with Wikidata (institutions is non-empty here, so the
    # division is safe).
    total_with_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))
    coverage = (total_with_wikidata / len(institutions)) * 100
    print(f"📊 Total institutions: {len(institutions)}")
    print(f"🔗 With Wikidata: {total_with_wikidata} ({coverage:.1f}%)")
    print()

    # Save enriched data
    print(f"💾 Saving to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120
        )
    print("✅ Batch 7 enrichment complete!")
    print()

    # Museum-only coverage (records whose institution_type is 'MUSEUM')
    museums = [inst for inst in institutions if inst.get('institution_type') == 'MUSEUM']
    museums_with_wd = [m for m in museums if _has_wikidata(m)]
    print(f"🏛️ Museums: {len(museums_with_wd)}/{len(museums)} with Wikidata")
    print()
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()