- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
459 lines
15 KiB
Python
459 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Chilean GLAM Institutions - Batch 7 Wikidata Enrichment
|
|
Uses bulk SPARQL matches from query_wikidata_chilean_museums.py
|
|
32 museums with verified Q-numbers from Wikidata Query Service
|
|
|
|
Target: 52/90 institutions (57.8% coverage)
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Batch 7: 32 museums from SPARQL bulk query (all regions)
|
|
BATCH_7_ENRICHMENTS = [
    # Each entry describes one Wikidata match:
    #   name          - institution name as it appears in the local dataset
    #   city          - city used in log messages for the entry
    #   q_number      - Wikidata Q-identifier to attach
    #   wikidata_name - label of the matched Wikidata item
    #   confidence    - "exact" when name and wikidata_name agree
    #                   (ignoring case), otherwise "partial"
    #   notes         - free-text justification stored with the provenance
    # The number in each region header is the count of entries below it.

    # ARICA Y PARINACOTA (1)
    {
        "name": "Museo Universidad de Tarapacá San Miguel de Azapa (MASMA)",
        "city": "Arica",
        "q_number": "Q9046776",
        "wikidata_name": "Museo Arqueológico y Antropológico de San Miguel de Azapa",
        "confidence": "partial",  # Name variation but city + type match
        "notes": "SPARQL match - partial name (includes full institutional title)",
    },

    # ANTOFAGASTA (2)
    {
        "name": "Museo de Historia Natural y Cultural del Desierto de Atacama (MUHNCAL)",
        "city": "Calama",
        "q_number": "Q86276638",
        "wikidata_name": "Museo de Historia Natural y Cultural del Desierto de Atacama",
        "confidence": "partial",
        # Local name carries an acronym the Wikidata label lacks, so this is
        # a partial match (same situation as the MAPSE entry below).
        "notes": "SPARQL match - partial name (excludes acronym)",
    },
    {
        "name": "Museo Indígena Atacameño",
        "city": "Calama",
        "q_number": "Q86276595",
        "wikidata_name": "Museo Indígena Atacameño de Arqueología y Etnografía",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },

    # ATACAMA (1)
    {
        "name": "Museo Mineralógico Universidad de Atacama",
        "city": "Copiapó",
        "q_number": "Q28501803",
        "wikidata_name": "Museo Mineralógico de la Universidad de Atacama",
        "confidence": "partial",
        "notes": "SPARQL match - partial name match",
    },

    # VALPARAÍSO (5)
    {
        "name": "Museo de Historia Natural",
        "city": "Valparaíso",
        "q_number": "Q19950374",
        "wikidata_name": "Museo de Historia Natural de Valparaíso",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific), founded 1878",
    },
    {
        "name": "Casa Museo La Sebastiana",
        "city": "Valparaíso",
        "q_number": "Q86278008",
        "wikidata_name": "Casa Museo La Sebastiana",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (Pablo Neruda house)",
    },
    {
        "name": "Casa Museo Isla Negra",
        "city": "El Quisco",
        "q_number": "Q86277516",
        "wikidata_name": "Casa Museo Isla Negra",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (Pablo Neruda house)",
    },
    {
        "name": "Museo Antropológico Padre Sebastián Englert (MAPSE)",
        "city": "Isla de Pascua",
        "q_number": "Q5437650",
        "wikidata_name": "Museo Antropológico Padre Sebastián Englert",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (excludes acronym), founded 1973",
    },
    {
        "name": "Museo Arqueológico",
        "city": "Los Andes",
        "q_number": "Q86277234",
        "wikidata_name": "Museo Arqueológico de Los Andes",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific), founded 1905",
    },

    # REGIÓN METROPOLITANA (3)
    # NOTE(review): San Felipe and La Ligua look like Valparaíso Region
    # communes rather than Región Metropolitana - confirm this grouping.
    {
        "name": "Museo Histórico",
        "city": "San Felipe",
        "q_number": "Q86277658",
        "wikidata_name": "Museo Histórico de San Felipe",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },
    {
        "name": "Museo de La Ligua",
        "city": "La Ligua",
        "q_number": "Q6034082",
        "wikidata_name": "Museo de La Ligua",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1985",
    },
    {
        "name": "Museo de Talagante",
        "city": "Talagante",
        "q_number": "Q86280216",
        "wikidata_name": "Museo de Talagante",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },

    # O'HIGGINS (2)
    {
        "name": "Museo Histórico de Pichilemu",
        "city": "Pichilemu",
        "q_number": "Q112044338",
        "wikidata_name": "Museo Histórico de Pichilemu",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Lircunlauta",
        "city": "San Fernando",
        "q_number": "Q86280637",
        "wikidata_name": "Museo Lircunlauta",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },

    # MAULE (2)
    {
        "name": "Museo de Arte y Artesanía",
        "city": "Linares",
        "q_number": "Q6033923",
        "wikidata_name": "Museo Arte y Artesanía de Linares",
        "confidence": "partial",
        "notes": "SPARQL match - partial name match, founded 1962",
    },
    {
        "name": "Museo Histórico de Yerbas Buenas",
        "city": "Yerbas Buenas",
        "q_number": "Q20022173",
        "wikidata_name": "Museo Histórico de Yerbas Buenas",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },

    # ÑUBLE (3)
    {
        "name": "Museo Marta Colvin",
        "city": "Chillán",
        "q_number": "Q112044588",
        "wikidata_name": "Museo Marta Colvin",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Municipal de Ciencias Naturales",
        "city": "Chillán",
        "q_number": "Q112044585",
        "wikidata_name": "Museo Municipal de Ciencias Naturales y Arqueológico Profesor Pedro Ramírez Fuentes",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },
    {
        "name": "Itata Museo Antropológico",
        "city": "Quirihue",
        "q_number": "Q112044584",
        "wikidata_name": "Itata Museo Antropológico",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },

    # BIOBÍO (1)
    {
        "name": "Museo Mapuche de Cañete",
        "city": "Cañete",
        "q_number": "Q16609804",
        "wikidata_name": "Museo Mapuche de Cañete",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1977",
    },

    # LOS RÍOS (4)
    {
        "name": "Museo Histórico y Antropológico",
        "city": "Valdivia",
        "q_number": "Q6940480",
        "wikidata_name": "Museo Histórico y Antropológico de Valdivia Mauricio Van de Maele",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata), founded 1994",
    },
    {
        "name": "Museo de la Catedral",
        "city": "Valdivia",
        "q_number": "Q86283115",
        "wikidata_name": "Museo de la Catedral de Valdivia",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },
    {
        "name": "Museo de sitio Castillo de Niebla",
        "city": "Valdivia",
        "q_number": "Q20022172",
        "wikidata_name": "Museo de Sitio Castillo de Niebla",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (case variation)",
    },
    {
        "name": "Museo Tringlo",
        "city": "Lago Ranco",
        "q_number": "Q86282868",
        "wikidata_name": "Museo Tringlo",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },

    # LOS LAGOS (3)
    {
        "name": "Museo Colonial Alemán de Frutillar",
        "city": "Frutillar",
        "q_number": "Q20010979",
        "wikidata_name": "Museo Colonial Alemán de Frutillar",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Antonio Felmer",
        "city": "Puerto Varas",
        "q_number": "Q20022171",
        "wikidata_name": "Museo Antonio Felmer",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo y Archivo Histórico Municipal",
        "city": "Osorno",
        "q_number": "Q16609772",
        "wikidata_name": "Museo y Archivo Histórico Municipal de Osorno",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },

    # AYSÉN (3)
    # NOTE(review): Chaitén appears to belong to Los Lagos rather than
    # Aysén - confirm this grouping.
    {
        "name": "Museo de Sitio de Chaitén",
        "city": "Chaitén",
        "q_number": "Q112044386",
        "wikidata_name": "Museo de Sitio de Chaitén",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Municipal de Cochrane",
        "city": "Cochrane",
        "q_number": "Q86284188",
        "wikidata_name": "Museo Municipal de Cochrane",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Pioneros del Baker",
        "city": "Cochrane",
        "q_number": "Q86284160",
        "wikidata_name": "Museo Rural Pioneros del Baker",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (includes 'Rural')",
    },

    # MAGALLANES (2)
    {
        "name": "Museo Salesiano",
        "city": "Punta Arenas",
        "q_number": "Q86284641",
        "wikidata_name": "Museo Salesiano Maggiorino Borgatello",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },
    {
        "name": "Museo Municipal Fernando Cordero Rusque",
        "city": "Porvenir",
        "q_number": "Q83551041",
        "wikidata_name": "Museo Municipal Fernando Cordero Rusque",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1980",
    },
]
|
|
|
|
|
|
def load_institutions(yaml_path: Path) -> list:
    """Read the institution records stored in a YAML file.

    Args:
        yaml_path: Location of the UTF-8 encoded YAML document.

    Returns:
        The parsed document when its top level is a list; an empty list
        for any other top-level value (including an empty file).
    """
    with open(yaml_path, mode='r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)

    if isinstance(parsed, list):
        return parsed
    return []
|
|
|
|
|
|
def enrich_institution(institution: dict, enrichment: dict) -> dict:
    """Attach a Wikidata identifier and provenance details to an institution.

    The record is modified in place and also returned. If it already lists
    an identifier whose scheme is ``Wikidata``, a warning is printed and the
    record is returned untouched.

    Args:
        institution: Institution record. ``name`` is read for messages;
            ``identifiers`` and ``provenance`` are created when they are
            missing or null.
        enrichment: Entry supplying ``q_number`` and ``confidence``, plus an
            optional ``notes`` string.

    Returns:
        The same ``institution`` dict that was passed in.
    """
    # A YAML key without a value loads as None, so a present-but-null
    # 'identifiers' must be handled the same way as a missing one.
    identifiers = institution.get('identifiers')
    if identifiers is None:
        identifiers = institution['identifiers'] = []

    # Check if Wikidata already exists
    wikidata_ids = [
        ident for ident in identifiers
        if ident.get('identifier_scheme') == 'Wikidata'
    ]
    if wikidata_ids:
        print(f"⚠️ {institution['name']} already has Wikidata: {wikidata_ids[0]['identifier_value']}")
        return institution

    # Add new Wikidata identifier
    q_number = enrichment['q_number']
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}",
    })

    # Update provenance (again treating a null value like a missing key)
    provenance = institution.get('provenance')
    if provenance is None:
        provenance = institution['provenance'] = {}

    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 7
    provenance['enrichment_method'] = 'SPARQL_BULK_QUERY'
    provenance['enrichment_confidence'] = enrichment['confidence']
    provenance['wikidata_verified'] = True

    note = enrichment.get('notes')
    if note:
        # Existing notes may be absent, null, a single string, or a list;
        # normalise to a list before appending this batch's note.
        notes = provenance.get('notes')
        if notes is None:
            notes = []
        elif isinstance(notes, str):
            notes = [notes]
        provenance['notes'] = notes
        notes.append(f"Batch 7: {note}")

    print(f"✅ Enriched: {institution['name']} → {q_number}")
    return institution
|
|
|
|
|
|
def _has_wikidata(institution: dict) -> bool:
    """Report whether a record already lists a Wikidata identifier."""
    # `or []` covers both a missing key and a key whose YAML value is null.
    identifiers = institution.get('identifiers') or []
    return any(
        ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
    )


def main():
    """Apply the Batch 7 Wikidata enrichments and write the result to YAML.

    Loads the Batch 6 output, matches every entry of ``BATCH_7_ENRICHMENTS``
    to an institution by exact ``name``, enriches records that do not yet
    have a Wikidata identifier, prints a summary, and saves all records to
    the Batch 7 file.

    Raises:
        SystemExit: If the input file yields no institutions; nothing is
            written in that case.
    """
    # File paths
    input_file = Path("data/instances/chile/chilean_institutions_batch6_enriched.yaml")
    output_file = Path("data/instances/chile/chilean_institutions_batch7_enriched.yaml")

    print("=" * 80)
    print("CHILEAN INSTITUTIONS - BATCH 7 WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()
    print(f"📂 Input: {input_file}")
    print(f"📝 Output: {output_file}")
    print(f"🎯 Target: Add {len(BATCH_7_ENRICHMENTS)} Wikidata Q-numbers (SPARQL bulk)")
    print()

    # Load institutions
    institutions = load_institutions(input_file)
    print(f"📖 Loaded {len(institutions)} institutions")
    print()

    # An empty dataset would make the coverage calculation divide by zero
    # and would produce an empty output file, so stop before doing any work.
    if not institutions:
        raise SystemExit(f"❌ No institutions loaded from {input_file}")

    # Track statistics
    enriched_count = 0
    already_enriched_count = 0
    not_found_count = 0
    ambiguous_count = 0

    # Process each enrichment
    for enrichment in BATCH_7_ENRICHMENTS:
        # Find matching institution. Only the name is compared; the
        # enrichment's city is used for the log message, not for matching.
        matches = [
            inst for inst in institutions
            if inst.get('name') == enrichment['name']
        ]

        if not matches:
            print(f"❌ NOT FOUND: {enrichment['name']} ({enrichment['city']})")
            not_found_count += 1
            continue

        if len(matches) > 1:
            # Ambiguous names are skipped rather than guessed at, but they
            # are counted so the summary accounts for every entry.
            print(f"⚠️ MULTIPLE MATCHES: {enrichment['name']}")
            ambiguous_count += 1
            continue

        institution = matches[0]

        # Check if already enriched
        if _has_wikidata(institution):
            already_enriched_count += 1
            continue

        # Enrich institution
        enrich_institution(institution, enrichment)
        enriched_count += 1

    print()
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {enriched_count}")
    print(f"⚠️ Already enriched: {already_enriched_count}")
    print(f"⚠️ Multiple matches: {ambiguous_count}")
    print(f"❌ Not found: {not_found_count}")
    print()

    # Count total with Wikidata
    total_with_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))

    coverage = (total_with_wikidata / len(institutions)) * 100
    print(f"📊 Total institutions: {len(institutions)}")
    print(f"🔗 With Wikidata: {total_with_wikidata} ({coverage:.1f}%)")
    print()

    # Save enriched data
    print(f"💾 Saving to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )

    print("✅ Batch 7 enrichment complete!")
    print()

    # Museum-only breakdown (records whose institution_type is 'MUSEUM')
    museums = [inst for inst in institutions if inst.get('institution_type') == 'MUSEUM']
    museums_with_wd = [m for m in museums if _has_wikidata(m)]

    print(f"🏛️ Museums: {len(museums_with_wd)}/{len(museums)} with Wikidata")
    print()
|
|
|
|
|
|
# Run the enrichment only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|