glam/scripts/enrich_chilean_batch11.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

204 lines
7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Chilean heritage institutions with Batch 11 Wikidata identifiers.
Applies 5 validated Wikidata matches from batch11_final_validation.json:
- Museo Histórico-Arqueológico (Quillota) → Q12184920
- Museo Mapuche de Purén → Q86282614
- Museo Pleistocénico (Osorno) → Q112044601
- Red de Museos Aysén → Q53877849
- Museo Territorial Yagan Usi → Q6775581
Updates coverage from 55/90 (61.1%) → 60/90 (66.7%)
"""
import json
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any
import shutil
# File paths
# Input is the batch-10 enriched dataset; results are written to a NEW
# batch-11 file so the input is never overwritten. A backup copy of the
# input is also taken (alongside it) before any processing starts.
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch10_enriched.yaml")
# Validated Wikidata matches produced by the batch-11 validation step.
VALIDATION_FILE = Path("scripts/batch11_final_validation.json")
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")
BACKUP_FILE = INPUT_FILE.with_suffix('.yaml.batch10.backup')
def load_yaml(filepath: Path) -> List[Dict[str, Any]]:
    """Read a UTF-8 YAML file and return its parsed top-level structure."""
    text = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(text)
def load_json(filepath: Path) -> Dict[str, Any]:
    """Read a UTF-8 JSON file and return the decoded object."""
    raw = filepath.read_text(encoding='utf-8')
    return json.loads(raw)
def save_yaml(data: List[Dict[str, Any]], filepath: Path):
    """Serialize *data* to a UTF-8 YAML file with readable formatting.

    Keeps key order (sort_keys=False), emits non-ASCII characters
    directly (allow_unicode=True), and uses block style with a 2-space
    indent.
    """
    dump_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'indent': 2,
    }
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(data, out, **dump_options)
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, trims leading/trailing whitespace, and collapses any
    internal run of whitespace to a single space.

    Note: the previous implementation ended with ``.replace(" ", " ")``,
    which replaced a single space with a single space — a no-op. It was
    presumably intended to collapse doubled spaces; ``str.split()`` +
    ``join`` handles runs of any length (and tabs/newlines) correctly.
    """
    return " ".join(name.lower().split())
def find_institution(institutions: List[Dict], name: str, city: str) -> tuple:
    """Locate an institution by (name, city).

    Compares normalized names against each record's name and the city of
    its FIRST location entry. Tries an exact name match first, then a
    substring match (e.g. "Museo Territorial Yagan Usi" appearing inside
    a longer recorded name), both requiring an exact city match.

    Returns:
        (index, institution) on a hit, or (None, None) if nothing matches.
    """
    target_name = normalize_name(name)
    target_city = normalize_name(city)
    for position, record in enumerate(institutions):
        # Records without any location cannot be city-matched; skip them.
        location_list = record.get('locations', [])
        if not location_list:
            continue
        record_city = normalize_name(location_list[0].get('city', ''))
        if record_city != target_city:
            continue
        record_name = normalize_name(record.get('name', ''))
        # Exact match, then partial (substring) match on the name.
        if record_name == target_name or target_name in record_name:
            return position, record
    return None, None
def has_wikidata_identifier(institution: Dict) -> bool:
    """Return True if *institution* already carries a Wikidata identifier."""
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def add_wikidata_identifier(institution: Dict, q_number: str, wikidata_name: str, confidence: str, reason: str):
    """Attach a Wikidata identifier and batch-11 provenance to *institution*.

    Mutates the dict in place (appending to 'identifiers' and updating
    'provenance', creating either if absent) and returns it as well.

    Args:
        institution: Institution record to enrich.
        q_number: Wikidata Q-number (e.g. "Q12184920").
        wikidata_name: Label of the matched Wikidata entity.
        confidence: Match confidence from the validation step.
        reason: Human-readable justification for the match.
    """
    # setdefault covers records that have no identifiers list yet.
    identifier_list = institution.setdefault('identifiers', [])
    identifier_list.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    })
    # Record when/why this match was applied, for audit purposes.
    provenance = institution.setdefault('provenance', {})
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 11
    provenance['wikidata_match_confidence'] = confidence
    provenance['wikidata_match_reason'] = reason
    provenance['wikidata_name'] = wikidata_name
    return institution
def enrich_batch11():
    """Main enrichment function.

    Workflow: back up the input dataset, load institutions and the
    validated batch-11 matches, apply each match that isn't already
    present, report per-match status and overall coverage, then write
    the enriched dataset to OUTPUT_FILE (the input is left untouched).
    """
    print("=" * 80)
    print("Chilean GLAM Wikidata Enrichment - Batch 11")
    print("=" * 80)
    print()
    # Create backup before touching anything, so a failed run is recoverable.
    print(f"Creating backup: {BACKUP_FILE}")
    shutil.copy2(INPUT_FILE, BACKUP_FILE)
    print()
    # Load data
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f" Loaded {len(institutions)} institutions")
    print()
    print(f"Loading validation results: {VALIDATION_FILE}")
    validation = load_json(VALIDATION_FILE)
    validated_matches = validation['validated_matches']
    print(f" Loaded {len(validated_matches)} validated matches")
    print()
    # Statistics counters for the summary section below.
    enriched_count = 0
    already_enriched = 0
    not_found = 0
    # Process each validated match
    print("Processing validated matches:")
    print("-" * 80)
    for match in validated_matches:
        museum_name = match['museum_name']
        city = match['city']
        q_number = match['q_number']
        wikidata_name = match['wikidata_name']
        confidence = match['confidence']
        reason = match['reason']
        print(f"\n{museum_name} ({city})")
        print(f"{q_number}: {wikidata_name}")
        # Find institution in dataset (exact or partial name + exact city).
        idx, institution = find_institution(institutions, museum_name, city)
        if institution is None:
            print(f" ❌ NOT FOUND in dataset")
            not_found += 1
            continue
        # Check if already has Wikidata ID — never overwrite existing IDs.
        if has_wikidata_identifier(institution):
            print(f" ⚠️ Already has Wikidata identifier (skipping)")
            already_enriched += 1
            continue
        # Add Wikidata identifier (mutates in place; reassignment is redundant
        # but harmless).
        add_wikidata_identifier(institution, q_number, wikidata_name, confidence, reason)
        institutions[idx] = institution
        enriched_count += 1
        print(f" ✅ Enriched with {q_number}")
    print()
    print("-" * 80)
    print(f"\nEnrichment Summary:")
    print(f" Enriched: {enriched_count}")
    print(f" Already enriched: {already_enriched}")
    print(f" Not found: {not_found}")
    print()
    # Calculate coverage across the FULL dataset, not just this batch.
    total_institutions = len(institutions)
    institutions_with_wikidata = sum(1 for inst in institutions if has_wikidata_identifier(inst))
    coverage_pct = (institutions_with_wikidata / total_institutions) * 100
    print(f"Coverage:")
    print(f" Institutions with Wikidata: {institutions_with_wikidata}/{total_institutions} ({coverage_pct:.1f}%)")
    print()
    # Save enriched dataset to a NEW file; input + backup remain intact.
    print(f"Saving enriched dataset: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()
    print("=" * 80)
    print("Batch 11 Enrichment Complete!")
    print("=" * 80)
    print()
    print(f"✅ Updated: {OUTPUT_FILE}")
    print(f"📦 Backup: {BACKUP_FILE}")
    print()
    print(f"Progress: {institutions_with_wikidata}/{total_institutions} institutions ({coverage_pct:.1f}%)")
    print(f"Target: 63/90 (70.0%) - Need {63 - institutions_with_wikidata} more institutions")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    enrich_batch11()