- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
204 lines
7 KiB
Python
Executable file
204 lines
7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Chilean heritage institutions with Batch 11 Wikidata identifiers.
|
|
|
|
Applies 5 validated Wikidata matches from batch11_final_validation.json:
|
|
- Museo Histórico-Arqueológico (Quillota) → Q12184920
|
|
- Museo Mapuche de Purén → Q86282614
|
|
- Museo Pleistocénico (Osorno) → Q112044601
|
|
- Red de Museos Aysén → Q53877849
|
|
- Museo Territorial Yagan Usi → Q6775581
|
|
|
|
Updates coverage from 55/90 (61.1%) → 60/90 (66.7%)
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Any
|
|
import shutil
|
|
|
|
# File paths used by the batch 11 enrichment pipeline.
INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch10_enriched.yaml")  # output of batch 10, read here
VALIDATION_FILE = Path("scripts/batch11_final_validation.json")  # validated Wikidata matches to apply
OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch11_enriched.yaml")  # enriched dataset written here
BACKUP_FILE = INPUT_FILE.with_suffix('.yaml.batch10.backup')  # safety copy of the input before enrichment
|
|
|
|
def load_yaml(filepath: Path) -> List[Dict[str, Any]]:
    """Read *filepath* as UTF-8 text and parse its YAML content."""
    with filepath.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
|
|
|
def load_json(filepath: Path) -> Dict[str, Any]:
    """Read *filepath* as UTF-8 text and parse its JSON content."""
    with open(filepath, encoding='utf-8') as handle:
        return json.load(handle)
|
|
|
|
def save_yaml(data: List[Dict[str, Any]], filepath: Path):
    """Write *data* to *filepath* as readable, key-order-preserving YAML."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            allow_unicode=True,       # keep accented names (e.g. Aysén) literal
            default_flow_style=False, # block style, one key per line
            sort_keys=False,          # preserve the dataset's field order
            indent=2,
        )
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for matching.

    Lowercases, trims, and collapses every internal whitespace run to a
    single space so that names differing only in spacing compare equal.
    """
    # BUG FIX: the original `replace(" ", " ")` swapped a space for an
    # identical space (a no-op), so double spaces were never collapsed.
    # split()/join collapses any run of whitespace in one pass.
    return " ".join(name.lower().split())
|
|
|
|
def find_institution(institutions: List[Dict], name: str, city: str) -> tuple:
    """Locate an institution by (name, city).

    Returns (index, institution) for the first entry whose first location's
    city matches exactly and whose normalized name equals or contains the
    query name; returns (None, None) when nothing matches.
    """
    wanted_name = normalize_name(name)
    wanted_city = normalize_name(city)

    for position, record in enumerate(institutions):
        record_locations = record.get('locations', [])
        if not record_locations:
            # No location data means the city cannot be compared; skip.
            continue

        if normalize_name(record_locations[0].get('city', '')) != wanted_city:
            continue

        record_name = normalize_name(record.get('name', ''))
        # Exact equality is a special case of substring containment, so a
        # single containment test covers both the exact and partial match
        # (the latter needed for "Museo Territorial Yagan Usi").
        if wanted_name in record_name:
            return position, record

    return None, None
|
|
|
|
def has_wikidata_identifier(institution: Dict) -> bool:
    """Return True when any identifier entry uses the 'Wikidata' scheme."""
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') == 'Wikidata':
            return True
    return False
|
|
|
|
def add_wikidata_identifier(institution: Dict, q_number: str, wikidata_name: str, confidence: str, reason: str):
    """Append a Wikidata identifier to *institution* and stamp provenance.

    Mutates *institution* in place (creating the 'identifiers' and
    'provenance' containers when absent) and returns it for convenience.
    """
    # Record the identifier itself.
    identifiers = institution.setdefault('identifiers', [])
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    })

    # Record how and when the match was made.
    provenance = institution.setdefault('provenance', {})
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 11
    provenance['wikidata_match_confidence'] = confidence
    provenance['wikidata_match_reason'] = reason
    provenance['wikidata_name'] = wikidata_name

    return institution
|
|
|
|
def enrich_batch11():
    """Main enrichment function.

    Backs up the batch 10 dataset, applies each validated Wikidata match
    from VALIDATION_FILE (skipping institutions that are missing from the
    dataset or already carry a Wikidata identifier), prints a summary with
    coverage statistics, and writes the batch 11 dataset to OUTPUT_FILE.
    """
    print("=" * 80)
    print("Chilean GLAM Wikidata Enrichment - Batch 11")
    print("=" * 80)
    print()

    # Create backup before mutating anything, so a bad run is recoverable.
    print(f"Creating backup: {BACKUP_FILE}")
    shutil.copy2(INPUT_FILE, BACKUP_FILE)
    print()

    # Load data
    print(f"Loading dataset: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f" Loaded {len(institutions)} institutions")
    print()

    print(f"Loading validation results: {VALIDATION_FILE}")
    validation = load_json(VALIDATION_FILE)
    validated_matches = validation['validated_matches']
    print(f" Loaded {len(validated_matches)} validated matches")
    print()

    # Statistics counters for the final summary.
    enriched_count = 0
    already_enriched = 0
    not_found = 0

    # Process each validated match
    print("Processing validated matches:")
    print("-" * 80)

    for match in validated_matches:
        museum_name = match['museum_name']
        city = match['city']
        q_number = match['q_number']
        wikidata_name = match['wikidata_name']
        confidence = match['confidence']
        reason = match['reason']

        print(f"\n{museum_name} ({city})")
        print(f" → {q_number}: {wikidata_name}")

        # Find institution in dataset by (name, city).
        idx, institution = find_institution(institutions, museum_name, city)

        if institution is None:
            print(f" ❌ NOT FOUND in dataset")
            not_found += 1
            continue

        # Check if already has Wikidata ID — never overwrite an existing one.
        if has_wikidata_identifier(institution):
            print(f" ⚠️ Already has Wikidata identifier (skipping)")
            already_enriched += 1
            continue

        # Add Wikidata identifier (mutates the institution in place; the
        # re-assignment into the list is redundant but harmless).
        add_wikidata_identifier(institution, q_number, wikidata_name, confidence, reason)
        institutions[idx] = institution
        enriched_count += 1
        print(f" ✅ Enriched with {q_number}")

    print()
    print("-" * 80)
    print(f"\nEnrichment Summary:")
    print(f" Enriched: {enriched_count}")
    print(f" Already enriched: {already_enriched}")
    print(f" Not found: {not_found}")
    print()

    # Calculate coverage across the whole dataset, not just this batch.
    total_institutions = len(institutions)
    institutions_with_wikidata = sum(1 for inst in institutions if has_wikidata_identifier(inst))
    coverage_pct = (institutions_with_wikidata / total_institutions) * 100

    print(f"Coverage:")
    print(f" Institutions with Wikidata: {institutions_with_wikidata}/{total_institutions} ({coverage_pct:.1f}%)")
    print()

    # Save enriched dataset (a new file; the input is left untouched).
    print(f"Saving enriched dataset: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()

    print("=" * 80)
    print("Batch 11 Enrichment Complete!")
    print("=" * 80)
    print()
    print(f"✅ Updated: {OUTPUT_FILE}")
    print(f"📦 Backup: {BACKUP_FILE}")
    print()
    print(f"Progress: {institutions_with_wikidata}/{total_institutions} institutions ({coverage_pct:.1f}%)")
    print(f"Target: 63/90 (70.0%) - Need {63 - institutions_with_wikidata} more institutions")
|
# Script entry point: run the batch 11 enrichment when executed directly.
if __name__ == "__main__":
    enrich_batch11()
|