glam/scripts/enrich_chilean_batch8.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

208 lines
7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 8 Wikidata Enrichment (Libraries)
Uses bulk SPARQL matches from query_wikidata_chilean_libraries.py
2 libraries with verified Q-numbers from Wikidata Query Service
Target: 54/90 institutions (60% coverage)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 8: 2 libraries from SPARQL bulk query
# Each entry maps a local institution record (matched by 'name' + 'city')
# to a verified Wikidata Q-number. 'confidence' and 'notes' are copied into
# the record's provenance block by enrich_institution().
BATCH_8_ENRICHMENTS = [
    {
        "name": "Biblioteca Nacional Digital",
        "city": "Iquique",  # city in our data is suspect — see 'notes'
        "q_number": "Q18924152",
        "wikidata_name": "Biblioteca Nacional Digital de Chile",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full official title in Wikidata). Note: City may be incorrect in our data - this is a national digital library, not specific to Iquique.",
    },
    {
        "name": "William Mulloy Library",
        "city": "Isla de Pascua", # Updated from Unknown
        "q_number": "Q8015912",
        "wikidata_name": "Biblioteca William Mulloy",
        "confidence": "partial",
        "founded": "2002",  # founding year from Wikidata; not written back by this script
        "notes": "SPARQL match - partial name (Spanish vs English). Easter Island archaeological library.",
    },
]
def load_yaml(file_path: Path) -> list:
    """Read and deserialize a UTF-8 encoded YAML file."""
    with file_path.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data: list, file_path: Path) -> None:
    """Serialize *data* to a UTF-8 YAML file (block style, original key order)."""
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with open(file_path, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the first institution record matching *name* (and *city*).

    The city filter is skipped when *city* is empty or "Unknown". A city of
    "Isla de Pascua" is accepted regardless of the stored value (Easter
    Island records vary in our data).

    Raises:
        ValueError: when no record matches.
    """
    for candidate in institutions:
        if candidate['name'] != name:
            continue
        if not city or city == "Unknown":
            return candidate
        stored_city = candidate.get('locations', [{}])[0].get('city', '')
        if stored_city == city or city == "Isla de Pascua":
            return candidate
    raise ValueError(f"Institution not found: {name} ({city})")
def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Add a Wikidata identifier and provenance metadata to *inst* in place.

    Args:
        inst: Mutable institution record (YAML-derived dict).
        enrichment: Batch-8 entry carrying 'q_number', 'wikidata_name',
            'confidence', 'notes', and optionally a corrected 'city'.
    """
    # Idempotence guard: never add a second Wikidata identifier.
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ {inst['name']} already has Wikidata identifier")
        return
    # Add Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['q_number'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['q_number']}"
    }
    inst.setdefault('identifiers', []).append(wikidata_id)
    # Update city when it is missing, empty, or the literal placeholder
    # "Unknown". BUG FIX: the original check `not locations[0].get('city')`
    # never replaced the "Unknown" placeholder (a truthy string), despite
    # the stated intent "Update city if it was Unknown".
    if enrichment.get('city') and enrichment['city'] != "Unknown":
        locations = inst.get('locations', [])
        if locations and locations[0].get('city') in (None, '', 'Unknown'):
            locations[0]['city'] = enrichment['city']
            print(f" 📍 Updated city to: {enrichment['city']}")
    # Record how/when this enrichment happened (timezone-aware UTC stamp).
    provenance = inst.setdefault('provenance', {})
    provenance['enrichment_method'] = 'Wikidata SPARQL bulk query (Batch 8 - Libraries)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']
    # Normalize notes to a list before appending this batch's note.
    if 'notes' not in provenance:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]
    provenance['notes'].append(
        f"Batch 8: {enrichment['notes']}"
    )
    print(f" ✅ Added Wikidata: {enrichment['q_number']} ({enrichment['wikidata_name']})")
def main():
    """Run the batch-8 enrichment: load, back up, enrich, save, report."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 8 ENRICHMENT (LIBRARIES)")
    print("=" * 80)
    print()
    # Load data
    input_file = Path('data/instances/chile/chilean_institutions_batch7_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f" Loaded {len(institutions)} institutions")
    print()
    # Create a backup before mutating anything.
    backup_file = input_file.with_suffix('.yaml.batch8_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()
    # Apply enrichments
    print(f"🔧 Applying {len(BATCH_8_ENRICHMENTS)} enrichments...")
    print()
    enriched_count = 0
    for i, enrichment in enumerate(BATCH_8_ENRICHMENTS, 1):
        print(f"{i}. {enrichment['name']} ({enrichment['city']})")
        try:
            inst = find_institution(institutions, enrichment['name'], enrichment['city'])
            enrich_institution(inst, enrichment)
            enriched_count += 1
        except ValueError as e:
            # Consistency fix: the original printed the bare message here,
            # unlike the marked "❌ Error:" branch below.
            print(f" ❌ {e}")
        except Exception as e:
            print(f" ❌ Error: {e}")
        print()
    # Save enriched data
    output_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()
    # Statistics
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()
    total = len(institutions)
    with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    # Robustness fix: avoid ZeroDivisionError when the input file is empty.
    coverage_pct = with_wikidata / total * 100 if total else 0.0
    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({coverage_pct:.1f}%)")
    print(f"Batch 8 enrichments: {enriched_count}")
    print()
    # Per-type coverage breakdown (local import kept from the original).
    from collections import defaultdict
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type]['total'] += 1
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        ):
            by_type[inst_type]['with_wd'] += 1
    print("Coverage by type:")
    for inst_type in sorted(by_type):
        stats = by_type[inst_type]
        pct = stats['with_wd'] / stats['total'] * 100 if stats['total'] > 0 else 0
        # NOTE(review): the original expression produced "" on every branch
        # (the marker characters appear lost in encoding), making it dead
        # code. Restored with distinct markers — confirm intended symbols.
        status = "✅" if pct == 100 else "🟡" if pct >= 50 else "🔴"
        print(f" {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()
    print("🎉 Batch 8 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")


if __name__ == '__main__':
    main()