glam/scripts/enrich_chilean_batch14.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

158 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Wikidata Enrichment - Batch 14 (Manual WebFetch Results)
Apply manually verified Wikidata identifier found via WebFetch tool:
- Museo Rudolph Philippi (Valdivia) → Q6940547
Target: 62/90 institutions (68.9% coverage)
Previous: 61/90 (67.8%)
Manual verification performed Nov 9, 2025 via WebFetch after API rate limits.
See: scripts/batch14_manual_results.json
"""
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
def load_institutions(filepath: Path) -> list[dict]:
    """Read a YAML file and return its parsed contents.

    Args:
        filepath: Path to the YAML document holding institution records.

    Returns:
        The deserialized YAML content (expected: a list of dicts).
    """
    raw_text = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(raw_text)
def save_institutions(institutions: list[dict], filepath: Path):
    """Serialize the institution records to a YAML file.

    Keys keep insertion order (sort_keys=False) and non-ASCII text is
    written verbatim (allow_unicode=True), wrapped at 120 columns.
    """
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(institutions, handle,
                  allow_unicode=True, sort_keys=False, width=120)
def enrich_batch14(institutions: list[dict]) -> tuple[list[dict], int]:
"""
Apply Batch 14 Wikidata enrichments (manual WebFetch verification).
Returns:
Tuple of (enriched_institutions, count_enriched)
"""
enrichment_date = datetime.now(timezone.utc).isoformat()
enriched_count = 0
# Batch 14: Manual WebFetch verification (1 institution)
# Q6940547 - Museo de la Exploración Rudolph Amandus Philippi
valdivia_philippi_matches = [
("Museo Rudolph Philippi", "Valdivia", "Q6940547",
"Museo de la Exploración Rudolph Amandus Philippi, museum in Valdivia, founded 1914, reopened 2006")
]
for inst in institutions:
name = inst.get('name', '')
city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''
# Check if institution already has Wikidata identifier
existing_wikidata = any(
id_item.get('identifier_scheme') == 'Wikidata'
for id_item in inst.get('identifiers', [])
)
if existing_wikidata:
continue # Skip if already enriched
# Check against Batch 14 matches
for match_name, match_city, q_number, verification in valdivia_philippi_matches:
if match_name.lower() in name.lower() and match_city.lower() == city.lower():
# Add Wikidata identifier
if 'identifiers' not in inst:
inst['identifiers'] = []
inst['identifiers'].append({
'identifier_scheme': 'Wikidata',
'identifier_value': q_number,
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
})
# Update provenance
if 'provenance' not in inst:
inst['provenance'] = {}
inst['provenance']['data_tier'] = 'TIER_3_CROWD_SOURCED'
inst['provenance']['last_updated'] = enrichment_date
inst['provenance']['enrichment_batch'] = 14
inst['provenance']['wikidata_match_confidence'] = 'HIGH'
inst['provenance']['wikidata_match_reason'] = 'Manual verification via WebFetch after API rate limits'
inst['provenance']['wikidata_name'] = verification
enriched_count += 1
print(f"✓ Enriched: {name} ({city}) → {q_number}")
print(f" Wikidata: {verification}")
break
return institutions, enriched_count
def _count_wikidata(institutions: list[dict]) -> int:
    """Count institutions carrying at least one Wikidata identifier."""
    return sum(
        1 for inst in institutions
        if any(id_item.get('identifier_scheme') == 'Wikidata'
               for id_item in inst.get('identifiers', []))
    )


def main():
    """Main enrichment workflow: load, enrich, report coverage, save."""
    # Paths
    input_file = Path("data/instances/chile/chilean_institutions_batch13_enriched.yaml")
    output_file = Path("data/instances/chile/chilean_institutions_batch14_enriched.yaml")
    print("=" * 80)
    print("Chilean GLAM Wikidata Enrichment - Batch 14")
    print("Manual WebFetch Verification Results")
    print("=" * 80)
    print()
    # Load data
    print(f"Loading: {input_file}")
    institutions = load_institutions(input_file)
    if not institutions:
        # Guard: an empty or missing dataset would otherwise crash the
        # percentage calculations below with ZeroDivisionError.
        print("No institutions loaded; nothing to enrich.")
        sys.exit(1)
    total = len(institutions)
    print(f" Total institutions: {total}")
    # Count current Wikidata coverage
    current_with_wikidata = _count_wikidata(institutions)
    print(f" Current Wikidata coverage: {current_with_wikidata}/{total} ({current_with_wikidata/total*100:.1f}%)")
    print()
    # Apply Batch 14 enrichments
    print("Applying Batch 14 enrichments...")
    institutions, enriched_count = enrich_batch14(institutions)
    print()
    # Calculate new coverage
    new_with_wikidata = _count_wikidata(institutions)
    print("=" * 80)
    print("Enrichment Summary")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"New Wikidata coverage: {new_with_wikidata}/{total} ({new_with_wikidata/total*100:.1f}%)")
    delta = new_with_wikidata - current_with_wikidata
    print(f"Coverage change: +{delta} ({delta/total*100:.1f}%)")
    print()
    # Save enriched data
    print(f"Saving: {output_file}")
    save_institutions(institutions, output_file)
    print("✓ Done!")
    print()
    # Progress towards 70% target
    target = 63  # 70% of 90
    remaining = target - new_with_wikidata
    print(f"Progress to 70% target: {new_with_wikidata}/{target}")
    if remaining > 0:
        print(f" Still need: {remaining} more institution(s)")
    else:
        print(" 🎉 Target reached!")


if __name__ == '__main__':
    main()