glam/scripts/enrich_chilean_batch4.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

229 lines
8.2 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 4 Wikidata Enrichment
Target: National institutions and major regional museums
Goal: 10/90 → 14/90 (11.1% → 15.6% coverage)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
# Batch 4 targets with verified Wikidata Q-numbers
# Each entry drives one match-and-enrich pass in main():
#   q_number         - verified Wikidata item ID to attach
#   name_pattern     - substring that must appear in the record's name
#   location         - expected city (matched leniently; see matches_target)
#   institution_type - must equal the record's institution_type exactly
#   verification     - human-readable evidence trail for provenance
#   notes            - optional hint about dataset naming quirks
BATCH_4_TARGETS = [
    {
        "q_number": "Q2901485",
        "name_pattern": "Biblioteca Nacional",
        "location": "Santiago",  # Note: Dataset may list as Iquique (digital platform)
        "institution_type": "LIBRARY",
        "verification": "National Library of Chile, founded 1813, Santiago",
        "notes": "May appear as 'Biblioteca Nacional Digital' in dataset"
    },
    {
        "q_number": "Q17166403",
        "name_pattern": "Museo Marítimo Nacional",
        "location": "Valparaíso",
        "institution_type": "MUSEUM",
        "verification": "National Maritime Museum, Valparaíso, founded on Artillery Hill"
    },
    {
        "q_number": "Q6970429",
        "name_pattern": "Archivo Nacional",
        "location": "Santiago",
        "institution_type": "ARCHIVE",
        "verification": "National Archives of Chile, founded 1927, Santiago"
    },
    # Note: Servicio Nacional del Patrimonio Cultural (Q-number not found in Wikidata)
]
def load_institutions(file_path: Path) -> List[Dict]:
    """Read institution records from a YAML file, announcing progress on stdout."""
    print(f"📖 Loading institutions from: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as handle:
        institutions = yaml.safe_load(handle)
    print(f" Loaded {len(institutions)} institutions")
    return institutions
def count_wikidata_coverage(institutions: List[Dict]) -> tuple:
    """Return (institutions carrying a Wikidata identifier, total institutions)."""
    tagged = 0
    for record in institutions:
        for identifier in record.get('identifiers', []):
            if identifier.get('identifier_scheme') == 'Wikidata':
                tagged += 1
                break  # one Wikidata ID is enough; don't double-count a record
    return tagged, len(institutions)
def institution_has_wikidata(institution: Dict) -> bool:
    """Report whether this record already carries a Wikidata identifier."""
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def matches_target(institution: Dict, target: Dict) -> bool:
    """Decide whether an institution record satisfies a target's criteria.

    The institution type must match exactly and the target's name pattern
    must appear as a substring of the record's name. The city check is
    deliberately lenient: records with no location, an empty city, or city
    'Unknown' still match, since the dataset has incomplete location data.
    """
    # Hard requirements first: type equality, then name substring.
    if institution.get('institution_type', '') != target['institution_type']:
        return False
    if target['name_pattern'] not in institution.get('name', ''):
        return False

    # Soft requirement: only reject on a concrete, conflicting city.
    locations = institution.get('locations', [])
    if locations:
        city = locations[0].get('city', '')
        if city and city != 'Unknown' and target['location'] not in city:
            return False
    return True
def enrich_institution(institution: Dict, target: Dict) -> bool:
    """Attach the target's Wikidata identifier to the record, log the
    enrichment in its provenance history, and upgrade the data tier when
    it was unset or merely inferred. Mutates the record in place and
    always returns True.
    """
    q_number = target['q_number']

    # Append the Wikidata identifier, creating the list if needed.
    institution.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # Record this enrichment in the provenance history.
    provenance = institution.setdefault('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Chilean Batch 4 - Manual Wikidata verification',
        'enrichment_batch': 'batch_4',
        'q_number': q_number,
        'verification': target['verification']
    })

    # Wikidata counts as TIER_3 (crowd-sourced); only upgrade records whose
    # tier is missing or was merely inferred — never downgrade a better tier.
    if provenance.get('data_tier', 'TIER_4_INFERRED') == 'TIER_4_INFERRED':
        provenance['data_tier'] = 'TIER_3_CROWD_SOURCED'
    return True
def main():
    """Run the Batch 4 enrichment workflow end to end.

    Loads the Batch 3 dataset, backs it up, matches each BATCH_4_TARGETS
    entry against un-enriched records, attaches verified Wikidata IDs,
    prints a coverage summary, and writes the enriched dataset.
    """
    banner = "=" * 80
    print(banner)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 4 WIKIDATA ENRICHMENT")
    print(banner)

    # File locations: read batch 3 output, write batch 4 output, keep a backup.
    input_file = Path('data/instances/chile/chilean_institutions_batch3_enriched.yaml')
    output_file = Path('data/instances/chile/chilean_institutions_batch4_enriched.yaml')
    backup_file = Path(f'{input_file}.batch4_backup')

    institutions = load_institutions(input_file)

    # Baseline coverage before enrichment.
    with_wikidata, total = count_wikidata_coverage(institutions)
    coverage_pct = (with_wikidata / total * 100) if total > 0 else 0
    print(f"📊 Current Wikidata coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")

    # Snapshot the loaded data before any mutation.
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    enriched_count = 0
    skipped_count = 0
    print("🔍 Starting Batch 4 enrichment...")
    print()

    # One pass per target: first eligible matching record wins.
    for target in BATCH_4_TARGETS:
        for institution in institutions:
            if institution_has_wikidata(institution):
                continue  # never overwrite an existing Wikidata identifier
            if not matches_target(institution, target):
                continue
            print(f"✅ MATCH: {institution.get('name', 'Unknown')}")
            locations = institution.get('locations', [])
            if locations:
                print(f" Location: {locations[0].get('city', 'Unknown')}")
            print(f" Q-number: {target['q_number']}")
            print(f" Verification: {target['verification']}")
            enrich_institution(institution, target)
            enriched_count += 1
            print()
            break
        else:
            # Loop finished without a break: no record matched this target.
            print(f"⏭️ SKIP: {target['name_pattern']} ({target['location']}) - No match found")
            print(f" Q-number: {target['q_number']}")
            print(f" Notes: {target.get('notes', 'Institution not in dataset or different naming')}")
            skipped_count += 1
            print()

    # Coverage after enrichment.
    new_with_wikidata, _ = count_wikidata_coverage(institutions)
    new_coverage_pct = (new_with_wikidata / total * 100) if total > 0 else 0

    print(banner)
    print("📊 Batch 4 Enrichment Summary")
    print(banner)
    print(f"✅ Enriched: {enriched_count} institutions")
    print(f"⏭️ Skipped: {skipped_count} institutions (no match)")
    print(f"📈 New Wikidata coverage: {new_with_wikidata}/{total} ({new_coverage_pct:.1f}%)")
    print(f" Improvement: +{enriched_count} institutions")

    print(f"💾 Saving enriched dataset to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print()
    print("✅ Batch 4 enrichment complete!")
    print()
    print("📁 Files:")
    print(f" Input: {input_file}")
    print(f" Output: {output_file}")
    print(f" Backup: {backup_file}")
    print()
    print("🎯 Next Steps:")
    if new_with_wikidata < 20:
        print(f" - Need {20 - new_with_wikidata} more institutions to reach 22.2% coverage goal (20 institutions)")
        print(" - Consider Batch 5: More regional museums or verify Servicio Nacional del Patrimonio Cultural")
    else:
        print(" - 🎉 GOAL REACHED: 20+ institutions with Wikidata!")
        print(" - Consider moving to Brazil Batch 7 or updating documentation")


if __name__ == '__main__':
    main()