glam/scripts/backfill_chilean_enrichment_history.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

249 lines
8.8 KiB
Python
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Backfill enrichment_history for 188 Chilean institutions with Wikidata IDs.
These institutions were enriched during the Chilean GLAM research conversation
(edc75d66-ee42-4199-8e22-65b0d2347922) but lack enrichment_history metadata.
This script:
1. Identifies institutions with Wikidata IDs but no enrichment_history
2. Confirms they're from the Chilean research conversation
3. Creates enrichment_history entries documenting the enrichment process
4. Preserves original provenance metadata
5. Backs up files before modification
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
import shutil
# Constants
# Directory containing the per-batch YAML files for Chilean institutions.
CHILE_DIR = Path("/Users/kempersc/apps/glam/data/instances/chile")
# The research conversation whose enriched institutions lack enrichment_history.
CONVERSATION_ID = "edc75d66-ee42-4199-8e22-65b0d2347922"
# Fallback enrichment timestamp when a record has no extraction_date.
CONVERSATION_DATE = "2025-09-22T14:43:14Z"
# Key enrichment sources from the conversation
# NOTE(review): this list is not referenced by the visible code — per-institution
# source lists are assembled inline in create_enrichment_history(). It appears
# to serve as documentation of the platforms consulted; confirm before removing.
ENRICHMENT_SOURCES = [
    "https://www.wikidata.org",  # Wikidata (primary)
    "https://www.surdoc.cl",  # Sistema Unificado de Registros (museum catalog)
    "https://sinarchile.archivonacional.gob.cl",  # Sistema Nacional de Archivos
    "http://www.memoriachilena.gob.cl",  # Memoria Chilena digital heritage
    "https://www.registromuseoschile.cl",  # Registro de Museos de Chile
    "https://www.archivonacional.gob.cl",  # Archivo Nacional
]
def needs_backfill(institution: Dict[str, Any]) -> bool:
    """Decide whether an institution record requires an enrichment_history backfill.

    A record qualifies only when all three conditions hold: it carries a
    Wikidata identifier, its provenance lacks an enrichment_history entry,
    and it originates from the target Chilean research conversation.
    """
    # Must have a Wikidata identifier among the recorded identifiers.
    identifiers = institution.get('identifiers', [])
    if not any(entry.get('identifier_scheme') == 'Wikidata' for entry in identifiers):
        return False

    # Must not already carry an enrichment_history.
    provenance = institution.get('provenance', {})
    if 'enrichment_history' in provenance:
        return False

    # Must be traced to the Chilean GLAM research conversation.
    return provenance.get('conversation_id') == CONVERSATION_ID
def get_wikidata_id(institution: Dict[str, Any]) -> str:
    """Return the institution's Wikidata Q-number, or '' when none is recorded.

    Scans the record's identifier entries for the first one whose scheme
    is 'Wikidata' and returns its value.
    """
    return next(
        (
            entry.get('identifier_value', '')
            for entry in institution.get('identifiers', [])
            if entry.get('identifier_scheme') == 'Wikidata'
        ),
        '',
    )
def create_enrichment_history(institution: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Create the single-entry enrichment_history list for an institution.

    The entry records when the enrichment happened (the record's
    provenance.extraction_date, falling back to CONVERSATION_DATE), how it
    was done, which sources supported it (Wikidata page plus any national
    platforms referenced by the record's digital_platforms), and a
    free-text note describing the research conversation.

    Fixes: removed the unused ``platform_name`` local and hoisted the
    repeated ``platform_url.lower()`` call out of the condition chain.
    """
    wikidata_id = get_wikidata_id(institution)

    # Use extraction_date from provenance as enrichment timestamp
    extraction_date = institution.get('provenance', {}).get('extraction_date', '')
    if not extraction_date:
        # Fallback to conversation date if extraction_date missing
        extraction_date = CONVERSATION_DATE

    # Build enrichment sources list; the Wikidata entity page always leads.
    sources = [f"https://www.wikidata.org/wiki/{wikidata_id}"]

    # Add platform sources if institution references them
    for platform in institution.get('digital_platforms', []):
        platform_url = platform.get('platform_url', '')
        url_lower = platform_url.lower()
        # Match to known national heritage platforms, deduplicating as we go.
        if 'surdoc.cl' in platform_url and 'https://www.surdoc.cl' not in sources:
            sources.append('https://www.surdoc.cl')
        elif 'sinar' in url_lower and 'https://sinarchile.archivonacional.gob.cl' not in sources:
            sources.append('https://sinarchile.archivonacional.gob.cl')
        elif 'memoriachilena' in url_lower and 'http://www.memoriachilena.gob.cl' not in sources:
            sources.append('http://www.memoriachilena.gob.cl')

    # Add Archivo Nacional for archives
    if institution.get('institution_type') == 'ARCHIVE':
        if 'https://www.archivonacional.gob.cl' not in sources:
            sources.append('https://www.archivonacional.gob.cl')

    # Build enrichment notes
    enrichment_notes = f"""Wikidata identifier obtained during comprehensive Chilean GLAM research conversation
(ID: {CONVERSATION_ID}, date: {CONVERSATION_DATE.split('T')[0]}). Research involved cross-referencing
multiple national heritage platforms including SURDOC (museum catalog system), SINAR (national archives
system), Memoria Chilena (digital heritage collection), and institutional websites. The conversation
analyzed 695+ library services, 500,000+ archival records, and 72,000+ catalogued museum objects to
identify and validate Chilean heritage institutions."""

    return [{
        'enrichment_date': extraction_date,
        'enrichment_method': 'Conversation-based research with Wikidata verification and national platform cross-referencing',
        'enrichment_source': sources,
        'enrichment_notes': enrichment_notes
    }]
def backfill_batch_file(batch_file: Path) -> Dict[str, int]:
    """Backfill enrichment_history entries in one YAML batch file.

    Loads the institutions from *batch_file*, attaches an
    enrichment_history to every qualifying record, and — when anything
    changed — writes a one-time backup alongside the file and saves the
    updated YAML in place.

    Returns a stats dict with 'total', 'backfilled', and 'skipped' counts.
    """
    stats = {'total': 0, 'backfilled': 0, 'skipped': 0}
    print(f"\nProcessing: {batch_file.name}")

    with open(batch_file, 'r', encoding='utf-8') as handle:
        institutions = yaml.safe_load(handle)
    if not institutions:
        print(f" ⚠️ Empty file or failed to load")
        return stats

    stats['total'] = len(institutions)
    modified = False

    for record in institutions:
        if not needs_backfill(record):
            stats['skipped'] += 1
            continue
        # Attach the generated history under provenance, creating the
        # provenance mapping if the record lacks one.
        history = create_enrichment_history(record)
        record.setdefault('provenance', {})
        record['provenance']['enrichment_history'] = history
        stats['backfilled'] += 1
        modified = True
        print(f"{record.get('name', 'Unknown')} (Wikidata: {get_wikidata_id(record)})")

    if not modified:
        print(f" No changes needed")
        return stats

    # Back up the original once; later runs keep the first backup intact.
    backup_path = batch_file.with_suffix('.yaml.pre_enrichment_backfill')
    if not backup_path.exists():
        shutil.copy2(batch_file, backup_path)
        print(f" 💾 Backup: {backup_path.name}")

    # Write updated file
    with open(batch_file, 'w', encoding='utf-8') as handle:
        yaml.dump(
            institutions,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )
    print(f" ✨ Updated: {batch_file.name}")
    return stats
def main():
    """Run the enrichment_history backfill across all Chilean batch files.

    Locates every ``*_enriched.yaml`` file under CHILE_DIR (skipping known
    backup variants), backfills each one, and prints a combined summary.
    """
    print("=" * 70)
    print("Chilean Institutions Enrichment History Backfill")
    print("=" * 70)
    print(f"\nTarget conversation: {CONVERSATION_ID}")
    print(f"Conversation date: {CONVERSATION_DATE}")
    print(f"\nSearching for Chilean batch files in: {CHILE_DIR}")

    # Collect enriched batch files, excluding any backup copies.
    backup_markers = ('.backup', '.pre_v0.2.2_backup', '.pre_enrichment_backfill')
    batch_files = [
        path
        for path in sorted(CHILE_DIR.glob("*_enriched.yaml"))
        if not any(marker in path.name for marker in backup_markers)
    ]
    print(f"Found {len(batch_files)} batch files\n")
    if not batch_files:
        print("❌ No batch files found!")
        return

    # Aggregate per-file stats into one running total.
    totals = {
        'files': len(batch_files),
        'total_institutions': 0,
        'backfilled': 0,
        'skipped': 0
    }
    for path in batch_files:
        file_stats = backfill_batch_file(path)
        totals['total_institutions'] += file_stats['total']
        totals['backfilled'] += file_stats['backfilled']
        totals['skipped'] += file_stats['skipped']

    # Summary
    print("\n" + "=" * 70)
    print("Backfill Summary")
    print("=" * 70)
    print(f"Files processed: {totals['files']}")
    print(f"Total institutions: {totals['total_institutions']}")
    print(f"✅ Backfilled: {totals['backfilled']}")
    print(f"⏭️ Skipped: {totals['skipped']}")
    print()
    if totals['backfilled'] > 0:
        print("✨ Enrichment history successfully backfilled!")
        print("\nNext steps:")
        print("1. Run migration script to verify all institutions now have enrichment_history")
        print("2. Check that 188 → 0 gap reduction achieved")
        print("\nCommand:")
        print(" python scripts/migrate_to_schema_v0.2.2_enrichment.py")
    else:
        print(" No institutions required backfilling")
# Script entry point: run the backfill when executed directly.
if __name__ == '__main__':
    main()