- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
249 lines
8.8 KiB
Python
Executable file
249 lines
8.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Backfill enrichment_history for 188 Chilean institutions with Wikidata IDs.
|
||
|
||
These institutions were enriched during the Chilean GLAM research conversation
|
||
(edc75d66-ee42-4199-8e22-65b0d2347922) but lack enrichment_history metadata.
|
||
|
||
This script:
|
||
1. Identifies institutions with Wikidata IDs but no enrichment_history
|
||
2. Confirms they're from the Chilean research conversation
|
||
3. Creates enrichment_history entries documenting the enrichment process
|
||
4. Preserves original provenance metadata
|
||
5. Backs up files before modification
|
||
"""
|
||
|
||
import yaml
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from typing import List, Dict, Any
|
||
import shutil
|
||
|
||
# Constants
CHILE_DIR = Path("/Users/kempersc/apps/glam/data/instances/chile")  # directory holding the Chilean batch YAML files
CONVERSATION_ID = "edc75d66-ee42-4199-8e22-65b0d2347922"  # the Chilean GLAM research conversation being backfilled
CONVERSATION_DATE = "2025-09-22T14:43:14Z"  # fallback enrichment timestamp when a record lacks extraction_date

# Key enrichment sources from the conversation
# NOTE(review): this list appears unused elsewhere in this script — it seems to
# be reference documentation of the platforms consulted; confirm before removing.
ENRICHMENT_SOURCES = [
    "https://www.wikidata.org",  # Wikidata (primary)
    "https://www.surdoc.cl",  # Sistema Unificado de Registros (museum catalog)
    "https://sinarchile.archivonacional.gob.cl",  # Sistema Nacional de Archivos
    "http://www.memoriachilena.gob.cl",  # Memoria Chilena digital heritage
    "https://www.registromuseoschile.cl",  # Registro de Museos de Chile
    "https://www.archivonacional.gob.cl",  # Archivo Nacional
]
|
||
|
||
|
||
def needs_backfill(institution: Dict[str, Any]) -> bool:
    """Report whether an institution record should receive enrichment_history.

    A record qualifies only when all three conditions hold:
      * it carries a Wikidata identifier,
      * its provenance does not already contain 'enrichment_history', and
      * its provenance conversation_id matches the Chilean research conversation.
    """
    # Must have a Wikidata identifier.
    identifiers = institution.get('identifiers', [])
    if not any(ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers):
        return False

    # Must lack enrichment_history.
    provenance = institution.get('provenance', {})
    if 'enrichment_history' in provenance:
        return False

    # Must originate from the Chilean research conversation.
    return provenance.get('conversation_id') == CONVERSATION_ID
|
||
|
||
|
||
def get_wikidata_id(institution: Dict[str, Any]) -> str:
    """Return the institution's Wikidata Q-number, or '' when none is listed."""
    wikidata_values = (
        ident.get('identifier_value', '')
        for ident in institution.get('identifiers', [])
        if ident.get('identifier_scheme') == 'Wikidata'
    )
    # First Wikidata identifier wins, mirroring a scan-and-return loop.
    return next(wikidata_values, '')
|
||
|
||
|
||
def create_enrichment_history(institution: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Create the single enrichment_history entry for an institution.

    Builds a one-element list (the schema expects a list of history entries)
    documenting when and how the Wikidata identifier and national-platform
    sources were obtained during the Chilean research conversation.

    Args:
        institution: Institution record; expected to carry a Wikidata
            identifier (see ``needs_backfill``).

    Returns:
        A list with one enrichment_history dict: date, method, sources, notes.
    """
    wikidata_id = get_wikidata_id(institution)

    # Use extraction_date from provenance as the enrichment timestamp,
    # falling back to the conversation date when it is missing or empty.
    extraction_date = institution.get('provenance', {}).get('extraction_date', '')
    if not extraction_date:
        extraction_date = CONVERSATION_DATE

    # Wikidata is always the primary source.
    sources = [f"https://www.wikidata.org/wiki/{wikidata_id}"]

    # Map URL fragments to canonical national-platform source URLs.
    # Matching is case-insensitive for every marker (the original check for
    # 'surdoc.cl' was case-sensitive by accident while the others lowered the
    # URL first). At most one source is credited per platform entry,
    # preserving the original if/elif semantics.
    platform_markers = [
        ('surdoc.cl', 'https://www.surdoc.cl'),
        ('sinar', 'https://sinarchile.archivonacional.gob.cl'),
        ('memoriachilena', 'http://www.memoriachilena.gob.cl'),
    ]
    for platform in institution.get('digital_platforms', []):
        platform_url = platform.get('platform_url', '').lower()
        for marker, source_url in platform_markers:
            if marker in platform_url:
                if source_url not in sources:
                    sources.append(source_url)
                break  # one matched source per platform entry

    # Archives additionally credit the Archivo Nacional.
    if institution.get('institution_type') == 'ARCHIVE':
        if 'https://www.archivonacional.gob.cl' not in sources:
            sources.append('https://www.archivonacional.gob.cl')

    # Human-readable narrative of how the enrichment was performed.
    enrichment_notes = f"""Wikidata identifier obtained during comprehensive Chilean GLAM research conversation
(ID: {CONVERSATION_ID}, date: {CONVERSATION_DATE.split('T')[0]}). Research involved cross-referencing
multiple national heritage platforms including SURDOC (museum catalog system), SINAR (national archives
system), Memoria Chilena (digital heritage collection), and institutional websites. The conversation
analyzed 695+ library services, 500,000+ archival records, and 72,000+ catalogued museum objects to
identify and validate Chilean heritage institutions."""

    return [{
        'enrichment_date': extraction_date,
        'enrichment_method': 'Conversation-based research with Wikidata verification and national platform cross-referencing',
        'enrichment_source': sources,
        'enrichment_notes': enrichment_notes
    }]
|
||
|
||
|
||
def backfill_batch_file(batch_file: Path) -> Dict[str, int]:
    """Backfill enrichment_history for one YAML batch file.

    Loads the file, adds an enrichment_history entry to every institution
    that qualifies (per ``needs_backfill``), and — only when something
    actually changed — writes a one-time backup and saves the file in place.

    Args:
        batch_file: Path to a ``*_enriched.yaml`` batch file.

    Returns:
        Counts for this file: 'total', 'backfilled', 'skipped'.
    """
    stats = {'total': 0, 'backfilled': 0, 'skipped': 0}

    print(f"\nProcessing: {batch_file.name}")

    # Load institutions.
    with open(batch_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    if not institutions:
        print(f" ⚠️ Empty file or failed to load")
        return stats

    stats['total'] = len(institutions)
    changed = False

    # Attach enrichment_history to every qualifying institution.
    for inst in institutions:
        if not needs_backfill(inst):
            stats['skipped'] += 1
            continue

        # Create the provenance mapping if the record lacks one, then attach.
        inst.setdefault('provenance', {})['enrichment_history'] = create_enrichment_history(inst)

        stats['backfilled'] += 1
        changed = True

        # Log
        name = inst.get('name', 'Unknown')
        wikidata_id = get_wikidata_id(inst)
        print(f" ✅ {name} (Wikidata: {wikidata_id})")

    if not changed:
        print(f" ℹ️ No changes needed")
        return stats

    # One-time backup: never overwrite an existing pre-backfill copy.
    backup_path = batch_file.with_suffix('.yaml.pre_enrichment_backfill')
    if not backup_path.exists():
        shutil.copy2(batch_file, backup_path)
        print(f" 💾 Backup: {backup_path.name}")

    # Write the updated file in place.
    with open(batch_file, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )
    print(f" ✨ Updated: {batch_file.name}")

    return stats
|
||
|
||
|
||
def main():
    """Run the backfill over every Chilean enriched batch file and report totals."""
    banner = "=" * 70
    print(banner)
    print("Chilean Institutions Enrichment History Backfill")
    print(banner)
    print(f"\nTarget conversation: {CONVERSATION_ID}")
    print(f"Conversation date: {CONVERSATION_DATE}")
    print(f"\nSearching for Chilean batch files in: {CHILE_DIR}")

    # Collect enriched batch files, dropping backup copies left behind by
    # earlier migration or backfill runs.
    backup_markers = ('.backup', '.pre_v0.2.2_backup', '.pre_enrichment_backfill')
    batch_files = [
        path for path in sorted(CHILE_DIR.glob("*_enriched.yaml"))
        if not any(marker in path.name for marker in backup_markers)
    ]

    print(f"Found {len(batch_files)} batch files\n")

    if not batch_files:
        print("❌ No batch files found!")
        return

    # Aggregate per-file stats across the whole run.
    totals = {
        'files': len(batch_files),
        'total_institutions': 0,
        'backfilled': 0,
        'skipped': 0
    }

    for batch_file in batch_files:
        file_stats = backfill_batch_file(batch_file)
        totals['total_institutions'] += file_stats['total']
        totals['backfilled'] += file_stats['backfilled']
        totals['skipped'] += file_stats['skipped']

    # Summary
    print("\n" + banner)
    print("Backfill Summary")
    print(banner)
    print(f"Files processed: {totals['files']}")
    print(f"Total institutions: {totals['total_institutions']}")
    print(f"✅ Backfilled: {totals['backfilled']}")
    print(f"⏭️ Skipped: {totals['skipped']}")
    print()

    if totals['backfilled'] > 0:
        print("✨ Enrichment history successfully backfilled!")
        print("\nNext steps:")
        print("1. Run migration script to verify all institutions now have enrichment_history")
        print("2. Check that 188 → 0 gap reduction achieved")
        print("\nCommand:")
        print(" python scripts/migrate_to_schema_v0.2.2_enrichment.py")
    else:
        print("ℹ️ No institutions required backfilling")
|
||
|
||
# Script entry point: run the full backfill when executed directly.
if __name__ == '__main__':
    main()
|