glam/scripts/backfill_north_africa_enrichment_history.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

271 lines
10 KiB
Python
Executable file
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Backfill enrichment_history for North African institutions with Wikidata IDs.
Handles Tunisia (2), Algeria (1), and Libya (8) institutions that were enriched
during GLAM research conversations but lack enrichment_history metadata.
This script:
1. Identifies institutions with Wikidata IDs but no enrichment_history
2. Confirms they're from the correct research conversation
3. Creates enrichment_history entries documenting the enrichment process
4. Preserves original provenance metadata
5. Backs up files before modification
"""
import yaml
from pathlib import Path
from typing import List, Dict, Any
import shutil
# Country configurations
#
# Each entry maps a country key to:
#   dir                - directory holding that country's institution YAML files
#   conversation_id    - ID of the research conversation the records came from
#   conversation_date  - ISO timestamp of that conversation; also used as the
#                        fallback enrichment date when a record lacks extraction_date
#   conversation_name  - human-readable conversation title (informational)
#   enrichment_context - template for enrichment_notes; formatted later with
#                        {conv_id} and {conv_date} placeholders
COUNTRY_CONFIGS = {
    'tunisia': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/tunisia"),
        'conversation_id': '89ad670e-c3b3-491f-9b86-e8e612493072',
        'conversation_date': '2025-09-22T14:49:26Z',
        'conversation_name': 'Tunisian_GLAM_resource_inventory',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Tunisian GLAM research conversation
(ID: {conv_id}, date: {conv_date}). Research involved analyzing Tunisia's cultural heritage
landscape including museums, libraries, archives, and heritage sites. Cross-referencing was performed
against national heritage databases, institutional websites, and Wikidata to ensure accurate identification
and validation of institutions."""
    },
    'algeria': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/algeria"),
        'conversation_id': '039a271a-f8e3-4bf3-9e89-b289ec80701d',
        'conversation_date': '2025-09-22T14:48:54Z',
        'conversation_name': 'Comprehensive_GLAM_resources_in_Algeria',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Algerian GLAM research conversation
(ID: {conv_id}, date: {conv_date}). Research examined Algeria's extensive cultural heritage network
including the National Library, university libraries, historical archives, and archaeological museums.
Verification was performed through cross-referencing with institutional websites, national heritage
platforms, and Wikidata entities."""
    },
    'libya': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/libya"),
        'conversation_id': 'd06ded03-ba79-4b79-b068-406c2da01f8c',
        'conversation_date': '2025-09-22T14:49:44Z',
        'conversation_name': 'Libyan_cultural_heritage_resources',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Libyan cultural heritage research conversation
(ID: {conv_id}, date: {conv_date}). Research covered Libya's heritage institutions including
archaeological museums, UNESCO World Heritage Sites (Cyrene, Leptis Magna, Sabratha, Tadrart Acacus),
university collections, and historical archives. Cross-referencing was performed against UNESCO databases,
institutional records, and Wikidata to validate heritage sites and institutions."""
    }
}
def needs_backfill(institution: Dict[str, Any], conversation_id: str) -> bool:
    """Check if institution needs enrichment_history backfill.

    An institution qualifies only when all three conditions hold:
    1. It carries at least one identifier with scheme 'Wikidata'.
    2. Its provenance block lacks an 'enrichment_history' key
       (avoids double-backfilling on repeated runs).
    3. Its provenance conversation_id matches the target conversation.

    Args:
        institution: Parsed institution record (one YAML mapping).
        conversation_id: ID of the research conversation being backfilled.

    Returns:
        True if the record should receive an enrichment_history entry.
    """
    # Must have a Wikidata identifier (any() replaces the manual loop+break).
    has_wikidata = any(
        identifier.get('identifier_scheme') == 'Wikidata'
        for identifier in institution.get('identifiers', [])
    )
    if not has_wikidata:
        return False
    provenance = institution.get('provenance', {})
    # Must lack enrichment_history
    if 'enrichment_history' in provenance:
        return False
    # Must be from target conversation
    return provenance.get('conversation_id') == conversation_id
def get_wikidata_id(institution: Dict[str, Any]) -> str:
    """Extract Wikidata Q-number from identifiers.

    Returns the identifier_value of the first identifier whose scheme is
    'Wikidata', or an empty string when none is present.
    """
    wikidata_entries = (
        entry for entry in institution.get('identifiers', [])
        if entry.get('identifier_scheme') == 'Wikidata'
    )
    first_match = next(wikidata_entries, None)
    return first_match.get('identifier_value', '') if first_match is not None else ''
def create_enrichment_history(
    institution: Dict[str, Any],
    config: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """Create enrichment_history entry for institution.

    Builds a single-element list documenting when the Wikidata enrichment
    happened, how it was done, which sources back it, and the research
    context (rendered from the country's enrichment_context template).
    """
    # Locate the Wikidata Q-number among the record's identifiers.
    wikidata_id = ''
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') == 'Wikidata':
            wikidata_id = entry.get('identifier_value', '')
            break

    # Prefer the recorded extraction_date; fall back to the conversation date.
    provenance = institution.get('provenance', {})
    enrichment_date = provenance.get('extraction_date', '') or config['conversation_date']

    # Primary source is the Wikidata page; append any digital-platform URLs
    # the record references, skipping duplicates.
    sources = [f"https://www.wikidata.org/wiki/{wikidata_id}"]
    for platform in institution.get('digital_platforms', []):
        platform_url = platform.get('platform_url', '')
        if platform_url and platform_url not in sources:
            sources.append(platform_url)

    # Render the country-specific context template (date trimmed to YYYY-MM-DD).
    enrichment_notes = config['enrichment_context'].format(
        conv_id=config['conversation_id'],
        conv_date=config['conversation_date'].split('T')[0],
    )

    return [{
        'enrichment_date': enrichment_date,
        'enrichment_method': 'Conversation-based research with Wikidata verification and institutional cross-referencing',
        'enrichment_source': sources,
        'enrichment_notes': enrichment_notes,
    }]
def backfill_country(country_name: str, config: Dict[str, Any]) -> Dict[str, int]:
    """Process all files for a country and backfill enrichment_history.

    Scans every non-backup YAML file in the country's instance directory,
    adds an enrichment_history entry to each qualifying institution (see
    needs_backfill), backs up each modified file once, and rewrites it.

    Args:
        country_name: Key from COUNTRY_CONFIGS (used only for display).
        config: That country's configuration mapping (dir, conversation_id,
            conversation_date, enrichment_context, ...).

    Returns:
        Counters: 'files' processed, 'total' institutions seen,
        'backfilled' records modified, 'skipped' records left alone.
    """
    stats = {
        'files': 0,
        'total': 0,
        'backfilled': 0,
        'skipped': 0
    }
    print(f"\n{'='*70}")
    print(f"Processing: {country_name.upper()}")
    print(f"{'='*70}")
    print(f"Conversation ID: {config['conversation_id']}")
    print(f"Directory: {config['dir']}")
    # Missing directory: report and return zeroed stats rather than raising.
    if not config['dir'].exists():
        print(f"❌ Directory not found!")
        return stats
    # Find all YAML files (exclude backups created by this or earlier scripts)
    yaml_files = [
        f for f in config['dir'].glob("*.yaml")
        if not any(suffix in f.name for suffix in ['.backup', '.pre_', '.bak'])
    ]
    if not yaml_files:
        print(f"❌ No YAML files found!")
        return stats
    print(f"Found {len(yaml_files)} file(s)\n")
    for yaml_file in yaml_files:
        print(f"File: {yaml_file.name}")
        stats['files'] += 1
        # Load institutions (each file is expected to be a YAML list of mappings)
        with open(yaml_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)
        if not institutions or not isinstance(institutions, list):
            print(f" ⚠️ Empty or invalid file")
            continue
        stats['total'] += len(institutions)
        modified = False
        # Process each institution
        for inst in institutions:
            # Skip malformed list entries that aren't mappings.
            if not isinstance(inst, dict):
                continue
            if needs_backfill(inst, config['conversation_id']):
                # Create enrichment_history
                enrichment_history = create_enrichment_history(inst, config)
                # Add to provenance (create the block if the record lacks one)
                if 'provenance' not in inst:
                    inst['provenance'] = {}
                inst['provenance']['enrichment_history'] = enrichment_history
                stats['backfilled'] += 1
                modified = True
                # Log
                name = inst.get('name', 'Unknown')
                wikidata_id = get_wikidata_id(inst)
                print(f"{name} (Wikidata: {wikidata_id})")
            else:
                stats['skipped'] += 1
        # Save if modified
        if modified:
            # Backup original once; never overwrite an existing backup so the
            # pre-backfill state survives repeated runs.
            backup_path = yaml_file.with_suffix('.yaml.pre_enrichment_backfill')
            if not backup_path.exists():
                shutil.copy2(yaml_file, backup_path)
                print(f" 💾 Backup: {backup_path.name}")
            # Write updated file (preserve key order and unicode text)
            with open(yaml_file, 'w', encoding='utf-8') as f:
                yaml.dump(
                    institutions,
                    f,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                    width=120
                )
            print(f" ✨ Updated: {yaml_file.name}")
        else:
            print(f" No changes needed")
    return stats
def main():
    """Main backfill process.

    Runs the backfill for every country in COUNTRY_CONFIGS, aggregates
    per-country statistics, and prints a run-wide summary.
    """
    banner = "=" * 70
    print(banner)
    print("North Africa Institutions Enrichment History Backfill")
    print(banner)
    print("\nCountries: Tunisia (2), Algeria (1), Libya (8)")
    print("Total expected backfills: 11 institutions\n")

    # Run-wide tally accumulated across countries.
    total_stats = dict.fromkeys(
        ('countries', 'files', 'total_institutions', 'backfilled', 'skipped'), 0
    )
    for country_name, config in COUNTRY_CONFIGS.items():
        country_stats = backfill_country(country_name, config)
        total_stats['countries'] += 1
        total_stats['files'] += country_stats['files']
        total_stats['total_institutions'] += country_stats['total']
        total_stats['backfilled'] += country_stats['backfilled']
        total_stats['skipped'] += country_stats['skipped']

    # Final summary
    print("\n" + banner)
    print("Backfill Summary")
    print(banner)
    print(f"Countries processed: {total_stats['countries']}")
    print(f"Files processed: {total_stats['files']}")
    print(f"Total institutions: {total_stats['total_institutions']}")
    print(f"✅ Backfilled: {total_stats['backfilled']}")
    print(f"⏭️ Skipped: {total_stats['skipped']}")
    print()
    if total_stats['backfilled'] > 0:
        print("✨ Enrichment history successfully backfilled!")
        print("\nNext step: Verify gap closure with validation script")
    else:
        print(" No institutions required backfilling")
if __name__ == '__main__':
    main()