- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
271 lines
10 KiB
Python
Executable file
271 lines
10 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Backfill enrichment_history for North African institutions with Wikidata IDs.
|
||
|
||
Handles Tunisia (2), Algeria (1), and Libya (8) institutions that were enriched
|
||
during GLAM research conversations but lack enrichment_history metadata.
|
||
|
||
This script:
|
||
1. Identifies institutions with Wikidata IDs but no enrichment_history
|
||
2. Confirms they're from the correct research conversation
|
||
3. Creates enrichment_history entries documenting the enrichment process
|
||
4. Preserves original provenance metadata
|
||
5. Backs up files before modification
|
||
"""
|
||
|
||
import yaml
|
||
from pathlib import Path
|
||
from typing import List, Dict, Any
|
||
import shutil
|
||
|
||
# Country configurations
|
||
# Country configurations
# Each entry maps a country key to:
#   dir                - instance-data directory holding that country's YAML files
#   conversation_id    - ID of the GLAM research conversation that produced the data
#   conversation_date  - ISO-8601 timestamp of that conversation (also the fallback
#                        enrichment date when a record lacks extraction_date)
#   enrichment_context - notes template; {conv_id} and {conv_date} are filled in
#                        by create_enrichment_history()
COUNTRY_CONFIGS = {
    'tunisia': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/tunisia"),
        'conversation_id': '89ad670e-c3b3-491f-9b86-e8e612493072',
        'conversation_date': '2025-09-22T14:49:26Z',
        'conversation_name': 'Tunisian_GLAM_resource_inventory',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Tunisian GLAM research conversation
(ID: {conv_id}, date: {conv_date}). Research involved analyzing Tunisia's cultural heritage
landscape including museums, libraries, archives, and heritage sites. Cross-referencing was performed
against national heritage databases, institutional websites, and Wikidata to ensure accurate identification
and validation of institutions."""
    },
    'algeria': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/algeria"),
        'conversation_id': '039a271a-f8e3-4bf3-9e89-b289ec80701d',
        'conversation_date': '2025-09-22T14:48:54Z',
        'conversation_name': 'Comprehensive_GLAM_resources_in_Algeria',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Algerian GLAM research conversation
(ID: {conv_id}, date: {conv_date}). Research examined Algeria's extensive cultural heritage network
including the National Library, university libraries, historical archives, and archaeological museums.
Verification was performed through cross-referencing with institutional websites, national heritage
platforms, and Wikidata entities."""
    },
    'libya': {
        'dir': Path("/Users/kempersc/apps/glam/data/instances/libya"),
        'conversation_id': 'd06ded03-ba79-4b79-b068-406c2da01f8c',
        'conversation_date': '2025-09-22T14:49:44Z',
        'conversation_name': 'Libyan_cultural_heritage_resources',
        'enrichment_context': """Wikidata identifier obtained during comprehensive Libyan cultural heritage research conversation
(ID: {conv_id}, date: {conv_date}). Research covered Libya's heritage institutions including
archaeological museums, UNESCO World Heritage Sites (Cyrene, Leptis Magna, Sabratha, Tadrart Acacus),
university collections, and historical archives. Cross-referencing was performed against UNESCO databases,
institutional records, and Wikidata to validate heritage sites and institutions."""
    }
}
|
||
|
||
|
||
def needs_backfill(institution: Dict[str, Any], conversation_id: str) -> bool:
    """Decide whether an institution record should receive a backfilled
    enrichment_history entry.

    All three conditions must hold:
      1. the record carries a Wikidata identifier,
      2. its provenance has no enrichment_history yet, and
      3. its provenance conversation_id matches the target conversation.

    Args:
        institution: Institution mapping loaded from a country YAML file.
        conversation_id: ID of the research conversation being backfilled.

    Returns:
        True when the record qualifies for backfill, False otherwise.
    """
    identifiers = institution.get('identifiers', [])
    has_wikidata = any(
        ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
    )
    if not has_wikidata:
        return False

    provenance = institution.get('provenance', {})
    if 'enrichment_history' in provenance:
        # Already documented; nothing to backfill.
        return False

    # Only touch records that came from the target conversation.
    return provenance.get('conversation_id') == conversation_id
|
||
|
||
|
||
def get_wikidata_id(institution: Dict[str, Any]) -> str:
    """Return the institution's Wikidata Q-number, or '' when none is present.

    Scans the record's identifiers in order and returns the value of the
    first one whose scheme is 'Wikidata'.
    """
    matches = (
        ident.get('identifier_value', '')
        for ident in institution.get('identifiers', [])
        if ident.get('identifier_scheme') == 'Wikidata'
    )
    return next(matches, '')
|
||
|
||
|
||
def create_enrichment_history(
    institution: Dict[str, Any],
    config: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """Build a single-entry enrichment_history list for an institution.

    The entry documents when the Wikidata enrichment happened, which sources
    back it (the Wikidata page plus any distinct digital-platform URLs), and
    a country-specific narrative rendered from the config template.

    Args:
        institution: Institution mapping; its identifiers, provenance, and
            digital_platforms are consulted.
        config: COUNTRY_CONFIGS entry for the institution's country.

    Returns:
        A one-element list suitable for provenance['enrichment_history'].
    """
    wikidata_id = get_wikidata_id(institution)

    # Prefer the record's own extraction_date; fall back to the conversation
    # date when it is missing or empty.
    enrichment_date = (
        institution.get('provenance', {}).get('extraction_date', '')
        or config['conversation_date']
    )

    # Wikidata page first, then any platform URLs not already listed.
    sources = [f"https://www.wikidata.org/wiki/{wikidata_id}"]
    for platform in institution.get('digital_platforms', []):
        url = platform.get('platform_url', '')
        if url and url not in sources:
            sources.append(url)

    # Render the narrative; only the date portion of the timestamp is shown.
    notes = config['enrichment_context'].format(
        conv_id=config['conversation_id'],
        conv_date=config['conversation_date'].split('T')[0]
    )

    return [{
        'enrichment_date': enrichment_date,
        'enrichment_method': 'Conversation-based research with Wikidata verification and institutional cross-referencing',
        'enrichment_source': sources,
        'enrichment_notes': notes
    }]
|
||
|
||
|
||
def backfill_country(country_name: str, config: Dict[str, Any]) -> Dict[str, int]:
    """Backfill enrichment_history for every qualifying institution in a country.

    Scans the country's YAML files (skipping backup files), adds an
    enrichment_history entry to each institution that needs_backfill()
    approves, backs up each modified file once, and rewrites it in place.

    Args:
        country_name: COUNTRY_CONFIGS key; used only for console output.
        config: Country configuration dict with 'dir', 'conversation_id',
            'conversation_date', and 'enrichment_context'.

    Returns:
        Counters: 'files' examined, 'total' institutions seen,
        'backfilled' institutions updated, 'skipped' institutions untouched.
    """
    stats = {
        'files': 0,
        'total': 0,
        'backfilled': 0,
        'skipped': 0
    }

    print(f"\n{'='*70}")
    print(f"Processing: {country_name.upper()}")
    print(f"{'='*70}")
    print(f"Conversation ID: {config['conversation_id']}")
    print(f"Directory: {config['dir']}")

    if not config['dir'].exists():
        print(f"❌ Directory not found!")
        return stats

    # Find all YAML files (exclude backups)
    yaml_files = [
        f for f in config['dir'].glob("*.yaml")
        if not any(suffix in f.name for suffix in ['.backup', '.pre_', '.bak'])
    ]

    if not yaml_files:
        print(f"❌ No YAML files found!")
        return stats

    print(f"Found {len(yaml_files)} file(s)\n")

    for yaml_file in yaml_files:
        print(f"File: {yaml_file.name}")
        stats['files'] += 1

        # Load institutions
        with open(yaml_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)

        # Each file is expected to hold a YAML list of institution mappings.
        if not institutions or not isinstance(institutions, list):
            print(f" ⚠️ Empty or invalid file")
            continue

        stats['total'] += len(institutions)
        modified = False

        # Process each institution
        for inst in institutions:
            # Skip stray non-mapping entries without counting them as skipped.
            if not isinstance(inst, dict):
                continue

            if needs_backfill(inst, config['conversation_id']):
                # Create enrichment_history
                enrichment_history = create_enrichment_history(inst, config)

                # Add to provenance
                if 'provenance' not in inst:
                    inst['provenance'] = {}

                inst['provenance']['enrichment_history'] = enrichment_history

                stats['backfilled'] += 1
                modified = True

                # Log
                name = inst.get('name', 'Unknown')
                wikidata_id = get_wikidata_id(inst)
                print(f" ✅ {name} (Wikidata: {wikidata_id})")
            else:
                stats['skipped'] += 1

        # Save if modified
        if modified:
            # Backup original
            # with_suffix replaces '.yaml', yielding
            # '<stem>.yaml.pre_enrichment_backfill'; written at most once so
            # re-runs never overwrite the pristine pre-backfill copy.
            backup_path = yaml_file.with_suffix('.yaml.pre_enrichment_backfill')
            if not backup_path.exists():
                shutil.copy2(yaml_file, backup_path)
                print(f" 💾 Backup: {backup_path.name}")

            # Write updated file
            with open(yaml_file, 'w', encoding='utf-8') as f:
                yaml.dump(
                    institutions,
                    f,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                    width=120
                )
            print(f" ✨ Updated: {yaml_file.name}")
        else:
            print(f" ℹ️ No changes needed")

    return stats
|
||
|
||
|
||
def main():
    """Run the enrichment-history backfill across all configured countries
    and print a consolidated summary."""
    divider = "=" * 70
    print(divider)
    print("North Africa Institutions Enrichment History Backfill")
    print(divider)
    print("\nCountries: Tunisia (2), Algeria (1), Libya (8)")
    print("Total expected backfills: 11 institutions\n")

    # Aggregate counters across all countries.
    countries_done = 0
    files_done = 0
    institutions_seen = 0
    backfilled = 0
    skipped = 0

    for country_name, config in COUNTRY_CONFIGS.items():
        result = backfill_country(country_name, config)
        countries_done += 1
        files_done += result['files']
        institutions_seen += result['total']
        backfilled += result['backfilled']
        skipped += result['skipped']

    # Summary
    print("\n" + divider)
    print("Backfill Summary")
    print(divider)
    print(f"Countries processed: {countries_done}")
    print(f"Files processed: {files_done}")
    print(f"Total institutions: {institutions_seen}")
    print(f"✅ Backfilled: {backfilled}")
    print(f"⏭️ Skipped: {skipped}")
    print()

    if backfilled > 0:
        print("✨ Enrichment history successfully backfilled!")
        print("\nNext step: Verify gap closure with validation script")
    else:
        print("ℹ️ No institutions required backfilling")
||
# Entry point: run the backfill only when executed as a script,
# not when imported.
if __name__ == '__main__':
    main()
|