glam/scripts/merge_viaf_mappings.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

160 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Merge manual VIAF mappings from CSV back into Egypt institutions YAML file.
Usage:
python scripts/merge_viaf_mappings.py
Reads:
- data/manual_enrichment/egypt_viaf_mappings.csv (manual VIAF lookups)
- data/instances/egypt_institutions_wikidata_viaf.yaml (current data)
Writes:
- data/instances/egypt_institutions_wikidata_viaf.yaml (updated with new VIAF IDs)
CSV columns:
- institution_id: Full institution URI
- name: Institution name (for reference)
- institution_type: Type (for reference)
- city: Location (for reference)
- viaf_id: VIAF ID (numeric, e.g., 123456789)
- viaf_url: Full VIAF URL (optional, will be generated if missing)
- notes: Additional notes (optional)
- lookup_status: PENDING, FOUND, NOT_FOUND, UNCERTAIN (only rows marked FOUND that also have a viaf_id are merged)
"""
import csv
import yaml
from datetime import datetime, timezone
from pathlib import Path
def load_viaf_mappings(csv_path: Path) -> dict:
    """Load confirmed VIAF mappings from the manual-lookup CSV.

    A row is included only when its ``viaf_id`` is non-empty and its
    ``lookup_status`` equals ``FOUND`` (compared case-insensitively after
    stripping whitespace). Rows with a blank ``institution_id`` are skipped
    because they cannot be matched to any institution. When ``viaf_url`` is
    blank, a URL of the form ``https://viaf.org/viaf/<viaf_id>`` is generated.

    Args:
        csv_path: Path to a CSV file with a header row. ``institution_id``
            is required; ``viaf_id``, ``viaf_url``, ``notes`` and
            ``lookup_status`` are read when present.

    Returns:
        A dict keyed by institution_id. Each value is a dict with the keys
        ``identifier_scheme`` (always ``'VIAF'``), ``identifier_value``,
        ``identifier_url`` and ``notes`` (``''`` when the cell is blank).

    Raises:
        KeyError: If a data row is read and the header has no
            ``institution_id`` column.
    """
    mappings = {}
    # 'utf-8-sig' transparently drops a leading BOM (common in spreadsheet
    # exports) so the first header is not read as '\ufeffinstitution_id'.
    # newline='' lets the csv module handle embedded newlines itself.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills cells missing from short rows with None, so
            # every value is normalised with `or ''` before calling .strip().
            inst_id = (row['institution_id'] or '').strip()
            viaf_id = (row.get('viaf_id') or '').strip()
            status = (row.get('lookup_status') or 'PENDING').strip().upper()

            # Only confirmed lookups that can be keyed are merged.
            if not inst_id or not viaf_id or status != 'FOUND':
                continue

            viaf_url = (row.get('viaf_url') or '').strip()
            if not viaf_url:
                viaf_url = f"https://viaf.org/viaf/{viaf_id}"

            mappings[inst_id] = {
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': viaf_url,
                'notes': (row.get('notes') or '').strip(),
            }
    return mappings
def merge_viaf_into_institutions(yaml_path: Path, viaf_mappings: dict) -> tuple[int, int]:
    """Merge VIAF mappings into the institutions YAML file, in place.

    For every institution whose ``id`` is a key of ``viaf_mappings`` and that
    has no identifier with ``identifier_scheme == 'VIAF'`` yet, a VIAF
    identifier is appended to ``identifiers`` and a ``viaf_enrichment`` entry
    is stored under ``provenance``. Institutions that already carry a VIAF
    identifier are left untouched and counted as skipped. Progress is printed
    to stdout.

    The file is rewritten only when at least one identifier was added.

    Args:
        yaml_path: Path to the YAML file; its top level is expected to be a
            list of institution mappings.
        viaf_mappings: Dict keyed by institution id whose values provide
            ``identifier_scheme``, ``identifier_value``, ``identifier_url``
            and optionally ``notes``.

    Returns:
        (added_count, skipped_count) tuple

    Raises:
        ValueError: If the YAML document is neither empty nor a list.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # An empty YAML document loads as None: nothing to enrich, nothing to write.
    if institutions is None:
        return 0, 0
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a list of institutions in {yaml_path}, "
            f"got {type(institutions).__name__}"
        )

    added_count = 0
    skipped_count = 0
    # One timestamp for the whole run so all records of a merge agree.
    enrichment_date = datetime.now(timezone.utc).isoformat()

    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id not in viaf_mappings:
            continue

        # The name is used for log output only; fall back to the id so a
        # record without a name does not abort the merge.
        display_name = inst.get('name') or inst_id

        # Check if VIAF already exists
        has_viaf = any(
            identifier.get('identifier_scheme') == 'VIAF'
            for identifier in (inst.get('identifiers') or [])
        )
        if has_viaf:
            print(f"⏭️ Skipping {display_name} - already has VIAF")
            skipped_count += 1
            continue

        # Add VIAF identifier
        viaf_mapping = viaf_mappings[inst_id]
        if not inst.get('identifiers'):
            inst['identifiers'] = []
        inst['identifiers'].append({
            'identifier_scheme': viaf_mapping['identifier_scheme'],
            'identifier_value': viaf_mapping['identifier_value'],
            'identifier_url': viaf_mapping['identifier_url'],
        })

        # Add enrichment metadata to provenance
        if not inst.get('provenance'):
            inst['provenance'] = {}
        inst['provenance']['viaf_enrichment'] = {
            'method': 'Manual VIAF web lookup',
            'enrichment_date': enrichment_date,
            'verified': True,
            # `or` rather than a .get() default: the mapping may carry an
            # empty-string note, which should also trigger the fallback.
            'notes': viaf_mapping.get('notes') or 'Manual lookup via VIAF website',
        }

        print(f"✅ Added VIAF {viaf_mapping['identifier_value']} to {display_name}")
        added_count += 1

    if added_count:
        # Serialise before opening the target: opening with 'w' truncates the
        # file, so a dump error must not be able to happen after that point.
        serialized = yaml.dump(institutions,
                               allow_unicode=True,
                               default_flow_style=False,
                               sort_keys=False,
                               width=100)
        with open(yaml_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return added_count, skipped_count
def main():
    """Run the VIAF merge against the repository-relative default paths.

    Prints progress and a final summary to stdout. Returns early, without
    raising, when either input file is missing or when no mappings were
    loaded from the CSV.
    """
    mappings_csv = Path('data/manual_enrichment/egypt_viaf_mappings.csv')
    institutions_yaml = Path('data/instances/egypt_institutions_wikidata_viaf.yaml')

    # Both inputs must exist before any work starts.
    if not mappings_csv.exists():
        print(f"❌ CSV file not found: {mappings_csv}")
        return
    if not institutions_yaml.exists():
        print(f"❌ YAML file not found: {institutions_yaml}")
        return

    print(f"📂 Loading VIAF mappings from {mappings_csv}...")
    viaf_mappings = load_viaf_mappings(mappings_csv)
    print(f"✅ Loaded {len(viaf_mappings)} VIAF mappings")

    if len(viaf_mappings) == 0:
        print("⚠️ No VIAF mappings with status=FOUND in CSV")
        print(" Please update the CSV with VIAF IDs and set lookup_status=FOUND")
        return

    print(f"\n📂 Merging into {institutions_yaml}...")
    added, skipped = merge_viaf_into_institutions(institutions_yaml, viaf_mappings)

    # Summary framed by a 60-character rule.
    rule = '=' * 60
    summary_lines = (
        f"\n{rule}",
        "✅ Merge complete!",
        f" Added: {added}",
        f" Skipped: {skipped}",
        f" Total mappings processed: {len(viaf_mappings)}",
        f"{rule}",
    )
    for line in summary_lines:
        print(line)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()