- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
160 lines
5.3 KiB
Python
#!/usr/bin/env python3
"""
Merge manual VIAF mappings from CSV back into Egypt institutions YAML file.

Usage:
    python scripts/merge_viaf_mappings.py

Reads:
- data/manual_enrichment/egypt_viaf_mappings.csv (manual VIAF lookups)
- data/instances/egypt_institutions_wikidata_viaf.yaml (current data)

Writes:
- data/instances/egypt_institutions_wikidata_viaf.yaml (updated with new VIAF IDs)

CSV columns:
- institution_id: Full institution URI
- name: Institution name (for reference)
- institution_type: Type (for reference)
- city: Location (for reference)
- viaf_id: VIAF ID (numeric, e.g., 123456789)
- viaf_url: Full VIAF URL (optional, will be generated if missing)
- notes: Additional notes (optional)
- lookup_status: PENDING, FOUND, NOT_FOUND, UNCERTAIN
"""

import csv
from datetime import datetime, timezone
from pathlib import Path

import yaml

|
def load_viaf_mappings(csv_path: Path) -> dict:
    """Load confirmed VIAF mappings from CSV, keyed by institution_id.

    Only rows with a non-empty ``viaf_id`` AND ``lookup_status == FOUND``
    (case-insensitive) are returned; PENDING/NOT_FOUND/UNCERTAIN rows are
    skipped.

    Args:
        csv_path: Path to the manual-enrichment CSV (columns documented in
            the module docstring).

    Returns:
        Dict mapping institution_id -> dict with keys identifier_scheme,
        identifier_value, identifier_url and notes.
    """
    mappings = {}

    # newline='' per the csv module docs: lets the csv reader handle
    # embedded newlines in quoted fields correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # `or ''` guards against None: DictReader fills fields missing
            # from a short row with None, which would break .strip().
            inst_id = (row.get('institution_id') or '').strip()
            viaf_id = (row.get('viaf_id') or '').strip()
            status = (row.get('lookup_status') or 'PENDING').strip().upper()

            # Only process if VIAF ID found
            if viaf_id and status == 'FOUND':
                viaf_url = (row.get('viaf_url') or '').strip()
                if not viaf_url:
                    # Generate the canonical VIAF URL when the column is empty.
                    viaf_url = f"https://viaf.org/viaf/{viaf_id}"

                mappings[inst_id] = {
                    'identifier_scheme': 'VIAF',
                    'identifier_value': viaf_id,
                    'identifier_url': viaf_url,
                    'notes': (row.get('notes') or '').strip()
                }

    return mappings
|
|
|
|
|
|
def merge_viaf_into_institutions(yaml_path: Path, viaf_mappings: dict) -> tuple[int, int]:
    """Merge VIAF mappings into institutions YAML file.

    For every institution whose ``id`` appears in ``viaf_mappings``, append
    a VIAF identifier entry (unless one is already present) and record the
    enrichment in the institution's provenance block, then rewrite the YAML
    file in place.

    Args:
        yaml_path: Path to the institutions YAML (a list of institution dicts).
        viaf_mappings: Mapping of institution id -> identifier dict, as
            produced by load_viaf_mappings().

    Returns:
        (added_count, skipped_count) tuple
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    added_count = 0
    skipped_count = 0
    # One timestamp for the whole run so all enriched records share one date.
    enrichment_date = datetime.now(timezone.utc).isoformat()

    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id not in viaf_mappings:
            continue

        # Check if VIAF already exists; `or []` also covers an explicit
        # null `identifiers:` entry in the YAML.
        if any(identifier.get('identifier_scheme') == 'VIAF'
               for identifier in inst.get('identifiers') or []):
            print(f"⏭️ Skipping {inst['name']} - already has VIAF")
            skipped_count += 1
            continue

        viaf_mapping = viaf_mappings[inst_id]

        # Add VIAF identifier (normalizing a missing/null list to []).
        identifiers = inst.get('identifiers') or []
        identifiers.append({
            'identifier_scheme': viaf_mapping['identifier_scheme'],
            'identifier_value': viaf_mapping['identifier_value'],
            'identifier_url': viaf_mapping['identifier_url']
        })
        inst['identifiers'] = identifiers

        # Add enrichment metadata to provenance (normalizing missing/null to {}).
        provenance = inst.get('provenance') or {}
        provenance['viaf_enrichment'] = {
            'method': 'Manual VIAF web lookup',
            'enrichment_date': enrichment_date,
            'verified': True,
            'notes': viaf_mapping.get('notes', 'Manual lookup via VIAF website')
        }
        inst['provenance'] = provenance

        print(f"✅ Added VIAF {viaf_mapping['identifier_value']} to {inst['name']}")
        added_count += 1

    # Write updated YAML back, keeping key order and unicode intact.
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f,
                  allow_unicode=True,
                  default_flow_style=False,
                  sort_keys=False,
                  width=100)

    return added_count, skipped_count
|
|
|
|
|
|
def main():
    """Entry point: load manual VIAF mappings from CSV and merge them into YAML."""
    csv_path = Path('data/manual_enrichment/egypt_viaf_mappings.csv')
    yaml_path = Path('data/instances/egypt_institutions_wikidata_viaf.yaml')

    # Guard clauses: both input files must exist before doing any work.
    if not csv_path.exists():
        print(f"❌ CSV file not found: {csv_path}")
        return
    if not yaml_path.exists():
        print(f"❌ YAML file not found: {yaml_path}")
        return

    print(f"📂 Loading VIAF mappings from {csv_path}...")
    mappings = load_viaf_mappings(csv_path)
    print(f"✅ Loaded {len(mappings)} VIAF mappings")

    if not mappings:
        # Nothing marked FOUND yet — tell the user what to do and stop.
        print("⚠️ No VIAF mappings with status=FOUND in CSV")
        print(" Please update the CSV with VIAF IDs and set lookup_status=FOUND")
        return

    print(f"\n📂 Merging into {yaml_path}...")
    added, skipped = merge_viaf_into_institutions(yaml_path, mappings)

    # Summary banner.
    banner = '=' * 60
    print(f"\n{banner}")
    print("✅ Merge complete!")
    print(f" Added: {added}")
    print(f" Skipped: {skipped}")
    print(f" Total mappings processed: {len(mappings)}")
    print(f"{banner}")
|
# Allow running directly: `python scripts/merge_viaf_mappings.py`.
if __name__ == '__main__':
    main()
|