glam/scripts/enrich_egypt_viaf.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

273 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""
VIAF Enrichment for Egyptian Heritage Institutions
Searches VIAF (Virtual International Authority File) for heritage institutions
without VIAF identifiers. VIAF is particularly strong for libraries, archives,
and museums.
VIAF API Documentation: https://www.oclc.org/developer/api/oclc-apis/viaf.en.html
Usage:
python scripts/enrich_egypt_viaf.py
"""
import yaml
import requests
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote
def search_viaf(institution_name: str, institution_type: str) -> Optional[Tuple[str, str, float]]:
    """
    Search VIAF for an institution by name using the AutoSuggest API.

    Every usable suggestion is scored against ``institution_name`` with
    ``calculate_name_similarity``; the highest-scoring suggestion above 0.5
    is returned. On equal scores the suggestion VIAF listed first wins.

    Args:
        institution_name: Name of the institution
        institution_type: Type (LIBRARY, ARCHIVE, MUSEUM, etc.). Accepted for
            the caller's convenience; it is not used to filter results.

    Returns:
        Tuple of (viaf_id, viaf_label, confidence_score) or None if no match.
        None is also returned for a blank name and when the request or the
        response parsing fails (the error is printed, not raised).
    """
    # A blank query cannot identify an institution, so skip the network
    # round-trip entirely instead of scoring whatever VIAF suggests for it.
    if not institution_name or not institution_name.strip():
        return None

    # VIAF AutoSuggest API endpoint
    base_url = "https://viaf.org/viaf/AutoSuggest"
    params = {'query': institution_name}
    headers = {'Accept': 'application/json'}

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Treat a missing/empty 'result' key and a non-object payload alike
        # as "no suggestions".
        results = data.get('result') if isinstance(data, dict) else None
        if not results:
            return None

        best_match = None
        for result in results:
            # Both the VIAF ID and the preferred name are required
            viaf_id = result.get('viafid')
            viaf_label = result.get('term', '')
            if not viaf_id or not viaf_label:
                continue

            # Calculate simple confidence score based on name similarity
            confidence = calculate_name_similarity(institution_name, viaf_label)
            print(f" Found: {viaf_label} (VIAF: {viaf_id}, confidence: {confidence:.3f})")

            # Keep the best candidate seen so far; the strict '>' preserves
            # VIAF's own ordering when two candidates score the same.
            if confidence > 0.5 and (best_match is None or confidence > best_match[2]):
                best_match = (viaf_id, viaf_label, confidence)
                if confidence >= 1.0:
                    # An exact name match cannot be beaten
                    break

        return best_match
    except requests.exceptions.RequestException as e:
        print(f" Error querying VIAF: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: one malformed response must not abort the
        # whole enrichment run.
        print(f" Error parsing VIAF response: {e}")
        return None
def calculate_name_similarity(name1: str, name2: str) -> float:
    """
    Calculate a simple similarity score between two names.

    The comparison is case-insensitive and ignores leading/trailing
    whitespace. Scores are assigned in this order:

    * 1.0 - the names are identical
    * 0.9 - one name is contained in the other
    * otherwise the shared-word ratio (common words / all distinct words)
      after stop words have been removed

    Args:
        name1: First name
        name2: Second name

    Returns:
        Similarity score between 0.0 and 1.0. A blank name always scores 0.0.
    """
    name1_lower = (name1 or '').strip().lower()
    name2_lower = (name2 or '').strip().lower()

    # A blank name carries no information. Without this guard the substring
    # test below would score it 0.9 against *any* other name, because the
    # empty string is a substring of every string.
    if not name1_lower or not name2_lower:
        return 0.0

    # Exact match
    if name1_lower == name2_lower:
        return 1.0

    # Substring match
    if name1_lower in name2_lower or name2_lower in name1_lower:
        return 0.9

    # Word overlap, ignoring common stop words. Generic institution words are
    # listed too, so they do not count as shared words on their own.
    stop_words = {'the', 'of', 'in', 'and', 'a', 'an', 'for', 'to', 'university', 'library', 'museum'}
    words1 = set(name1_lower.split()) - stop_words
    words2 = set(name2_lower.split()) - stop_words
    if not words1 or not words2:
        return 0.0

    overlap = len(words1 & words2)
    total = len(words1 | words2)
    return overlap / total if total > 0 else 0.0
def enrich_with_viaf(institutions: List[Dict], request_delay: float = 1.0) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with VIAF identifiers.

    Records that already carry a VIAF identifier are passed through
    unchanged. For every other record VIAF is searched by name; on a match a
    VIAF identifier is appended and a ``provenance['viaf_enrichment']`` entry
    is written. Records are modified in place and the same dict objects are
    returned.

    Args:
        institutions: List of institution records
        request_delay: Seconds to pause after each VIAF search (rate
            limiting). Defaults to 1.0.

    Returns:
        Tuple of (enriched_institutions, statistics). ``statistics`` holds
        overall counters plus the same counters per institution type under
        ``'by_type'``.
    """
    stats = {
        'total': len(institutions),
        'already_has_viaf': 0,
        'viaf_found': 0,
        'viaf_not_found': 0,
        'by_type': {}
    }
    enriched = []

    for inst in institutions:
        # 'or' fallbacks also cover keys that are present but null in the
        # YAML (e.g. "identifiers:" with no value), which .get() defaults
        # alone would let through as None.
        name = str(inst.get('name') or '')
        inst_type = inst.get('institution_type') or 'UNKNOWN'
        identifiers = inst.get('identifiers') or []

        # Track by type
        type_stats = stats['by_type'].setdefault(inst_type, {
            'total': 0,
            'already_has_viaf': 0,
            'viaf_found': 0,
            'viaf_not_found': 0
        })
        type_stats['total'] += 1

        # Check if already has VIAF
        has_viaf = any(i.get('identifier_scheme') == 'VIAF' for i in identifiers)
        if has_viaf:
            print(f"{name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            type_stats['already_has_viaf'] += 1
            enriched.append(inst)
            continue

        # A record without a name cannot be matched; count it as not found
        # without spending a request (or the rate-limit pause) on it.
        if not name.strip():
            print("\nSkipping VIAF search: institution record has no name")
            stats['viaf_not_found'] += 1
            type_stats['viaf_not_found'] += 1
            enriched.append(inst)
            continue

        # Search VIAF
        print(f"\n🔍 Searching VIAF for: {name} ({inst_type})")
        result = search_viaf(name, inst_type)

        if result:
            viaf_id, viaf_label, confidence = result

            # Add VIAF identifier
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}'
            })
            inst['identifiers'] = identifiers

            # Update provenance (creating the nested dicts when they are
            # missing or null)
            if inst.get('provenance') is None:
                inst['provenance'] = {}
            if inst['provenance'].get('viaf_enrichment') is None:
                inst['provenance']['viaf_enrichment'] = {}
            inst['provenance']['viaf_enrichment'].update({
                # Names the AutoSuggest endpoint this file's search uses
                'method': 'VIAF AutoSuggest API search',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'viaf_label': viaf_label,
                'confidence_score': confidence,
                'verified': confidence > 0.8
            })

            print(f"✅ Added VIAF identifier: {viaf_id}")
            stats['viaf_found'] += 1
            type_stats['viaf_found'] += 1
        else:
            print("❌ No VIAF identifier found")
            stats['viaf_not_found'] += 1
            type_stats['viaf_not_found'] += 1

        enriched.append(inst)

        # Rate limiting - be respectful to VIAF API
        if request_delay > 0:
            time.sleep(request_delay)

    return enriched, stats
def main():
    """
    Main execution function.

    Loads the institution list from the input YAML file, enriches it with
    VIAF identifiers, writes the result to the output YAML file and prints
    overall and per-type coverage statistics.

    Raises:
        TypeError: If the input file's top-level YAML value is not a list.
    """
    input_file = 'data/instances/egypt_institutions_wikidata_corrected.yaml'
    output_file = 'data/instances/egypt_institutions_viaf_enriched.yaml'

    print("="*60)
    print("VIAF Enrichment for Egyptian Heritage Institutions")
    print("="*60)

    # Load institutions
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions"
        institutions = yaml.safe_load(f) or []
    if not isinstance(institutions, list):
        raise TypeError(
            f"Expected a list of institutions in {input_file}, "
            f"got {type(institutions).__name__}"
        )
    print(f"Loaded {len(institutions)} institutions")

    # Enrich with VIAF
    enriched, stats = enrich_with_viaf(institutions)

    # Save enriched data
    print(f"\n{'='*60}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    print(f"\n{'='*60}")
    print("VIAF Enrichment Statistics")
    print(f"{'='*60}")
    print(f"Total institutions: {stats['total']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found: {stats['viaf_found']}")
    print(f"VIAF not found: {stats['viaf_not_found']}")

    # Guard the percentage so an empty input reports 0.0% instead of raising
    # ZeroDivisionError (the per-type breakdown below uses the same guard).
    overall_total = stats['total']
    overall_covered = stats['viaf_found'] + stats['already_has_viaf']
    overall_coverage = 100 * overall_covered / overall_total if overall_total > 0 else 0
    print(f"\nNew VIAF coverage: {overall_covered}/{overall_total} "
          f"({overall_coverage:.1f}%)")

    print(f"\n{'='*60}")
    print("Breakdown by Institution Type")
    print(f"{'='*60}")
    for inst_type, type_stats in sorted(stats['by_type'].items()):
        total = type_stats['total']
        found = type_stats['viaf_found']
        already = type_stats['already_has_viaf']
        coverage = 100 * (found + already) / total if total > 0 else 0
        print(f"\n{inst_type} ({total} institutions):")
        print(f" Already had VIAF: {already}")
        print(f" VIAF found: {found}")
        print(f" VIAF not found: {type_stats['viaf_not_found']}")
        print(f" Coverage: {found + already}/{total} ({coverage:.1f}%)")

    print(f"\n{'='*60}")
    print("✅ VIAF enrichment complete!")
    print(f"{'='*60}")


# Run the enrichment only when executed as a script, not on import
if __name__ == '__main__':
    main()