- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
273 lines
8.5 KiB
Python
273 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
VIAF Enrichment for Egyptian Heritage Institutions

Searches VIAF (Virtual International Authority File) for heritage institutions
that do not yet have a VIAF identifier. VIAF is particularly strong for
libraries, archives, and museums.

VIAF API Documentation: https://www.oclc.org/developer/api/oclc-apis/viaf.en.html

Usage:
    python scripts/enrich_egypt_viaf.py
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Tuple
|
|
from urllib.parse import quote
|
|
|
|
|
|
def search_viaf(institution_name: str, institution_type: str) -> Optional[Tuple[str, str, float]]:
    """
    Search VIAF for an institution by name using the AutoSuggest API.

    Every usable candidate in the response is scored against
    ``institution_name`` with ``calculate_name_similarity`` and the
    best-scoring one is returned, provided its score exceeds 0.5. When two
    candidates score the same, the one listed first in the response wins.

    Args:
        institution_name: Name of the institution
        institution_type: Type (LIBRARY, ARCHIVE, MUSEUM, etc.). Not used to
            filter candidates; kept so existing callers keep working.

    Returns:
        Tuple of (viaf_id, viaf_label, confidence_score) or None if no match.
        None is also returned when the name is empty, the request fails, or
        the response cannot be interpreted.
    """
    # VIAF AutoSuggest API endpoint
    base_url = "https://viaf.org/viaf/AutoSuggest"
    min_confidence = 0.5

    # An empty query cannot identify an institution; skip the network call.
    if not institution_name or not institution_name.strip():
        return None

    params = {'query': institution_name}
    headers = {'Accept': 'application/json'}

    # Keep the try body limited to the calls that talk to the network or
    # decode the payload.
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f" Error querying VIAF: {e}")
        return None
    except ValueError as e:
        # Invalid JSON surfaces as a ValueError on requests versions that do
        # not wrap it in a RequestException subclass.
        print(f" Error parsing VIAF response: {e}")
        return None

    if not isinstance(data, dict):
        print(f" Error parsing VIAF response: unexpected payload type {type(data).__name__}")
        return None

    # Check if we got results ('result' may be missing, null or empty)
    results = data.get('result')
    if not results or not isinstance(results, list):
        return None

    best_match = None
    for result in results:
        if not isinstance(result, dict):
            continue

        # Extract VIAF ID and term (preferred name); both are required
        viaf_id = result.get('viafid')
        viaf_label = result.get('term', '')
        if not viaf_id or not viaf_label or not isinstance(viaf_label, str):
            continue

        # Calculate simple confidence score based on name similarity
        confidence = calculate_name_similarity(institution_name, viaf_label)
        print(f" Found: {viaf_label} (VIAF: {viaf_id}, confidence: {confidence:.3f})")

        # Strict '>' keeps the earlier candidate on ties.
        if best_match is None or confidence > best_match[2]:
            best_match = (viaf_id, viaf_label, confidence)
            if confidence >= 1.0:
                # An exact match cannot be beaten; stop scanning.
                break

    # Only report the best candidate if its confidence is reasonable
    if best_match is not None and best_match[2] > min_confidence:
        return best_match
    return None
|
|
|
|
|
|
def calculate_name_similarity(name1: str, name2: str) -> float:
    """
    Calculate simple similarity score between two names.

    Names are lower-cased and their whitespace is collapsed before comparing.
    The score is then chosen by the first rule that applies:

    * 0.0 if either name is empty (or None) after normalization
    * 1.0 if the normalized names are identical
    * 0.9 if one normalized name is a substring of the other
    * otherwise the Jaccard overlap (shared words / all words) of the two
      word sets after removing common stop words, or 0.0 if either word set
      is empty once stop words are removed

    Args:
        name1: First name
        name2: Second name

    Returns:
        Similarity score between 0.0 and 1.0
    """
    # Normalize case and whitespace so "  Egyptian   Museum " and
    # "egyptian museum" compare as the same name.
    normalized1 = " ".join((name1 or "").lower().split())
    normalized2 = " ".join((name2 or "").lower().split())

    # An empty name carries no evidence. Without this guard the substring
    # rule below would fire, because '' is contained in every string.
    if not normalized1 or not normalized2:
        return 0.0

    # Exact match
    if normalized1 == normalized2:
        return 1.0

    # Substring match
    if normalized1 in normalized2 or normalized2 in normalized1:
        return 0.9

    # Word overlap, ignoring common stop words
    stop_words = {'the', 'of', 'in', 'and', 'a', 'an', 'for', 'to', 'university', 'library', 'museum'}
    words1 = set(normalized1.split()) - stop_words
    words2 = set(normalized2.split()) - stop_words

    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union is never empty.
    return len(words1 & words2) / len(words1 | words2)
|
|
|
|
|
|
def enrich_with_viaf(institutions: List[Dict], request_delay: float = 1.0) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with VIAF identifiers.

    Records that already carry an identifier with scheme 'VIAF' are passed
    through untouched. For every other record ``search_viaf`` is called; on a
    match a VIAF identifier is appended and a ``provenance.viaf_enrichment``
    entry is written. Records are modified in place, so the returned list
    holds the same dict objects that were passed in.

    Args:
        institutions: List of institution records
        request_delay: Seconds to pause after each VIAF lookup (default 1.0).
            Values <= 0 disable the pause.

    Returns:
        Tuple of (enriched_institutions, statistics). ``statistics`` has the
        keys 'total', 'already_has_viaf', 'viaf_found', 'viaf_not_found' and
        'by_type' (the same four counters per institution type).
    """
    stats = {
        'total': len(institutions),
        'already_has_viaf': 0,
        'viaf_found': 0,
        'viaf_not_found': 0,
        'by_type': {},
    }
    enriched = []

    for inst in institutions:
        # 'or' fallbacks also cover keys that are present but null in the YAML.
        name = inst.get('name') or ''
        inst_type = inst.get('institution_type') or 'UNKNOWN'
        identifiers = inst.get('identifiers') or []

        # Track by type
        type_stats = stats['by_type'].setdefault(inst_type, {
            'total': 0,
            'already_has_viaf': 0,
            'viaf_found': 0,
            'viaf_not_found': 0,
        })
        type_stats['total'] += 1

        # Check if already has VIAF (ignore malformed, non-dict entries)
        has_viaf = any(
            isinstance(entry, dict) and entry.get('identifier_scheme') == 'VIAF'
            for entry in identifiers
        )
        if has_viaf:
            print(f"✓ {name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            type_stats['already_has_viaf'] += 1
            enriched.append(inst)
            continue

        # Search VIAF
        print(f"\n🔍 Searching VIAF for: {name} ({inst_type})")
        result = search_viaf(name, inst_type)

        if result:
            viaf_id, viaf_label, confidence = result

            # Add VIAF identifier
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}',
            })
            # Re-assign: the list is new when the key was missing or null.
            inst['identifiers'] = identifiers

            # Update provenance, creating the nested dicts when absent or null
            provenance = inst.get('provenance')
            if provenance is None:
                provenance = inst['provenance'] = {}
            enrichment = provenance.get('viaf_enrichment')
            if enrichment is None:
                enrichment = provenance['viaf_enrichment'] = {}

            enrichment.update({
                # The lookup goes through the AutoSuggest endpoint, so record
                # that (this previously said "VIAF SRU API search").
                'method': 'VIAF AutoSuggest API search',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'viaf_label': viaf_label,
                'confidence_score': confidence,
                'verified': confidence > 0.8,
            })

            print(f"✅ Added VIAF identifier: {viaf_id}")
            stats['viaf_found'] += 1
            type_stats['viaf_found'] += 1
        else:
            print("❌ No VIAF identifier found")
            stats['viaf_not_found'] += 1
            type_stats['viaf_not_found'] += 1

        enriched.append(inst)

        # Rate limiting - be respectful to VIAF API
        if request_delay > 0:
            time.sleep(request_delay)

    return enriched, stats
|
|
|
|
|
|
def main(
    input_file: str = 'data/instances/egypt_institutions_wikidata_corrected.yaml',
    output_file: str = 'data/instances/egypt_institutions_viaf_enriched.yaml',
):
    """
    Main execution function.

    Loads institution records from ``input_file`` (YAML), enriches them via
    ``enrich_with_viaf``, writes the result to ``output_file`` (YAML) and
    prints overall and per-type coverage statistics.

    Args:
        input_file: Path of the YAML file to read institutions from.
        output_file: Path of the YAML file to write enriched institutions to.
    """
    separator = "=" * 60

    print(separator)
    print("VIAF Enrichment for Egyptian Heritage Institutions")
    print(separator)

    # Load institutions
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat it as "no records".
        institutions = yaml.safe_load(f) or []

    print(f"Loaded {len(institutions)} institutions")

    # Enrich with VIAF
    enriched, stats = enrich_with_viaf(institutions)

    # Save enriched data
    print(f"\n{separator}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    print(f"\n{separator}")
    print("VIAF Enrichment Statistics")
    print(separator)
    print(f"Total institutions: {stats['total']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found: {stats['viaf_found']}")
    print(f"VIAF not found: {stats['viaf_not_found']}")

    overall_total = stats['total']
    overall_covered = stats['viaf_found'] + stats['already_has_viaf']
    # Guard against an empty input, matching the per-type guard below.
    overall_pct = 100 * overall_covered / overall_total if overall_total > 0 else 0
    print(f"\nNew VIAF coverage: {overall_covered}/{overall_total} "
          f"({overall_pct:.1f}%)")

    print(f"\n{separator}")
    print("Breakdown by Institution Type")
    print(separator)
    for inst_type, type_stats in sorted(stats['by_type'].items()):
        total = type_stats['total']
        found = type_stats['viaf_found']
        already = type_stats['already_has_viaf']
        coverage = 100 * (found + already) / total if total > 0 else 0

        print(f"\n{inst_type} ({total} institutions):")
        print(f" Already had VIAF: {already}")
        print(f" VIAF found: {found}")
        print(f" VIAF not found: {type_stats['viaf_not_found']}")
        print(f" Coverage: {found + already}/{total} ({coverage:.1f}%)")

    print(f"\n{separator}")
    print("✅ VIAF enrichment complete!")
    print(separator)
|
|
|
|
|
|
# Script entry point: run the enrichment only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|