#!/usr/bin/env python3
"""
VIAF Enrichment for Egyptian Heritage Institutions

Searches VIAF (Virtual International Authority File) for heritage institutions
without VIAF identifiers. VIAF is particularly strong for libraries, archives,
and museums.

VIAF API Documentation: https://www.oclc.org/developer/api/oclc-apis/viaf.en.html

Usage:
    python scripts/enrich_egypt_viaf.py
"""

import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote

import requests
import yaml

# VIAF AutoSuggest API endpoint queried by search_viaf().
VIAF_AUTOSUGGEST_URL = "https://viaf.org/viaf/AutoSuggest"

# A candidate must score strictly above this to be accepted as a match.
MIN_CONFIDENCE = 0.5

# Matches scoring strictly above this are recorded as 'verified' in provenance.
VERIFIED_CONFIDENCE = 0.8

# Pause (seconds) after each VIAF request, to be respectful to the API.
REQUEST_DELAY_SECONDS = 1

# Words ignored by the word-overlap part of calculate_name_similarity().
# NOTE(review): 'university', 'library' and 'museum' are treated as noise, so
# two names that differ only in those words (e.g. "Cairo Museum" vs
# "Cairo Library") score 1.0 on word overlap - confirm this is intended.
STOP_WORDS = frozenset({
    'the', 'of', 'in', 'and', 'a', 'an', 'for', 'to',
    'university', 'library', 'museum',
})


def search_viaf(institution_name: str, institution_type: str) -> Optional[Tuple[str, str, float]]:
    """
    Search VIAF for an institution by name using AutoSuggest API.

    Every usable candidate returned by VIAF is scored against
    ``institution_name`` and the best-scoring one above ``MIN_CONFIDENCE`` is
    returned. On equal scores the candidate listed first by VIAF wins.

    Args:
        institution_name: Name of the institution
        institution_type: Type (LIBRARY, ARCHIVE, MUSEUM, etc.). Currently
            unused; kept so existing callers keep working.

    Returns:
        Tuple of (viaf_id, viaf_label, confidence_score) or None if no match,
        if the name is blank, or if the request/response handling failed
        (failures are printed, not raised).
    """
    query = (institution_name or '').strip()
    if not query:
        # Nothing to search for; avoid a pointless request.
        return None

    try:
        response = requests.get(
            VIAF_AUTOSUGGEST_URL,
            params={'query': query},
            headers={'Accept': 'application/json'},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"  Error querying VIAF: {e}")
        return None
    except ValueError as e:
        # Body was not valid JSON (older requests versions raise plain ValueError).
        print(f"  Error parsing VIAF response: {e}")
        return None

    # The 'result' key may be missing or empty when nothing matched.
    results = data.get('result') if isinstance(data, dict) else None
    if not results:
        return None

    best_match: Optional[Tuple[str, str, float]] = None
    for result in results:
        if not isinstance(result, dict):
            continue

        viaf_id = result.get('viafid')
        viaf_label = result.get('term', '')
        if not viaf_id or not viaf_label:
            continue

        # Calculate simple confidence score based on name similarity
        confidence = calculate_name_similarity(institution_name, viaf_label)
        print(f"  Found: {viaf_label} (VIAF: {viaf_id}, confidence: {confidence:.3f})")

        # Strict '>' keeps the earliest candidate when scores tie.
        if confidence > MIN_CONFIDENCE and (best_match is None or confidence > best_match[2]):
            best_match = (viaf_id, viaf_label, confidence)
            if confidence >= 1.0:
                # An exact match cannot be beaten; stop scanning.
                break

    return best_match


def calculate_name_similarity(name1: str, name2: str) -> float:
    """
    Calculate simple similarity score between two names.

    Names are compared case-insensitively after stripping surrounding
    whitespace. Scoring, in order:

    * 0.0 if either name is empty
    * 1.0 for an exact match
    * 0.9 if one name is a (character-level) substring of the other
    * otherwise the Jaccard overlap of the names' words after removing
      ``STOP_WORDS`` (0.0 if either name has no words left)

    Args:
        name1: First name
        name2: Second name

    Returns:
        Similarity score between 0.0 and 1.0
    """
    first = (name1 or '').strip().casefold()
    second = (name2 or '').strip().casefold()

    # An empty string is a substring of everything, so without this guard a
    # missing name would score 0.9 against any candidate.
    if not first or not second:
        return 0.0

    # Exact match
    if first == second:
        return 1.0

    # Substring match
    if first in second or second in first:
        return 0.9

    # Word overlap, ignoring common stop words
    words1 = set(first.split()) - STOP_WORDS
    words2 = set(second.split()) - STOP_WORDS
    if not words1 or not words2:
        return 0.0

    return len(words1 & words2) / len(words1 | words2)


def enrich_with_viaf(institutions: List[Dict]) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with VIAF identifiers.

    Institutions that already carry a VIAF identifier are passed through
    untouched. For the others VIAF is searched by name; on a match a VIAF
    identifier is appended and a ``provenance['viaf_enrichment']`` entry is
    written. Records are modified in place: the returned list contains the
    same dict objects that were passed in, in the same order.

    Args:
        institutions: List of institution records

    Returns:
        Tuple of (enriched_institutions, statistics). ``statistics`` holds the
        counters 'total', 'already_has_viaf', 'viaf_found', 'viaf_not_found'
        and 'by_type', which maps each institution type to the same four
        counters.
    """
    stats = {
        'total': len(institutions),
        'already_has_viaf': 0,
        'viaf_found': 0,
        'viaf_not_found': 0,
        'by_type': {}
    }

    enriched = []

    for inst in institutions:
        # 'or' fallbacks also cover keys that are present but null in the YAML.
        name = inst.get('name') or ''
        inst_type = inst.get('institution_type') or 'UNKNOWN'
        identifiers = inst.get('identifiers') or []

        # Track by type
        type_stats = stats['by_type'].setdefault(inst_type, {
            'total': 0,
            'already_has_viaf': 0,
            'viaf_found': 0,
            'viaf_not_found': 0
        })
        type_stats['total'] += 1

        # Check if already has VIAF
        has_viaf = any(
            isinstance(i, dict) and i.get('identifier_scheme') == 'VIAF'
            for i in identifiers
        )
        if has_viaf:
            print(f"✓ {name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            type_stats['already_has_viaf'] += 1
            enriched.append(inst)
            continue

        # Search VIAF (only when there is a name to search for)
        result = None
        queried = bool(name.strip())
        if queried:
            print(f"\n🔍 Searching VIAF for: {name} ({inst_type})")
            result = search_viaf(name, inst_type)
        else:
            print(f"\n  Skipping VIAF search: institution has no name ({inst_type})")

        if result:
            viaf_id, viaf_label, confidence = result

            # Add VIAF identifier
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}'
            })
            inst['identifiers'] = identifiers

            # Update provenance, keeping any fields that are already there
            if inst.get('provenance') is None:
                inst['provenance'] = {}
            viaf_provenance = inst['provenance'].get('viaf_enrichment') or {}
            viaf_provenance.update({
                'method': 'VIAF AutoSuggest API search',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'viaf_label': viaf_label,
                'confidence_score': confidence,
                'verified': confidence > VERIFIED_CONFIDENCE
            })
            inst['provenance']['viaf_enrichment'] = viaf_provenance

            print(f"✅ Added VIAF identifier: {viaf_id}")
            stats['viaf_found'] += 1
            type_stats['viaf_found'] += 1
        else:
            print("❌ No VIAF identifier found")
            stats['viaf_not_found'] += 1
            type_stats['viaf_not_found'] += 1

        enriched.append(inst)

        # Rate limiting - be respectful to VIAF API
        if queried:
            time.sleep(REQUEST_DELAY_SECONDS)

    return enriched, stats


def _coverage_percent(covered: int, total: int) -> float:
    """Return covered/total as a percentage, or 0.0 when total is zero."""
    return 100 * covered / total if total > 0 else 0.0


def main():
    """
    Main execution function.

    Loads the institution list from the input YAML file, enriches it with
    VIAF identifiers, writes the result to the output YAML file and prints
    overall and per-type coverage statistics.

    Raises:
        ValueError: If the input file does not contain a YAML list.
    """
    input_file = 'data/instances/egypt_institutions_wikidata_corrected.yaml'
    output_file = 'data/instances/egypt_institutions_viaf_enriched.yaml'

    print("="*60)
    print("VIAF Enrichment for Egyptian Heritage Institutions")
    print("="*60)

    # Load institutions
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document
        institutions = yaml.safe_load(f) or []

    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a YAML list of institutions in {input_file}, "
            f"got {type(institutions).__name__}"
        )

    print(f"Loaded {len(institutions)} institutions")

    # Enrich with VIAF
    enriched, stats = enrich_with_viaf(institutions)

    # Save enriched data
    print(f"\n{'='*60}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    covered = stats['viaf_found'] + stats['already_has_viaf']

    print(f"\n{'='*60}")
    print("VIAF Enrichment Statistics")
    print(f"{'='*60}")
    print(f"Total institutions: {stats['total']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found: {stats['viaf_found']}")
    print(f"VIAF not found: {stats['viaf_not_found']}")
    print(f"\nNew VIAF coverage: {covered}/{stats['total']} "
          f"({_coverage_percent(covered, stats['total']):.1f}%)")

    print(f"\n{'='*60}")
    print("Breakdown by Institution Type")
    print(f"{'='*60}")
    for inst_type, type_stats in sorted(stats['by_type'].items()):
        total = type_stats['total']
        found = type_stats['viaf_found']
        already = type_stats['already_has_viaf']
        coverage = _coverage_percent(found + already, total)

        print(f"\n{inst_type} ({total} institutions):")
        print(f"  Already had VIAF: {already}")
        print(f"  VIAF found: {found}")
        print(f"  VIAF not found: {type_stats['viaf_not_found']}")
        print(f"  Coverage: {found + already}/{total} ({coverage:.1f}%)")

    print(f"\n{'='*60}")
    print("✅ VIAF enrichment complete!")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()