#!/usr/bin/env python3
"""Extract VIAF Identifiers from Wikidata.

For institutions that already have Wikidata IDs, query Wikidata to check if
they have VIAF identifiers (Property P214) that we can add to our records.
This is a quick win since we already have 7 Wikidata IDs confirmed.

Usage:
    python scripts/extract_viaf_from_wikidata.py
"""

import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

import requests
import yaml

# Wikidata Query Service endpoint; Wikimedia policy requires a descriptive
# User-Agent identifying the client.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Extractor/1.0 (heritage data research)"
# Seconds to pause between SPARQL queries — stay well under WDQS rate limits.
REQUEST_DELAY = 0.5


def query_wikidata_for_viaf(wikidata_id: str) -> Optional[str]:
    """Query Wikidata SPARQL endpoint to get the VIAF ID for an entity.

    Args:
        wikidata_id: Wikidata Q-number (e.g., "Q501851").

    Returns:
        VIAF ID (numeric string) or None if not found or on any error.
    """
    query = f"""
    SELECT ?viaf WHERE {{
      wd:{wikidata_id} wdt:P214 ?viaf .
    }}
    """
    try:
        response = requests.get(
            SPARQL_ENDPOINT,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': USER_AGENT},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
        bindings = data.get('results', {}).get('bindings', [])
        if not bindings:
            return None
        # An entity may carry several P214 statements; use the first binding.
        return bindings[0].get('viaf', {}).get('value')
    except requests.exceptions.RequestException as e:
        print(f" Error querying Wikidata: {e}")
        return None
    except Exception as e:
        # Malformed JSON / unexpected response shape.
        print(f" Error parsing Wikidata response: {e}")
        return None


def _find_wikidata_id(identifiers: List[Dict]) -> Optional[str]:
    """Return the first Wikidata identifier value in the list, or None."""
    for identifier in identifiers:
        if identifier.get('identifier_scheme') == 'Wikidata':
            return identifier.get('identifier_value')
    return None


def _attach_viaf(inst: Dict, identifiers: List[Dict], viaf_id: str,
                 wikidata_id: str) -> None:
    """Append a VIAF identifier to *inst* and record enrichment provenance."""
    identifiers.append({
        'identifier_scheme': 'VIAF',
        'identifier_value': viaf_id,
        'identifier_url': f'https://viaf.org/viaf/{viaf_id}'
    })
    inst['identifiers'] = identifiers
    provenance = inst.setdefault('provenance', {})
    provenance.setdefault('viaf_enrichment', {}).update({
        'method': 'Extracted from Wikidata P214',
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'wikidata_source': wikidata_id,
        'verified': True  # High confidence since it's from Wikidata
    })


def extract_viaf_from_wikidata(institutions: List[Dict]) -> Tuple[List[Dict], Dict]:
    """Extract VIAF identifiers from Wikidata for institutions with Wikidata IDs.

    Args:
        institutions: List of institution records.

    Returns:
        Tuple of (enriched_institutions, statistics).
    """
    stats = {
        'total': len(institutions),
        'has_wikidata': 0,
        'already_has_viaf': 0,
        'viaf_found_in_wikidata': 0,
        'viaf_not_in_wikidata': 0,
        'no_wikidata': 0,
    }
    enriched = []
    for inst in institutions:
        name = inst.get('name', '')
        identifiers = inst.get('identifiers', [])

        has_viaf = any(i.get('identifier_scheme') == 'VIAF' for i in identifiers)
        wikidata_id = _find_wikidata_id(identifiers)

        if not wikidata_id:
            stats['no_wikidata'] += 1
            enriched.append(inst)
            continue

        stats['has_wikidata'] += 1
        if has_viaf:
            print(f"āœ“ {name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            enriched.append(inst)
            continue

        print(f"\nšŸ” Querying Wikidata {wikidata_id} for VIAF: {name}")
        viaf_id = query_wikidata_for_viaf(wikidata_id)

        if viaf_id:
            _attach_viaf(inst, identifiers, viaf_id, wikidata_id)
            print(f"āœ… Added VIAF identifier from Wikidata: {viaf_id}")
            stats['viaf_found_in_wikidata'] += 1
        else:
            print(f"āŒ No VIAF identifier in Wikidata {wikidata_id}")
            stats['viaf_not_in_wikidata'] += 1

        enriched.append(inst)
        # Rate limiting - be respectful to Wikidata.
        time.sleep(REQUEST_DELAY)

    return enriched, stats


def _print_stats(stats: Dict) -> None:
    """Print a summary of the extraction statistics to stdout."""
    print(f"\n{'='*70}")
    print("VIAF Extraction Statistics (from Wikidata)")
    print(f"{'='*70}")
    print(f"Total institutions: {stats['total']}")
    print(f"Institutions with Wikidata IDs: {stats['has_wikidata']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found in Wikidata: {stats['viaf_found_in_wikidata']}")
    print(f"No VIAF in Wikidata: {stats['viaf_not_in_wikidata']}")
    print(f"No Wikidata ID: {stats['no_wikidata']}")

    total_viaf = stats['viaf_found_in_wikidata'] + stats['already_has_viaf']
    # Guard against an empty input file (total == 0) to avoid ZeroDivisionError.
    pct = 100 * total_viaf / stats['total'] if stats['total'] else 0.0
    print(f"\nšŸ“Š Total VIAF coverage: {total_viaf}/{stats['total']} "
          f"({pct:.1f}%)")


def main(
    input_file: str = 'data/instances/egypt_institutions_wikidata_corrected.yaml',
    output_file: str = 'data/instances/egypt_institutions_wikidata_viaf.yaml',
) -> None:
    """Load institutions, enrich them with VIAF IDs from Wikidata, save results.

    Args:
        input_file: Path to the YAML file of institution records to read.
        output_file: Path where the enriched YAML is written.
    """
    print("="*70)
    print("Extract VIAF Identifiers from Wikidata for Egyptian Institutions")
    print("="*70)

    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalize to [].
        institutions = yaml.safe_load(f) or []
    print(f"Loaded {len(institutions)} institutions")

    enriched, stats = extract_viaf_from_wikidata(institutions)

    print(f"\n{'='*70}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    _print_stats(stats)

    print(f"\n{'='*70}")
    print("āœ… VIAF extraction from Wikidata complete!")
    print(f"{'='*70}")


if __name__ == '__main__':
    main()