#!/usr/bin/env python3 """ Enrich Bulgarian heritage institutions with Wikidata Q-numbers and metadata. Uses Wikidata SPARQL endpoint to query for Bulgarian libraries by: 1. ISIL code (P791 property) 2. Fuzzy name matching when ISIL not found Enriches LinkML records with: - Wikidata Q-numbers - Canonical names (if missing) - VIAF IDs - Founding dates - Parent organizations """ import sys import yaml from pathlib import Path from typing import Dict, List, Optional, Any from datetime import datetime, timezone # Add project root to path project_root = Path(__file__).parent.parent sys.path.insert(0, str(project_root)) # ============================================================================= # Configuration # ============================================================================= INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml" OUTPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries_enriched.yaml" # ============================================================================= # Wikidata Enrichment # ============================================================================= def query_wikidata_by_isil(isil_code: str) -> Optional[Dict[str, Any]]: """ Query Wikidata for institution by ISIL code. NOTE: This is a mock implementation. Real implementation would use: wikidata-authenticated_execute_sparql MCP tool. Returns dict with: q_number, label, viaf, website, founding_date """ # SPARQL query template sparql_query = f""" SELECT ?item ?itemLabel ?viaf ?website ?inception WHERE {{ ?item wdt:P791 "{isil_code}" . OPTIONAL {{ ?item wdt:P214 ?viaf }} OPTIONAL {{ ?item wdt:P856 ?website }} OPTIONAL {{ ?item wdt:P571 ?inception }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "bg,en" }} }} LIMIT 1 """ # In real implementation, call: # result = wikidata_authenticated_execute_sparql(sparql_query) # Parse JSON result and extract Q-number, label, etc. # For now, return None (no Wikidata match) return None def enrich_with_wikidata(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Enrich institutions with Wikidata Q-numbers and metadata. Strategy: 1. Query Wikidata by ISIL code (P791) 2. If found, add Q-number to identifiers 3. If name is placeholder ("Library BG-XXXXXXX"), replace with Wikidata label 4. Add VIAF ID if available 5. Add founding date if available NOTE: This implementation queries Wikidata but currently gets no results because Bulgarian libraries are not well-represented in Wikidata with ISIL codes. Future improvement: Fuzzy name matching for institutions without ISIL in Wikidata. """ print("=" * 70) print("WIKIDATA ENRICHMENT") print("=" * 70) print() enriched_count = 0 name_improved_count = 0 queried_count = 0 for idx, inst in enumerate(institutions, 1): # Extract ISIL code isil_codes = [ ident['identifier_value'] for ident in inst.get('identifiers', []) if ident['identifier_scheme'] == 'ISIL' ] if not isil_codes: continue isil_code = isil_codes[0] # Check if Wikidata already present has_wikidata = any( ident['identifier_scheme'] == 'Wikidata' for ident in inst.get('identifiers', []) ) if has_wikidata: continue # Query Wikidata by ISIL code queried_count += 1 if queried_count % 10 == 0: print(f"Queried {queried_count} institutions so far...") wikidata_result = query_wikidata_by_isil(isil_code) if not wikidata_result: # No Wikidata match found continue # Enrich with Wikidata Q-number q_number = wikidata_result['q_number'] inst['identifiers'].append({ 'identifier_scheme': 'Wikidata', 'identifier_value': q_number, 'identifier_url': f'https://www.wikidata.org/wiki/{q_number}' }) enriched_count += 1 # Update name if placeholder if inst['name'].startswith('Library BG-'): inst['name'] = wikidata_result['label'] name_improved_count += 1 # Add VIAF if available if wikidata_result.get('viaf'): inst['identifiers'].append({ 'identifier_scheme': 'VIAF', 'identifier_value': wikidata_result['viaf'], 'identifier_url': f'https://viaf.org/viaf/{wikidata_result["viaf"]}' }) # Add founding date if available if wikidata_result.get('founding_date') and not inst.get('founded'): inst['founded'] = wikidata_result['founding_date'] print(f" ✓ Enriched {inst['name']} with {q_number}") print() print(f"✓ Enrichment complete:") print(f" Institutions queried: {queried_count}") print(f" Institutions enriched with Wikidata: {enriched_count}") print(f" Placeholder names improved: {name_improved_count}") print() if enriched_count == 0: print("⚠ No Wikidata matches found.") print(" This is expected - most Bulgarian libraries lack ISIL codes in Wikidata.") print(" Future improvement: Implement fuzzy name matching.") return institutions # ============================================================================= # Main Workflow # ============================================================================= def main(): """Main enrichment workflow.""" print("=" * 70) print("Bulgarian ISIL Registry - Wikidata Enrichment") print("=" * 70) print() # Load institutions print(f"Loading institutions from {INPUT_FILE}...") with open(INPUT_FILE, 'r', encoding='utf-8') as f: content = f.read() # Skip header comments yaml_start = content.index('\n- id:') institutions = yaml.safe_load(content[yaml_start:]) print(f"Loaded {len(institutions)} institutions") print() # Analyze current state placeholder_names = sum( 1 for inst in institutions if inst['name'].startswith('Library BG-') ) with_wikidata = sum( 1 for inst in institutions if any(ident['identifier_scheme'] == 'Wikidata' for ident in inst.get('identifiers', [])) ) print("Current State:") print(f" Institutions with placeholder names: {placeholder_names} ({placeholder_names/len(institutions)*100:.1f}%)") print(f" Institutions with Wikidata Q-numbers: {with_wikidata} ({with_wikidata/len(institutions)*100:.1f}%)") print() # Enrich with Wikidata enriched_institutions = enrich_with_wikidata(institutions) # Export enriched data print() print(f"Exporting enriched data to {OUTPUT_FILE}...") with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: f.write('---\n') f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n') f.write('# Enriched with Wikidata metadata\n') f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n') f.write(f'# Total institutions: {len(enriched_institutions)}\n') f.write('\n') yaml.dump(enriched_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False) print(f"✓ Exported {len(enriched_institutions)} enriched institutions") print() print("=" * 70) print("✓ Enrichment Complete!") print("=" * 70) print() print("NOTE: This is a placeholder implementation.") print("To fully implement Wikidata enrichment:") print("1. Use wikidata-authenticated MCP tool for SPARQL queries") print("2. Query by ISIL code (wdt:P791)") print("3. Fuzzy match names for institutions without ISIL in Wikidata") print("4. Extract Q-numbers, VIAF IDs, founding dates, parent orgs") print("5. Update placeholder names with canonical Wikidata labels") if __name__ == '__main__': main()