#!/usr/bin/env python3
"""
Wikidata enrichment for Tunisian heritage institutions.

Searches Wikidata by institution name and location for French/Arabic
named institutions in Tunisia.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

import requests
import yaml
from rapidfuzz import fuzz

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Tunisia-Wikidata-Enrichment/1.0"


def search_wikidata_by_name(name: str, city: Optional[str] = None,
                            timeout: int = 60,
                            stats: Optional[Dict[str, int]] = None) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for heritage institutions by name.

    Returns a dict with qid, viaf, founded_date, etc., or None if no match.
    ``city`` is accepted for logging and future filtering; it is not yet part
    of the query. If a ``stats`` dict is given, its 'timeouts' counter is
    incremented when the query times out.
    """
    # Escape backslashes first, then quotes, so the name is a valid SPARQL string literal
    name_escaped = name.replace('\\', '\\\\').replace('"', '\\"')

    # Simplified query without wdt:P279* (transitive subclass) to avoid timeouts
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception
    WHERE {{
      # Search by label in French, Arabic, or English
      {{ ?item rdfs:label "{name_escaped}"@fr . }}
      UNION
      {{ ?item rdfs:label "{name_escaped}"@ar . }}
      UNION
      {{ ?item rdfs:label "{name_escaped}"@en . }}

      # Must be in Tunisia
      ?item wdt:P17 wd:Q948 .

      # Must be a heritage institution type (direct instance only, no subclass search)
      ?item wdt:P31 ?type .
      VALUES ?type {{
        wd:Q33506     # Museum
        wd:Q7075      # Library
        wd:Q166118    # Archive
        wd:Q1030034   # Archaeological museum
        wd:Q473972    # Art museum
        wd:Q570116    # Public library
        wd:Q22687     # Synagogue
        wd:Q7840289   # Art gallery
        wd:Q2668072   # National library
        wd:Q7210356   # Organization
      }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 5
    """

    headers = {'User-Agent': USER_AGENT}
    params = {'query': query, 'format': 'json'}

    try:
        time.sleep(1.5)  # Rate limiting between queries, to be polite to WDQS
        response = requests.get(SPARQL_ENDPOINT, params=params,
                                headers=headers, timeout=timeout)
        response.raise_for_status()
        results = response.json()

        bindings = results.get("results", {}).get("bindings", [])
        if not bindings:
            return None

        # Take the first result; match quality is verified by the caller via fuzzy matching
        binding = bindings[0]
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "description": binding.get("itemDescription", {}).get("value", "")
        }
        if "viaf" in binding:
            result["viaf"] = binding["viaf"]["value"]
        if "isil" in binding:
            result["isil"] = binding["isil"]["value"]
        if "website" in binding:
            result["website"] = binding["website"]["value"]
        if "inception" in binding:
            result["founded_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding:
            # WKT literal, e.g. "Point(10.18 36.80)": longitude comes first
            coords_str = binding["coords"]["value"]
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
        return result

    except requests.exceptions.Timeout:
        print(f"  ā±ļø Query timeout (>{timeout}s)")
        if stats is not None:
            stats['timeouts'] += 1
        return None
    except requests.exceptions.RequestException as e:
        print(f"  āŒ Network error: {e}")
        return None
    except Exception as e:
        print(f"  āŒ Error: {e}")
        return None
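
# ---------------------------------------------------------------------------
# Optional fallback (a sketch, not wired into main): when the exact-label
# SPARQL query misses, the MediaWiki wbsearchentities API does tolerant
# prefix/alias matching. This helper and its name are hypothetical additions;
# a hit would still need a follow-up SPARQL check (wdt:P17 wd:Q948) before
# being trusted, since results are not filtered by country or type here.
# ---------------------------------------------------------------------------
def search_wikidata_fallback(name: str, language: str = "fr",
                             timeout: int = 30) -> Optional[str]:
    """Sketch: return the first QID suggested by wbsearchentities, or None."""
    params = {
        'action': 'wbsearchentities',
        'search': name,
        'language': language,
        'type': 'item',
        'limit': 5,
        'format': 'json',
    }
    try:
        resp = requests.get("https://www.wikidata.org/w/api.php",
                            params=params,
                            headers={'User-Agent': USER_AGENT},
                            timeout=timeout)
        resp.raise_for_status()
        hits = resp.json().get('search', [])
        return hits[0]['id'] if hits else None
    except requests.exceptions.RequestException:
        return None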

def add_wikidata_to_institution(institution: dict, wikidata_result: dict) -> None:
    """Add Wikidata-derived identifiers and provenance to an institution record."""
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Schemes already present, so enrichment never duplicates an identifier
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    # Add Wikidata identifier
    if 'Wikidata' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wikidata_result['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_result['qid']}"
        })

    # Add VIAF if present
    if wikidata_result.get('viaf') and 'VIAF' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_result['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_result['viaf']}"
        })

    # Add ISIL if present (ISIL codes don't have a universal resolver URL, so no identifier_url)
    if wikidata_result.get('isil') and 'ISIL' not in existing_schemes:
        institution['identifiers'].append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wikidata_result['isil']
        })

    # Update provenance (the qid already carries its leading "Q")
    if 'provenance' in institution:
        notes = institution['provenance'].get('notes', '')
        enrich_note = (f" Wikidata enriched on "
                       f"{datetime.now(timezone.utc).isoformat()} "
                       f"({wikidata_result['qid']}).")
        institution['provenance']['notes'] = notes + enrich_note


def save_checkpoint(data: dict, input_file: Path, stats: dict) -> None:
    """Save progress checkpoint back to the input file."""
    print(f"\nšŸ’¾ Saving checkpoint... (enriched: {stats['enriched']})")
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    # setdefault avoids a KeyError when the metadata has no 'enhancements' list yet
    enhancements = data['_metadata'].setdefault('enhancements', [])
    if 'Wikidata enrichment' not in enhancements:
        enhancements.append('Wikidata enrichment')
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
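
# ---------------------------------------------------------------------------
# Hardening sketch (hypothetical, not called by main): save_checkpoint
# overwrites the input file in place, so an interrupt mid-dump can truncate
# it. Writing to a sibling temp file and swapping it in with os.replace()
# is atomic on POSIX filesystems.
# ---------------------------------------------------------------------------
def save_checkpoint_atomic(data: dict, input_file: Path, stats: dict) -> None:
    """Sketch: atomic variant of save_checkpoint."""
    import os
    import tempfile

    print(f"\nšŸ’¾ Saving checkpoint... (enriched: {stats['enriched']})")
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    enhancements = data['_metadata'].setdefault('enhancements', [])
    if 'Wikidata enrichment' not in enhancements:
        enhancements.append('Wikidata enrichment')

    # Dump to a temp file in the same directory, then atomically replace
    fd, tmp_path = tempfile.mkstemp(dir=input_file.parent, suffix='.yaml.tmp')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False,
                      allow_unicode=True, sort_keys=False)
        os.replace(tmp_path, input_file)
    except BaseException:
        os.unlink(tmp_path)
        raise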

def main():
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Tunisia Wikidata Enrichment")
    print("=" * 60)
    print("Features:")
    print("  - Simplified SPARQL queries (no transitive subclass)")
    print("  - Multilingual search (French/Arabic/English)")
    print("  - Checkpoint saving every 10 institutions")
    print("  - Timeout handling (60s per query)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']
    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'timeouts': 0
    }

    # Process each institution
    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''

        # Skip institutions that already have a Wikidata identifier
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] āœ“ {name} (already has {qid})")
            continue

        # Search Wikidata (timeouts are counted inside the search function)
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1
        result = search_wikidata_by_name(name, city, timeout=60, stats=stats)

        if result:
            stats['found'] += 1
            print(f"  āœ… Found: {result['qid']} - {result.get('name', '')}")

            # Verify the name match (fuzzy) before accepting the hit
            match_score = fuzz.ratio(name.lower(), result['name'].lower())
            if match_score > 85:
                add_wikidata_to_institution(inst, result)
                stats['enriched'] += 1
                print(f"  āœ… Enriched (match score: {match_score})")
            else:
                stats['failed'] += 1
                print(f"  āš ļø Low match score ({match_score}), skipping")
        else:
            stats['failed'] += 1
            print("  āŒ Not found")

        # Checkpoint every N institutions
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)

    # Final save
    save_checkpoint(data, input_file, stats)

    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched: {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f"Timeouts: {stats['timeouts']}")
    covered = stats['already_enriched'] + stats['enriched']
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} "
          f"({100 * covered / stats['total']:.1f}%)")
    print("\nāœ… Wikidata enrichment complete!")


if __name__ == '__main__':
    main()
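
# ---------------------------------------------------------------------------
# Worked example (a sketch; never called by the script): the minimal record
# shapes that add_wikidata_to_institution expects. The sample names and
# QID/VIAF values below are placeholders, not real Wikidata data.
# ---------------------------------------------------------------------------
def _example_add_wikidata() -> None:
    """Sketch: enrich a minimal institution record and check the result."""
    inst = {
        'name': 'MusĆ©e exemple',
        'identifiers': [],
        'provenance': {'notes': 'Seeded from directory scrape.'}
    }
    result = {'qid': 'Q0000000', 'viaf': '000000000', 'name': 'MusĆ©e exemple'}
    add_wikidata_to_institution(inst, result)
    schemes = {i['identifier_scheme'] for i in inst['identifiers']}
    assert {'Wikidata', 'VIAF'} <= schemes
    assert 'Wikidata enriched on' in inst['provenance']['notes']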