#!/usr/bin/env python3
"""
Enrich Czech institutions with Wikidata Q-numbers.

Uses Wikidata SPARQL endpoint to find matching institutions by name,
location, and type. Adds Wikidata identifiers to czech_unified.yaml.

Process:
1. Load czech_unified.yaml (8,694 institutions)
2. Filter institutions WITHOUT Wikidata Q-numbers (estimate: ~95%)
3. Query Wikidata for Czech heritage institutions
4. Fuzzy match by name + location + type
5. Add Wikidata identifiers to records
6. Save to czech_unified_wikidata.yaml

Estimated time: 5-10 minutes (SPARQL queries + fuzzy matching)
"""

import yaml
import requests
from typing import List, Dict, Optional, Tuple
from rapidfuzz import fuzz
from datetime import datetime, timezone

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Wikidata institution type mapping (GLAM → Wikidata Q-numbers)
WIKIDATA_TYPES = {
    'MUSEUM': ['Q33506'],    # museum
    'LIBRARY': ['Q7075'],    # library
    'ARCHIVE': ['Q166118'],  # archive
    'GALLERY': ['Q1007870'],  # art gallery
}


def query_wikidata_institutions(country_code: str = 'Q213') -> List[Dict]:
    """
    Query Wikidata for Czech Republic heritage institutions.

    Args:
        country_code: Wikidata Q-number for country (Q213 = Czech Republic)

    Returns:
        List of dicts with: qid, label, type, location, coordinates
        (plus isil/viaf when present). Empty list on any request error.
    """
    # SPARQL query for Czech heritage institutions.
    # P31/P279* walks the subclass hierarchy so subtypes (e.g. "technical
    # museum") are included; P17 restricts to the given country.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?typeLabel ?locationLabel ?coords ?isil ?viaf WHERE {{
      # Institution types (museum, library, archive, gallery)
      VALUES ?type {{ wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 }}

      # Instance of heritage institution type
      ?item wdt:P31/wdt:P279* ?type .

      # Located in Czech Republic (or subdivisions)
      ?item wdt:P17 wd:{country_code} .

      # Optional: specific location (city/town)
      OPTIONAL {{ ?item wdt:P131 ?location }}

      # Optional: coordinates
      OPTIONAL {{ ?item wdt:P625 ?coords }}

      # Optional: ISIL code
      OPTIONAL {{ ?item wdt:P791 ?isil }}

      # Optional: VIAF ID
      OPTIONAL {{ ?item wdt:P214 ?viaf }}

      # Get labels in Czech and English
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "cs,en" }}
    }}
    LIMIT 10000
    """

    print("Querying Wikidata for Czech heritage institutions...")
    print(f"SPARQL endpoint: {WIKIDATA_SPARQL}")

    # A descriptive User-Agent is required by Wikimedia's API etiquette.
    headers = {
        'User-Agent': 'GLAM-Data-Extraction/0.2.0 (heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=60
        )
        response.raise_for_status()
        data = response.json()

        # Parse SPARQL JSON bindings into flat dicts. Optional fields
        # default to '' when the binding is absent.
        institutions = []
        for binding in data['results']['bindings']:
            qid = binding['item']['value'].split('/')[-1]  # strip entity URI prefix
            institutions.append({
                'qid': qid,
                'label': binding['itemLabel']['value'],
                'type': binding['typeLabel']['value'],
                'location': binding.get('locationLabel', {}).get('value', ''),
                'coordinates': binding.get('coords', {}).get('value', ''),
                'isil': binding.get('isil', {}).get('value', ''),
                'viaf': binding.get('viaf', {}).get('value', '')
            })

        print(f"Found {len(institutions)} institutions in Wikidata")
        return institutions

    except Exception as e:
        # Best-effort: report and return empty so the caller can exit cleanly.
        print(f"Error querying Wikidata: {e}")
        return []


def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    inst_type: str,
    wikidata_results: List[Dict],
    threshold: float = 85.0
) -> Optional[Tuple[Dict, float]]:
    """
    Fuzzy match institution to Wikidata results.

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, GALLERY).
            Currently informational only — type mismatches are not
            penalized since Wikidata typing can be inconsistent.
        wikidata_results: List of Wikidata query results
        threshold: Minimum similarity score (0-100)

    Returns:
        Tuple of (matched_wikidata_record, confidence_score) or None.
        Confidence is name similarity (0-100) plus a +10 city boost,
        so it can exceed 100.
    """
    best_match = None
    best_score = 0.0

    for wd in wikidata_results:
        # Name similarity
        name_score = fuzz.ratio(inst_name.lower(), wd['label'].lower())

        # Location boost (if cities match)
        location_boost = 0
        if inst_city and wd['location']:
            location_score = fuzz.partial_ratio(inst_city.lower(), wd['location'].lower())
            if location_score > 85:
                location_boost = 10

        # Type match check (optional, informational only)
        # We don't penalize type mismatches since Wikidata typing can be inconsistent

        # Combined score
        total_score = name_score + location_boost

        if total_score > best_score and total_score >= threshold:
            best_score = total_score
            best_match = wd

    if best_match:
        return (best_match, best_score)
    return None


def enrich_with_wikidata():
    """Main enrichment workflow: load, match, annotate, and save."""
    print("=" * 80)
    print("CZECH INSTITUTIONS - WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()

    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"Loaded {len(institutions)} institutions")

    # Filter institutions without Wikidata Q-numbers.
    # NOTE: needs_wikidata holds references into `institutions`, so
    # mutating its records enriches the full dataset in place.
    needs_wikidata = []
    has_wikidata = 0
    for inst in institutions:
        has_qid = False
        for identifier in inst.get('identifiers', []):
            if identifier.get('identifier_scheme') == 'Wikidata':
                has_qid = True
                has_wikidata += 1
                break
        if not has_qid:
            needs_wikidata.append(inst)

    print(f"Institutions with Wikidata: {has_wikidata}")
    print(f"Institutions needing Wikidata: {len(needs_wikidata)}")
    print()

    # Query Wikidata
    wikidata_results = query_wikidata_institutions()
    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        return

    print()
    print(f"Fuzzy matching {len(needs_wikidata)} institutions...")
    print("Match threshold: 85% similarity")
    print()

    # Fuzzy match
    matched = 0
    low_confidence = 0

    for idx, inst in enumerate(needs_wikidata, 1):
        if idx % 100 == 0:
            print(f"  Processed {idx}/{len(needs_wikidata)} institutions...")

        # Extract city from locations
        city = ''
        if inst.get('locations'):
            city = inst['locations'][0].get('city', '')

        # Fuzzy match
        match_result = fuzzy_match_institution(
            inst['name'],
            city,
            inst['institution_type'],
            wikidata_results,
            threshold=85.0
        )

        if match_result:
            matched_wd, confidence = match_result

            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []

            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': matched_wd['qid'],
                'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}"
            })

            # Add ISIL if available and not already present
            if matched_wd.get('isil'):
                has_isil = any(
                    i.get('identifier_scheme') == 'ISIL'
                    for i in inst['identifiers']
                )
                if not has_isil:
                    # ISIL codes don't have a universal resolver URL,
                    # so no identifier_url is recorded for them.
                    inst['identifiers'].append({
                        'identifier_scheme': 'ISIL',
                        'identifier_value': matched_wd['isil'],
                    })

            # Add VIAF if available and not already present
            if matched_wd.get('viaf'):
                has_viaf = any(
                    i.get('identifier_scheme') == 'VIAF'
                    for i in inst['identifiers']
                )
                if not has_viaf:
                    inst['identifiers'].append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': matched_wd['viaf'],
                        'identifier_url': f"https://viaf.org/viaf/{matched_wd['viaf']}"
                    })

            # Update provenance (records may lack a provenance dict;
            # setdefault avoids a KeyError on those)
            provenance = inst.setdefault('provenance', {})
            provenance.setdefault('enrichment_history', []).append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL query + fuzzy matching',
                'match_score': confidence,
                # Matches above 95 are considered verified automatically
                'verified': confidence > 95
            })

            matched += 1
            if confidence < 90:
                low_confidence += 1

    print(f"\n✅ Matched {matched} institutions ({matched/len(needs_wikidata)*100:.1f}%)")
    print(f"⚠️  Low confidence matches (<90%): {low_confidence}")
    print(f"❌ No match: {len(needs_wikidata) - matched}")
    print()

    # Save enriched dataset
    output_path = 'data/instances/czech_unified_wikidata.yaml'
    print(f"Saving enriched dataset to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100
        )

    print(f"✅ Saved {len(institutions)} institutions")
    print()
    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Total institutions: {len(institutions)}")
    print(f"With Wikidata Q-numbers: {has_wikidata + matched}")
    print(f"Newly enriched: {matched}")
    print(f"Enrichment rate: {(has_wikidata + matched)/len(institutions)*100:.1f}%")


if __name__ == '__main__':
    enrich_with_wikidata()