#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Phase 1 Proof of Concept

Target: 14 Georgian institutions with 0% Wikidata coverage
Goal: Achieve 50%+ Wikidata coverage (7+ institutions matched)

Strategy:
1. Query Wikidata for museums/libraries/archives in Georgia (Q230)
2. Fuzzy match institution names with 0.85+ threshold
3. Verify type compatibility (museum, library, archive)
4. Enrich with Wikidata Q-numbers, VIAF, coordinates, websites
5. Geocode remaining institutions using Nominatim

CRITICAL: This follows the Chilean enrichment success pattern (78.9% coverage)
"""

import re
import sys
import time
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# Third-party dependencies (yaml, SPARQLWrapper, requests) are imported where
# they are used, as was already done for `requests`, so the pure matching
# helpers below can be imported and tested without them installed.
if TYPE_CHECKING:
    from SPARQLWrapper import SPARQLWrapper  # type: ignore


# Patterns used by normalize_name(); compiled once instead of on every call.
_LEADING_TYPE_RE = re.compile(
    r'^(museum|muzeum|museu|library|biblioteka|archive|arkivi)[\s\-]+'
)
_TRAILING_TYPE_RE = re.compile(
    r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian)$'
)
_ORG_FORM_RE = re.compile(r'\b(foundation|institute|state|national|central)\b')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Keyword families used by institution_type_compatible().
_MUSEUM_KEYWORDS = ('museum', 'muzeum', 'museu')
_ARCHIVE_KEYWORDS = ('archive', 'arkivi', 'archiv')
_LIBRARY_KEYWORDS = ('library', 'biblioteka', 'bibliothek')

# Seconds to wait for Nominatim before giving up on a single lookup.
_GEOCODE_TIMEOUT_SECONDS = 10
# Pause after every Nominatim request (the service allows 1 request/second).
_GEOCODE_PAUSE_SECONDS = 1.1


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching.

    Lowercases the name, strips one leading and one trailing type word
    (English and transliterated Georgian forms), drops generic
    organizational words, replaces punctuation with spaces and collapses
    whitespace.

    Note that a name made up only of generic words (e.g. "National Museum")
    normalizes to an empty string; see similarity_score() for how that case
    is handled.
    """
    name = name.lower()

    # Remove common prefixes/suffixes (English, Georgian transliterations)
    name = _LEADING_TYPE_RE.sub('', name)
    name = _TRAILING_TYPE_RE.sub('', name)

    # Remove organizational forms
    name = _ORG_FORM_RE.sub('', name)

    # Remove punctuation
    name = _PUNCTUATION_RE.sub(' ', name)

    # Normalize whitespace
    return ' '.join(name.split())


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1).

    The names are compared in normalized form. When normalization strips a
    name down to nothing (it consisted solely of generic words), comparing
    the normalized forms would report two unrelated names as identical
    (``"" == ""`` scores 1.0), so the comparison falls back to the
    lowercased, whitespace-collapsed raw names. Blank input scores 0.0.
    """
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)

    if not norm1 or not norm2:
        # Nothing distinctive survived normalization: compare the raw names.
        norm1 = ' '.join(name1.lower().split())
        norm2 = ' '.join(name2.lower().split())
        if not norm1 or not norm2:
            return 0.0

    return SequenceMatcher(None, norm1, norm2).ratio()


def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Check if institution types are compatible.
    Prevents mismatches like museum → archive.

    The institution is classified from keywords in its name and type; the
    Wikidata side is classified from keywords in `wd_name` and `wd_desc`.
    For every class (museum, archive, library) the institution belongs to,
    the Wikidata text must contain a keyword of the same class. An
    institution with no recognizable class is compatible with anything.
    """
    inst_lower = (inst_name + ' ' + inst_type).lower()
    wd_lower = (wd_name + ' ' + wd_desc).lower()

    checks = (
        (_MUSEUM_KEYWORDS, 'MUSEUM'),
        (_ARCHIVE_KEYWORDS, 'ARCHIVE'),
        (_LIBRARY_KEYWORDS, 'LIBRARY'),
    )
    for keywords, type_code in checks:
        inst_has_type = inst_type == type_code or any(kw in inst_lower for kw in keywords)
        if inst_has_type and not any(kw in wd_lower for kw in keywords):
            return False

    return True


def _parse_wikidata_binding(binding: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one SPARQL result row into an institution record.

    Returns None when the row carries no usable Q-identifier.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    result: Dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "type": binding.get("typeLabel", {}).get("value", ""),
        "identifiers": {}
    }

    if "isil" in binding:
        result["identifiers"]["ISIL"] = binding["isil"]["value"]

    if "viaf" in binding:
        result["identifiers"]["VIAF"] = binding["viaf"]["value"]

    if "website" in binding:
        result["identifiers"]["Website"] = binding["website"]["value"]

    if "inception" in binding:
        # Keep only the date part of the timestamp.
        result["founding_date"] = binding["inception"]["value"].split("T")[0]

    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        if coords_str.startswith("Point("):
            # The literal is "Point(<lon> <lat>)". A malformed value only
            # costs this row its coordinates, not the whole result set.
            try:
                lon, lat = coords_str[6:-1].split()
                latitude, longitude = float(lat), float(lon)
            except ValueError:
                pass
            else:
                result["latitude"] = latitude
                result["longitude"] = longitude

    return result


def query_georgian_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions in Georgia (Q230).

    Args:
        sparql: endpoint wrapper; only `setQuery()` and `query().convert()`
            are used, and the converted result is expected to be the JSON
            results dict.

    Returns: dict keyed by QID with institution data. When several rows
    share a QID the last row wins. On any query error a message is printed
    and an empty dict is returned.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Institution is in Georgia
      ?item wdt:P17 wd:Q230 .

      # Institution is a GLAM type
      VALUES ?type {
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
        wd:Q768717    # ethnographic museum
        wd:Q7406919   # state museum
      }
      ?item wdt:P31 ?type .

      # Optional enrichment data
      OPTIONAL { ?item wdt:P791 ?isil . }      # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf . }      # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coords . }    # Coordinates
      OPTIONAL { ?item wdt:P856 ?website . }   # Official website
      OPTIONAL { ?item wdt:P571 ?inception . } # Founding date

      # Get labels (English, Georgian, Russian)
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """

    sparql.setQuery(query)

    # Deliberately broad: this is the network boundary and the caller treats
    # "no results" as a recoverable outcome.
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

        results: Dict[str, Dict[str, Any]] = {}
        for binding in bindings:
            record = _parse_wikidata_binding(binding)
            if record is not None:
                results[record["qid"]] = record

        return results

    except Exception as e:
        print(f"\nāŒ Error querying Wikidata: {e}")
        return {}


def geocode_institution(name: str, country: str = "Georgia") -> Optional[Dict[str, Any]]:
    """
    Geocode institution using Nominatim.

    Respects 1 req/sec rate limit: the pause is applied after every request,
    whether it succeeded, returned nothing, or failed.

    Args:
        name: institution name to search for.
        country: appended to the search text. The search itself is always
            restricted to country code "ge".

    Returns: dict with `latitude`, `longitude` and `display_name` of the
    first hit, or None when nothing was found or the lookup failed.
    """
    try:
        import requests
    except ImportError as e:
        print(f" āš ļø Geocoding error: {e}")
        return None

    # Try with institution name + country
    search_query = f"{name}, {country}"

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={
                "q": search_query,
                "format": "json",
                "limit": 1,
                "countrycodes": "ge"  # Georgia ISO code
            },
            headers={"User-Agent": "GLAM-Dataset-Enrichment/1.0"},
            # Without a timeout a stalled connection would hang the batch.
            timeout=_GEOCODE_TIMEOUT_SECONDS,
        )

        if response.status_code == 200:
            results = response.json()
            if results:
                location = results[0]
                return {
                    "latitude": float(location["lat"]),
                    "longitude": float(location["lon"]),
                    "display_name": location.get("display_name", "")
                }
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(f" āš ļø Geocoding error: {e}")
    finally:
        # Rate limit: 1 request per second. In `finally` so that it also
        # runs on the successful-return path.
        time.sleep(_GEOCODE_PAUSE_SECONDS)

    return None


def _primary_country(inst: Any) -> Optional[str]:
    """Return the country of a record's first location, or None if absent."""
    if not isinstance(inst, dict):
        return None
    locations = inst.get('locations') or [{}]
    first = locations[0]
    return first.get('country') if isinstance(first, dict) else None


def load_georgia_institutions(yaml_path: Path) -> List[Dict[str, Any]]:
    """Load Georgian institutions from unified dataset.

    Returns the records whose first location has country 'GE'. Records with
    a missing, null or empty `locations` list are skipped, and an empty file
    yields an empty list.
    """
    import yaml

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Filter for Georgian institutions
    return [inst for inst in (data or []) if _primary_country(inst) == 'GE']


def enrich_institution(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    fuzzy_threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Try to enrich institution with Wikidata data.

    An institution that already carries a Wikidata identifier present in
    `wikidata_results` is matched directly (score 1.0). Otherwise the best
    type-compatible fuzzy name match is used if it reaches
    `fuzzy_threshold`; on equal scores the earliest candidate wins.

    Returns a copy of the matched Wikidata record with `match_score` added
    if a match is found, None otherwise. `wikidata_results` is not modified.
    """
    inst_name = inst.get('name', '')
    inst_type = inst.get('institution_type', '')

    # Try exact matches first (by existing identifiers)
    for identifier in inst.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'Wikidata':
            qid = identifier.get('identifier_value', '')
            if qid in wikidata_results:
                exact_match = dict(wikidata_results[qid])
                exact_match["match_score"] = 1.0
                return exact_match

    # Fuzzy matching by name
    best_match = None
    best_score = 0.0

    for wd_data in wikidata_results.values():
        wd_name = wd_data.get('name', '')
        # The instance-of label (e.g. "museum") is often the only type hint
        # when the item has no description, so it is checked as well.
        wd_text = ' '.join(
            part for part in (wd_data.get('description', ''), wd_data.get('type', '')) if part
        )

        # Check type compatibility first
        if not institution_type_compatible(inst_type, inst_name, wd_name, wd_text):
            continue

        # Calculate similarity score
        score = similarity_score(inst_name, wd_name)
        if score > best_score:
            best_score = score
            best_match = wd_data

    # Return match if above threshold
    if best_score >= fuzzy_threshold and best_match:
        enriched_match = dict(best_match)
        enriched_match["match_score"] = best_score
        return enriched_match

    return None


def _add_identifier(inst: Dict[str, Any], scheme: str, value: str, url: Optional[str] = None) -> None:
    """Append an identifier to `inst` unless the same scheme/value is already there."""
    if inst.get('identifiers') is None:
        inst['identifiers'] = []

    for existing in inst['identifiers']:
        if (
            isinstance(existing, dict)
            and existing.get('identifier_scheme') == scheme
            and existing.get('identifier_value') == value
        ):
            return

    entry = {'identifier_scheme': scheme, 'identifier_value': value}
    if url is not None:
        entry['identifier_url'] = url
    inst['identifiers'].append(entry)


def _set_coordinates(inst: Dict[str, Any], latitude: float, longitude: float) -> None:
    """Store coordinates on the first location, creating it if necessary."""
    if not inst.get('locations'):
        inst['locations'] = [{'country': 'GE'}]
    inst['locations'][0]['latitude'] = latitude
    inst['locations'][0]['longitude'] = longitude


def _apply_enrichment(inst: Dict[str, Any], enrichment: Dict[str, Any]) -> None:
    """Copy matched Wikidata data onto `inst` and print what was added."""
    match_score = enrichment.get('match_score', 0.0)
    qid = enrichment.get('qid', '')

    print(f" āœ… Matched: {enrichment.get('name')} ({qid}) - Score: {match_score:.2f}")

    # Add Wikidata identifier
    _add_identifier(inst, 'Wikidata', qid, f'https://www.wikidata.org/wiki/{qid}')

    # Add other identifiers
    for scheme, value in enrichment.get('identifiers', {}).items():
        if scheme == 'Website':
            _add_identifier(inst, 'Website', value, value)
        else:
            _add_identifier(inst, scheme, value)

    # Add coordinates if available
    if 'latitude' in enrichment and 'longitude' in enrichment:
        _set_coordinates(inst, enrichment['latitude'], enrichment['longitude'])
        print(f" šŸ“ Coordinates: {enrichment['latitude']:.4f}, {enrichment['longitude']:.4f}")

    # Add founding date if available
    if 'founding_date' in enrichment:
        inst['founding_date'] = enrichment['founding_date']
        print(f" šŸ“… Founded: {enrichment['founding_date']}")

    # Add description from Wikidata (never overwrites an existing one)
    if enrichment.get('description'):
        if not inst.get('description'):
            inst['description'] = enrichment['description']
        print(f" šŸ“ Description: {enrichment['description'][:60]}...")

    # Update provenance
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    if provenance.get('enrichment_history') is None:
        provenance['enrichment_history'] = []
    provenance['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
        'match_score': match_score,
        'verified': False
    })


def main():
    """Run the enrichment batch: load, query Wikidata, match, geocode, save, report."""
    import yaml
    from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

    print("=" * 80)
    print("šŸ‡¬šŸ‡Ŗ Georgia Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print()
    print("Target: 14 institutions with 0% Wikidata coverage")
    print("Goal: Achieve 50%+ coverage (7+ institutions)")
    print()

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = data_dir / "all" / "globalglam-20251111.yaml"
    output_file = data_dir / "georgia" / "georgian_institutions_enriched_batch1.yaml"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Step 1: Load Georgian institutions
    print("šŸ“‚ Loading Georgian institutions...")
    institutions = load_georgia_institutions(input_file)
    print(f" āœ… Loaded {len(institutions)} Georgian institutions")
    print()

    # Step 2: Query Wikidata
    print("🌐 Querying Wikidata for Georgian GLAM institutions...")
    # Identify the client to the endpoint, as the geocoder already does.
    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="GLAM-Dataset-Enrichment/1.0",
    )
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" āœ… Found {len(wikidata_results)} institutions in Wikidata")
    print()

    # Step 3: Fuzzy matching and enrichment
    print("šŸ” Matching institutions with Wikidata (threshold: 0.85)...")
    print()

    enriched_count = 0
    geocoded_count = 0

    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')

        print(f"{i:2d}. {inst_name} ({inst_type})")

        # Try Wikidata enrichment
        enrichment = enrich_institution(inst, wikidata_results)

        if enrichment:
            _apply_enrichment(inst, enrichment)
            enriched_count += 1
        else:
            print(" āš ļø No Wikidata match found")

            # Try geocoding as fallback
            geocode_result = geocode_institution(inst_name)
            if geocode_result:
                _set_coordinates(inst, geocode_result['latitude'], geocode_result['longitude'])
                print(f" šŸ“ Geocoded: {geocode_result['latitude']:.4f}, {geocode_result['longitude']:.4f}")
                geocoded_count += 1

        print()

    # Step 4: Save enriched data
    print("šŸ’¾ Saving enriched dataset...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" āœ… Saved to: {output_file}")
    print()

    # Step 5: Report results
    total = len(institutions)
    # Guard against an empty input set (nothing loaded / nothing Georgian).
    coverage = enriched_count / total * 100 if total else 0.0

    print("=" * 80)
    print("šŸ“Š ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {total}")
    print(f"Wikidata matches: {enriched_count} ({coverage:.1f}%)")
    print(f"Geocoded (fallback): {geocoded_count}")
    print(f"Still need enrichment: {total - enriched_count}")
    print()

    if enriched_count >= 7:
        print("āœ… SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"āš ļø Below target: {7 - enriched_count} more matches needed for 50% coverage")

    print()
    print("Next steps:")
    print("1. Review matches manually (verify institution identities)")
    print("2. Update unified dataset with enriched Georgian records")
    print("3. Proceed with other critical countries (GB, BE, US, LU)")
    print()


if __name__ == "__main__":
    main()