#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Batch 2 (Alternative Names)

Strategy: Use alternative names (including Georgian names) for fuzzy matching
Target: 10 remaining institutions without Wikidata matches
Goal: Achieve 50%+ total coverage (7+ institutions)

Improvements over Batch 1:
1. Include alternative names in fuzzy matching
2. Try partial name matching (e.g., "Stalin Museum" → "Joseph Stalin Museum")
3. Lower fuzzy threshold to 0.80 for specific matches
4. Manual review of close matches (0.75-0.85)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
import yaml
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, strips common GLAM prefixes/suffixes (museum, library,
    archive, state, national, ... and trailing "georgia(n)"), replaces
    punctuation with spaces and collapses runs of whitespace.
    """
    name = name.lower()
    # Remove common prefixes/suffixes
    name = re.sub(r'^(museum|muzeum|library|biblioteka|archive|arkivi|state|national|central)[\s\-]+', '', name)
    name = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian|of georgia)$', '', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) on normalized forms."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def query_georgian_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for GLAM institutions in Georgia (wd:Q230).

    Returns:
        Mapping of QID -> record with name, description, type,
        alternative_names, identifiers (ISIL/VIAF/Website), optional
        latitude/longitude and founding_date. Empty dict on query failure.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?altLabel ?typeLabel
                    ?isil ?viaf ?coords ?website ?inception
    WHERE {
      ?item wdt:P17 wd:Q230 .
      VALUES ?type { wd:Q7075 wd:Q166118 wd:Q33506 wd:Q1007870 wd:Q28564
                     wd:Q11396180 wd:Q207694 wd:Q2772772 wd:Q768717 wd:Q7406919 }
      ?item wdt:P31 ?type .
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,ka,ru" . }
    }
    LIMIT 500
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        results: Dict[str, Dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue

            if qid not in results:
                results[qid] = {
                    "qid": qid,
                    "name": binding.get("itemLabel", {}).get("value", ""),
                    "description": binding.get("itemDescription", {}).get("value", ""),
                    "type": binding.get("typeLabel", {}).get("value", ""),
                    "alternative_names": [],
                    "identifiers": {}
                }
            entry = results[qid]

            # FIX: the result set is a cross-product over ?type and the
            # OPTIONAL clauses, so the same altLabel repeats across rows;
            # dedupe while preserving first-seen order.
            if "altLabel" in binding:
                alt = binding["altLabel"]["value"]
                if alt not in entry["alternative_names"]:
                    entry["alternative_names"].append(alt)

            if "isil" in binding:
                entry["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                entry["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                entry["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal
                entry["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT literal, e.g. "Point(44.79 41.71)" — longitude first
                if coords_str.startswith("Point("):
                    try:
                        lon, lat = coords_str[6:-1].split()
                        entry["latitude"] = float(lat)
                        entry["longitude"] = float(lon)
                    except ValueError:
                        # FIX: a malformed Point literal should skip this
                        # row, not abort the whole result parse
                        pass
        return results
    except Exception as e:
        print(f"\nāŒ Error querying Wikidata: {e}")
        return {}


def find_best_match(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Optional[tuple[Dict[str, Any], float, str]]:
    """
    Find best Wikidata match using primary and alternative names.

    Every (institution name x Wikidata name) pair is scored; the single
    highest-scoring pair wins and is returned only if it reaches *threshold*.

    Returns:
        (wikidata_entry, score, matched_name) or None
    """
    inst_names = [inst.get('name', '')]
    if 'alternative_names' in inst:
        inst_names.extend(inst['alternative_names'])

    best_match: Optional[Dict[str, Any]] = None
    best_score = 0.0
    matched_name = ""

    for inst_name in inst_names:
        for wd_data in wikidata_results.values():
            # Primary label plus any English alt labels
            wd_names = [wd_data.get('name', '')]
            if 'alternative_names' in wd_data:
                wd_names.extend(wd_data['alternative_names'])
            for wd_name in wd_names:
                score = similarity_score(inst_name, wd_name)
                if score > best_score:
                    best_score = score
                    best_match = wd_data
                    matched_name = f"{inst_name} → {wd_name}"

    # FIX: decide only after the full scan so an early above-threshold hit
    # cannot shadow a later, better candidate.
    if best_score >= threshold and best_match:
        return (best_match, best_score, matched_name)
    return None


def _has_wikidata_identifier(inst: Dict[str, Any]) -> bool:
    """Return True if *inst* already carries a Wikidata identifier entry."""
    for identifier in inst.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False


def _apply_wikidata_match(inst: Dict[str, Any], wd_data: Dict[str, Any], score: float) -> None:
    """Merge a Wikidata match into *inst* in place.

    Adds the Wikidata identifier plus any ISIL/VIAF/Website identifiers,
    coordinates, founding date, and appends an enrichment-history entry to
    the record's provenance.
    """
    qid = wd_data.get('qid', '')

    # Add Wikidata identifier
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
    })

    # Add other identifiers (Website additionally gets identifier_url)
    for scheme, value in wd_data.get('identifiers', {}).items():
        if scheme == 'Website':
            inst['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': value,
                'identifier_url': value
            })
        else:
            inst['identifiers'].append({
                'identifier_scheme': scheme,
                'identifier_value': value
            })

    # Add coordinates to the first location (create a GE stub if missing)
    if 'latitude' in wd_data and 'longitude' in wd_data:
        if 'locations' not in inst or not inst['locations']:
            inst['locations'] = [{'country': 'GE'}]
        inst['locations'][0]['latitude'] = wd_data['latitude']
        inst['locations'][0]['longitude'] = wd_data['longitude']
        print(f" šŸ“ Coordinates: {wd_data['latitude']:.4f}, {wd_data['longitude']:.4f}")

    # Add founding date
    if 'founding_date' in wd_data:
        inst['founding_date'] = wd_data['founding_date']
        print(f" šŸ“… Founded: {wd_data['founding_date']}")

    # Update provenance; matches are recorded unverified for manual review
    if 'provenance' not in inst:
        inst['provenance'] = {}
    inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
    inst['provenance']['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL + alternative name fuzzy matching',
        'match_score': score,
        'verified': False
    })


def main():
    """Run the Batch 2 enrichment pass end to end.

    Loads the Batch 1 YAML, matches institutions still lacking a Wikidata
    identifier via alternative-name fuzzy matching, merges match data, and
    writes the Batch 2 YAML plus a coverage report.
    """
    print("=" * 80)
    print("šŸ‡¬šŸ‡Ŗ Georgia Heritage Institutions Enrichment - Batch 2")
    print("=" * 80)
    print()
    print("Strategy: Alternative name matching with lower threshold (0.80)")
    print("Target: 10 institutions without Wikidata matches")
    print()

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch1.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch2.yaml"

    # Load previous batch results
    print("šŸ“‚ Loading Batch 1 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # Split into already-enriched vs still-needing-enrichment
    needs_enrichment = [inst for inst in institutions if not _has_wikidata_identifier(inst)]
    already_enriched = len(institutions) - len(needs_enrichment)
    print(f" āœ… Already enriched: {already_enriched} institutions")
    print(f" ā³ Need enrichment: {len(needs_enrichment)} institutions")
    print()

    # Query Wikidata
    print("🌐 Querying Wikidata with alternative names support...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" āœ… Found {len(wikidata_results)} institutions in Wikidata")
    print()

    # Fuzzy matching with alternative names
    print("šŸ” Matching with alternative names (threshold: 0.80)...")
    print()
    new_matches = 0
    for i, inst in enumerate(needs_enrichment, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')
        print(f"{i:2d}. {inst_name} ({inst_type})")

        # Check for alternative names
        alt_names = inst.get('alternative_names', [])
        if alt_names:
            print(f" Alternative names: {len(alt_names)}")

        # Try matching
        match_result = find_best_match(inst, wikidata_results)
        if match_result:
            wd_data, score, matched_name = match_result
            qid = wd_data.get('qid', '')
            print(f" āœ… Matched: {wd_data.get('name')} ({qid})")
            print(f" Match: {matched_name}")
            print(f" Score: {score:.2f}")
            _apply_wikidata_match(inst, wd_data, score)
            new_matches += 1
        else:
            print(f" āš ļø No match found (tried {1 + len(alt_names)} name variants)")
        print()

    # Save results
    print("šŸ’¾ Saving Batch 2 results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" āœ… Saved to: {output_file}")
    print()

    # Report
    total_enriched = already_enriched + new_matches
    total_institutions = len(institutions)
    # FIX: guard against an empty input file (ZeroDivisionError)
    coverage = (total_enriched / total_institutions * 100) if total_institutions else 0.0
    print("=" * 80)
    print("šŸ“Š BATCH 2 RESULTS")
    print("=" * 80)
    print()
    print(f"Batch 1 enriched: {already_enriched}")
    print(f"Batch 2 new matches: {new_matches}")
    print(f"Total enriched: {total_enriched} ({coverage:.1f}%)")
    print(f"Still need enrichment: {total_institutions - total_enriched}")
    print()
    if total_enriched >= 7:
        print("āœ… SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"āš ļø Below target: {7 - total_enriched} more matches needed")
    print()


if __name__ == "__main__":
    main()