#!/usr/bin/env python3
"""
Georgia Enrichment Batch 3 - Manual corrections and targeted searches

Manual corrections:
1. Remove incorrect match: Tbilisi Main Library → Tbilisi Wine Museum (Q121759846)
2. Add targeted manual Wikidata searches for specific institutions

Targeted searches:
- National Parliamentary Library (LEPL Ilia Chavchavadze National Library)
- Stalin Museum (Joseph Stalin Museum, Gori)
- Georgian National Museum (network)
- Open Air Museum of Ethnography
"""
import sys
from pathlib import Path
from typing import Any, Dict, List

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# Manual Wikidata matches found through web search.
# Keyed by the institution name exactly as it appears in the batch-2 YAML.
MANUAL_MATCHES: Dict[str, Dict[str, Any]] = {
    "National Parliamentary Library of Georgia": {
        "qid": "Q1967614",
        "name": "National Parliamentary Library of Georgia",
        "description": "National library of Georgia in Tbilisi",
        "latitude": 41.7215,
        "longitude": 44.7628,
        "identifiers": {
            "ISIL": "GE-1001",
            "VIAF": "140817700"
        }
    },
    "Stalin Museum Archive": {
        "qid": "Q835621",
        "name": "Joseph Stalin Museum",
        "description": "Museum in Gori, Georgia dedicated to Joseph Stalin",
        "latitude": 41.9844,
        "longitude": 44.1088,
        "founding_date": "1937-01-01"
    },
    "Georgian National Museum": {
        "qid": "Q1508648",
        "name": "Georgian National Museum",
        "description": "Network of museums in Georgia",
        "latitude": 41.6938,
        "longitude": 44.8007,
        "founding_date": "2004-12-30",
        "identifiers": {
            "Website": "https://museum.ge"
        }
    },
    "Open Air Museum of Ethnography": {
        "qid": "Q1283537",
        "name": "Open Air Museum of Ethnography",
        "description": "Ethnographic museum in Tbilisi, Georgia",
        "latitude": 41.7097,
        "longitude": 44.7525,
        "founding_date": "1966-04-27"
    }
}


def _has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if the institution already carries a Wikidata identifier."""
    return any(
        identifier.get('identifier_scheme') == 'Wikidata'
        for identifier in inst.get('identifiers', [])
    )


def _add_identifiers(inst: Dict[str, Any], manual_data: Dict[str, Any], qid: str) -> None:
    """Attach the Wikidata QID plus any extra identifiers from the manual record."""
    inst.setdefault('identifiers', [])
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
    })
    for scheme, value in manual_data.get('identifiers', {}).items():
        entry: Dict[str, Any] = {
            'identifier_scheme': scheme,
            'identifier_value': value
        }
        if scheme == 'Website':
            # A website identifier doubles as its own URL.
            entry['identifier_url'] = value
        inst['identifiers'].append(entry)


def _apply_coordinates(inst: Dict[str, Any], manual_data: Dict[str, Any]) -> None:
    """Write manual coordinates onto the first location, creating one if needed."""
    if 'latitude' in manual_data and 'longitude' in manual_data:
        if not inst.get('locations'):
            inst['locations'] = [{'country': 'GE'}]
        inst['locations'][0]['latitude'] = manual_data['latitude']
        inst['locations'][0]['longitude'] = manual_data['longitude']
        print(f" 📍 Coordinates: {manual_data['latitude']:.4f}, {manual_data['longitude']:.4f}")


def _record_provenance(inst: Dict[str, Any]) -> None:
    """Append an enrichment-history entry documenting this manual match."""
    inst.setdefault('provenance', {})
    history = inst['provenance'].setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': '2025-11-09T00:00:00Z',
        'enrichment_method': 'Manual Wikidata verification and matching',
        'match_score': 1.0,
        'verified': True
    })


def remove_incorrect_matches(institutions: List[Dict[str, Any]]) -> int:
    """Remove incorrect Wikidata matches.

    Currently strips the wrong Q121759846 (Tbilisi Wine Museum) identifier
    from "Tbilisi Main Library".

    Args:
        institutions: Institution records, mutated in place.

    Returns:
        Number of incorrect identifiers actually removed.
    """
    corrections = 0
    for inst in institutions:
        inst_name = inst.get('name', '')
        # Remove Tbilisi Main Library → Tbilisi Wine Museum match
        if inst_name == "Tbilisi Main Library" and 'identifiers' in inst:
            kept = [
                i for i in inst['identifiers']
                if not (i.get('identifier_scheme') == 'Wikidata'
                        and i.get('identifier_value') == 'Q121759846')
            ]
            # Only count (and report) a correction when the bad identifier was
            # actually present — the previous version incremented unconditionally,
            # misreporting on already-clean data or repeated runs.
            if len(kept) < len(inst['identifiers']):
                inst['identifiers'] = kept
                corrections += 1
                print(f" 🔧 Removed incorrect match: {inst_name} → Tbilisi Wine Museum")
    return corrections


def apply_manual_matches(institutions: List[Dict[str, Any]]) -> int:
    """Apply manual Wikidata matches.

    Institutions that already carry a Wikidata identifier are left untouched.
    For the rest, this adds identifiers, coordinates, founding date, a
    description (only if missing), and a provenance entry.

    Args:
        institutions: Institution records, mutated in place.

    Returns:
        Number of institutions enriched.
    """
    matches_applied = 0
    for inst in institutions:
        inst_name = inst.get('name', '')
        if inst_name not in MANUAL_MATCHES or _has_wikidata(inst):
            continue

        manual_data = MANUAL_MATCHES[inst_name]
        qid = manual_data['qid']
        print(f"\n ✅ Applying manual match: {inst_name}")
        print(f" → {manual_data['name']} ({qid})")

        _add_identifiers(inst, manual_data, qid)
        _apply_coordinates(inst, manual_data)

        if 'founding_date' in manual_data:
            inst['founding_date'] = manual_data['founding_date']
            print(f" 📅 Founded: {manual_data['founding_date']}")

        # Only fill in a description when the record has none.
        if not inst.get('description') and manual_data.get('description'):
            inst['description'] = manual_data['description']
            print(f" 📝 Description: {manual_data['description'][:60]}...")

        _record_provenance(inst)
        matches_applied += 1
    return matches_applied


def main():
    """Run the batch-3 pipeline: load YAML, correct, enrich, save, report."""
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 3")
    print("=" * 80)
    print()
    print("Strategy: Manual corrections + targeted Wikidata searches")
    print()

    # PyYAML is only needed for file I/O; importing it here keeps the pure
    # data-munging helpers above importable (and unit-testable) without it.
    import yaml

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch2.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch3_final.yaml"

    # Load Batch 2 results
    print("📂 Loading Batch 2 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(institutions)} institutions")
    print()

    # Step 1: Remove incorrect matches
    print("🔧 Removing incorrect matches...")
    corrections = remove_incorrect_matches(institutions)
    print(f" ✅ Removed {corrections} incorrect matches")
    print()

    # Step 2: Apply manual matches
    print("✍️ Applying manual Wikidata matches...")
    new_matches = apply_manual_matches(institutions)
    print()
    print(f" ✅ Applied {new_matches} manual matches")
    print()

    # Save results
    print("💾 Saving Batch 3 (final) results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()

    # Count final enrichment
    enriched_count = sum(1 for inst in institutions if _has_wikidata(inst))

    # Report
    print("=" * 80)
    print("📊 FINAL GEORGIA ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()

    # 50%+ coverage goal. The previous version hard-coded 7 — presumably half
    # of this 14-institution batch (TODO confirm); deriving it keeps the
    # report honest if the batch size ever changes.
    target = (len(institutions) + 1) // 2
    if enriched_count >= target:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        print()
        print("Phase 1 Georgia proof-of-concept: COMPLETE ✅")
    else:
        print(f"⚠️ Below target: {target - enriched_count} more matches needed")
    print()
    print("Next steps:")
    print("1. Update unified global dataset with enriched Georgian records")
    print("2. Apply same methodology to other critical countries (GB, BE, US, LU)")
    print("3. Proceed to Phase 2: North Africa enrichment")
    print()


if __name__ == "__main__":
    main()