#!/usr/bin/env python3
"""Generate comprehensive Egypt enrichment summary.

Loads the institution records from the YAML file named by ``input_file``,
splits them by whether they carry an identifier whose
``identifier_scheme`` is ``'Wikidata'``, and prints a plain-text report to
stdout: overall coverage statistics, the matched institutions with their
Q-numbers, the priority institutions still lacking a match, and static
methodology / recommendation notes.
"""

from pathlib import Path

import yaml

# Horizontal rule used to frame every section title in the report.
RULE = "=" * 100


def _print_section(title):
    """Print a section title framed by two rules and followed by a blank line."""
    print(RULE)
    print(title)
    print(RULE)
    print()


input_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")

with open(input_file, 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; treat that as "no records"
    # so the report still renders instead of failing on len(None).
    institutions = yaml.safe_load(f) or []

total = len(institutions)
with_wikidata = []
without_wikidata = []

for inst in institutions:
    # `or []` also covers an explicit `identifiers: null` entry in the YAML.
    identifiers = inst.get('identifiers') or []
    wd_ids = [i for i in identifiers if i.get('identifier_scheme') == 'Wikidata']

    if wd_ids:
        with_wikidata.append({
            'name': inst.get('name'),
            # Only the first Wikidata identifier is reported.
            'qid': wd_ids[0].get('identifier_value'),
            'type': inst.get('institution_type'),
        })
    else:
        without_wikidata.append({
            'name': inst.get('name'),
            'type': inst.get('institution_type'),
        })

# With no records both shares are reported as 0 rather than 0% / 100%.
coverage = (len(with_wikidata) / total * 100) if total > 0 else 0
missing_share = (100 - coverage) if total > 0 else 0

_print_section("🇪🇬 EGYPT WIKIDATA ENRICHMENT - FINAL REPORT")

print("📊 Overall Statistics:")
print(f" Total institutions: {total}")
print(f" With Wikidata identifiers: {len(with_wikidata)} ({coverage:.1f}%)")
print(f" Without Wikidata identifiers: {len(without_wikidata)} ({missing_share:.1f}%)")
print()

# Section counts are derived from the data so the headers cannot drift from
# the statistics printed above.
_print_section(f"✅ INSTITUTIONS WITH WIKIDATA ({len(with_wikidata)} institutions)")

for idx, inst in enumerate(with_wikidata, 1):
    print(f"{idx}. {inst['name']}")
    print(f" Q-number: {inst['qid']}")
    print(f" Type: {inst['type']}")
    print(f" Link: https://www.wikidata.org/wiki/{inst['qid']}")
    print()

_print_section(f"❌ INSTITUTIONS WITHOUT WIKIDATA ({len(without_wikidata)} institutions)")

print("Major institutions missing from Wikidata:")
print()

priority_missing = [
    "Bibliotheca Alexandrina",
    "Egyptian National Library and Archives (Dar al-Kutub)",
    "National Archives of Egypt",
    "Egyptian Museum Cairo (EMC)",
    "Grand Egyptian Museum (GEM)",
]

# Index unmatched institutions by name, keeping the first record for each
# name, so every priority lookup is a single dictionary access.
unmatched_by_name = {}
for inst in without_wikidata:
    unmatched_by_name.setdefault(inst['name'], inst)

for name in priority_missing:
    inst = unmatched_by_name.get(name)
    if inst is not None:
        print(f" • {inst['name']} ({inst['type']})")

print()

_print_section("🔍 ENRICHMENT METHODOLOGY")

# Static narrative of the enrichment run; these figures are a fixed record
# and are intentionally not recomputed from the input file.
methodology_lines = (
    "1. ✅ Automated fuzzy matching (threshold: 0.75)",
    " - 6 initial matches found",
    "",
    "2. ❌ Manual false positive removal",
    " - Removed: Al-Azhar University Library → October 6 University (wrong institution)",
    " - Removed: Nile University Library → October 6 University (wrong institution)",
    "",
    "3. ✅ Manual SPARQL verification",
    " - Corrected: Museum of Islamic Art Cairo",
    " Q6940902 (Museum of Islamic Ceramics) → Q3330629 (Museum of Islamic Art) ✓",
    "",
    "4. ✅ Final validated matches: 4",
    "",
)
for line in methodology_lines:
    print(line)

_print_section("💡 RECOMMENDATIONS")

recommendation_lines = (
    "Next steps to improve coverage:",
    "",
    "A. Create missing Wikidata entities for major institutions:",
    " - Egyptian Museum Cairo (EMC)",
    " - Grand Egyptian Museum (GEM)",
    " - Bibliotheca Alexandrina (as library, not museum)",
    " - Egyptian National Library and Archives",
    " - National Archives of Egypt",
    "",
    "B. Alternative enrichment sources:",
    " - VIAF (Virtual International Authority File)",
    " - ISIL codes (International Standard Identifier for Libraries)",
    " - GeoNames for location data",
    " - Institutional websites (crawl4ai extraction)",
    "",
    "C. Manual research:",
    " - University library websites for system details",
    " - Regional museum registries",
    " - UNESCO heritage databases",
    "",
)
for line in recommendation_lines:
    print(line)

print(RULE)