#!/usr/bin/env python3
"""Remove false positive Wikidata matches from Egypt enrichment.

False Positives Detected:
- Al-Azhar University Library → October 6 University Library (Q117847870) ❌
- Nile University Library → October 6 University Library (Q117847870) ❌

Also flag questionable match for manual review:
- Museum of Islamic Art Cairo → Museum of Islamic Ceramics (Q6940902) ⚠️
"""

from pathlib import Path

# QID that Al-Azhar / Nile university libraries were wrongly matched to.
FALSE_POSITIVE_QID = 'Q117847870'  # October 6 University Library
FALSE_POSITIVE_NAMES = ('Al-Azhar University Library', 'Nile University Library')

# Fuzzy match that is plausible but unconfirmed — kept, flagged for review.
QUESTIONABLE_QID = 'Q6940902'  # Museum of Islamic Ceramics
QUESTIONABLE_NAME = 'Museum of Islamic Art Cairo'

INPUT_FILE = Path("data/instances/egypt_institutions_wikidata_enriched.yaml")
OUTPUT_FILE = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")


def apply_corrections(institutions):
    """Remove known false-positive Wikidata matches and flag questionable ones.

    Mutates *institutions* (a list of institution dicts) in place:
    - strips the Wikidata identifier from known false-positive matches;
    - adds a ``provenance.wikidata_match_note`` to the flagged fuzzy match.

    Returns:
        tuple[int, int]: (removed_count, flagged_count).
    """
    removed_count = 0
    flagged_count = 0

    for inst in institutions:
        name = inst.get('name', '')
        identifiers = inst.get('identifiers', [])

        # Find Wikidata identifiers; institutions without one need no review.
        wd_ids = [i for i in identifiers if i.get('identifier_scheme') == 'Wikidata']
        if not wd_ids:
            continue
        wd_qid = wd_ids[0].get('identifier_value', '')

        # FALSE POSITIVE: university libraries matched to the wrong university.
        if name in FALSE_POSITIVE_NAMES and wd_qid == FALSE_POSITIVE_QID:
            print(f"❌ REMOVED: {name}")
            print(f"   False match: {wd_qid} (October 6 University Library)")
            print("   Reason: Wrong university\n")
            # Drop only the Wikidata identifier; keep all other schemes.
            inst['identifiers'] = [
                i for i in identifiers if i.get('identifier_scheme') != 'Wikidata'
            ]
            removed_count += 1

        # QUESTIONABLE: kept, but recorded in provenance for manual review.
        elif name == QUESTIONABLE_NAME and wd_qid == QUESTIONABLE_QID:
            print(f"⚠️ FLAGGED: {name}")
            print(f"   Match: {wd_qid} (Museum of Islamic Ceramics)")
            print("   Score: 0.755")
            print("   Action: Keeping match but needs manual verification")
            print("   Note: Are 'Museum of Islamic Art' and 'Museum of Islamic Ceramics' the same?\n")
            inst.setdefault('provenance', {})['wikidata_match_note'] = (
                "Fuzzy match to 'Museum of Islamic Ceramics' (Q6940902) with score 0.755. "
                "Manual verification required to confirm these are the same institution."
            )
            flagged_count += 1

    return removed_count, flagged_count


def wikidata_coverage(institutions):
    """Return (count with a Wikidata identifier, coverage percent).

    Coverage is 0.0 for an empty list to avoid division by zero.
    """
    total = len(institutions)
    with_wikidata = sum(
        1
        for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )
    coverage = (with_wikidata / total * 100) if total > 0 else 0
    return with_wikidata, coverage


def main():
    """Load the enriched dataset, correct it, and write the result."""
    import yaml  # local import: only needed when run as a script

    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to an empty list.
        institutions = yaml.safe_load(f) or []

    print("🔍 Reviewing Wikidata matches for false positives...\n")
    removed_count, flagged_count = apply_corrections(institutions)

    # Write corrected dataset.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)

    # Calculate corrected coverage.
    total = len(institutions)
    with_wikidata, coverage = wikidata_coverage(institutions)

    print("=" * 80)
    print("📊 CORRECTION RESULTS")
    print("=" * 80)
    print(f"❌ False positives removed: {removed_count}")
    print(f"⚠️ Matches flagged for review: {flagged_count}")
    print(f"✅ Confirmed matches: {with_wikidata - flagged_count}")
    print(f"\n📈 Corrected Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})")
    print(f"\n💾 Output: {OUTPUT_FILE}")
    print("=" * 80)


if __name__ == "__main__":
    main()