glam/scripts/fix_egypt_false_positives.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

90 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""
Remove false positive Wikidata matches from Egypt enrichment.

False Positives Detected:
- Al-Azhar University Library → October 6 University Library (Q117847870) ❌
- Nile University Library → October 6 University Library (Q117847870) ❌

Also flag questionable match for manual review:
- Museum of Islamic Art Cairo → Museum of Islamic Ceramics (Q6940902) ⚠️
"""
from pathlib import Path

import yaml

# File locations for the enriched input and the corrected output datasets.
INPUT_FILE = Path("data/instances/egypt_institutions_wikidata_enriched.yaml")
OUTPUT_FILE = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")

# University libraries that were fuzzy-matched to the wrong university's entity.
FALSE_POSITIVE_NAMES = frozenset({'Al-Azhar University Library', 'Nile University Library'})
FALSE_POSITIVE_QID = 'Q117847870'  # October 6 University Library
# A plausible but unconfirmed match: kept, but annotated for manual review.
QUESTIONABLE_NAME = 'Museum of Islamic Art Cairo'
QUESTIONABLE_QID = 'Q6940902'  # Museum of Islamic Ceramics


def review_matches(institutions):
    """Strip known false-positive Wikidata IDs and annotate questionable ones.

    Mutates each institution dict in ``institutions`` in place: known false
    positives lose their Wikidata identifier entries entirely, while the
    questionable match keeps its identifier but gains a
    ``provenance.wikidata_match_note`` explaining why it needs verification.

    Args:
        institutions: list of institution dicts with optional ``name``,
            ``identifiers`` (list of dicts with ``identifier_scheme`` /
            ``identifier_value``) and ``provenance`` keys.

    Returns:
        tuple[int, int]: (number of false positives removed, number flagged).
    """
    removed_count = 0
    flagged_count = 0
    for inst in institutions:
        name = inst.get('name', '')
        identifiers = inst.get('identifiers', [])
        wd_ids = [i for i in identifiers if i.get('identifier_scheme') == 'Wikidata']
        if not wd_ids:
            continue  # nothing to review without a Wikidata identifier
        # Only the first Wikidata identifier is checked, matching the
        # enrichment step that assigns at most one per institution.
        wd_qid = wd_ids[0].get('identifier_value', '')

        if name in FALSE_POSITIVE_NAMES and wd_qid == FALSE_POSITIVE_QID:
            # FALSE POSITIVE: university library matched to the wrong university.
            print(f"❌ REMOVED: {name}")
            print(f" False match: {wd_qid} (October 6 University Library)")
            print(" Reason: Wrong university\n")
            # Drop every Wikidata identifier from this institution.
            inst['identifiers'] = [
                i for i in identifiers if i.get('identifier_scheme') != 'Wikidata'
            ]
            removed_count += 1
        elif name == QUESTIONABLE_NAME and wd_qid == QUESTIONABLE_QID:
            # QUESTIONABLE: match is kept, but recorded for manual verification.
            print(f"⚠️ FLAGGED: {name}")
            print(f" Match: {wd_qid} (Museum of Islamic Ceramics)")
            print(" Score: 0.755")
            print(" Action: Keeping match but needs manual verification")
            print(" Note: Are 'Museum of Islamic Art' and 'Museum of Islamic Ceramics' the same?\n")
            # Record the doubt in provenance so a curator can resolve it later.
            inst.setdefault('provenance', {})['wikidata_match_note'] = (
                "Fuzzy match to 'Museum of Islamic Ceramics' (Q6940902) with score 0.755. "
                "Manual verification required to confirm these are the same institution."
            )
            flagged_count += 1
    return removed_count, flagged_count


def wikidata_coverage(institutions):
    """Return (institutions with a Wikidata ID, total count, coverage percent).

    Coverage is 0 for an empty dataset rather than dividing by zero.
    """
    total = len(institutions)
    with_wikidata = sum(
        1 for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )
    coverage = (with_wikidata / total * 100) if total > 0 else 0
    return with_wikidata, total, coverage


def main():
    """Load the enriched dataset, correct it, write it out, and report stats."""
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print("🔍 Reviewing Wikidata matches for false positives...\n")
    removed_count, flagged_count = review_matches(institutions)

    # Write corrected dataset.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)

    # Calculate corrected coverage and print the summary report.
    with_wikidata, total, coverage = wikidata_coverage(institutions)
    print("=" * 80)
    print("📊 CORRECTION RESULTS")
    print("=" * 80)
    print(f"❌ False positives removed: {removed_count}")
    print(f"⚠️ Matches flagged for review: {flagged_count}")
    print(f"✅ Confirmed matches: {with_wikidata - flagged_count}")
    print(f"\n📈 Corrected Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})")
    print(f"\n💾 Output: {OUTPUT_FILE}")
    print("=" * 80)


if __name__ == "__main__":
    main()