- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Remove false positive Wikidata matches from Egypt enrichment.
|
|
|
|
False Positives Detected:
|
|
- Al-Azhar University Library → October 6 University Library (Q117847870) ❌
|
|
- Nile University Library → October 6 University Library (Q117847870) ❌
|
|
|
|
Also flag questionable match for manual review:
|
|
- Museum of Islamic Art Cairo → Museum of Islamic Ceramics (Q6940902) ⚠️
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
# Locations of the enriched input dataset and of the corrected output dataset.
input_file = Path("data/instances/egypt_institutions_wikidata_enriched.yaml")
output_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")


def _identifiers_of(inst: dict) -> list:
    """Return the institution's identifier list, or [] when missing or null.

    A YAML key written as ``identifiers:`` with no value loads as ``None``;
    ``dict.get(key, [])`` would hand that ``None`` back, so normalise it here.
    """
    return inst.get('identifiers') or []


def review_matches(institutions: list) -> tuple:
    """Remove known false-positive Wikidata matches and flag a questionable one.

    The institution records are modified in place:

    * 'Al-Azhar University Library' and 'Nile University Library' lose all of
      their Wikidata identifiers when their first Wikidata identifier is
      Q117847870 (October 6 University Library).
    * 'Museum of Islamic Art Cairo' keeps its match to Q6940902 but receives a
      ``provenance['wikidata_match_note']`` asking for manual verification.

    A progress message is printed for every record that is changed.

    Args:
        institutions: list of institution dicts as loaded from the YAML file.

    Returns:
        ``(removed_count, flagged_count)``.
    """
    removed_count = 0
    flagged_count = 0

    for inst in institutions:
        name = inst.get('name', '')
        identifiers = _identifiers_of(inst)

        # Find Wikidata identifiers
        wd_ids = [i for i in identifiers if i.get('identifier_scheme') == 'Wikidata']
        if not wd_ids:
            continue

        # Only the first Wikidata identifier is inspected.
        wd_qid = wd_ids[0].get('identifier_value', '')

        # FALSE POSITIVE: University libraries matched to wrong university
        if name in ('Al-Azhar University Library', 'Nile University Library'):
            if wd_qid == 'Q117847870':  # October 6 University Library
                print(f"❌ REMOVED: {name}")
                print(f" False match: {wd_qid} (October 6 University Library)")
                print(" Reason: Wrong university\n")

                # Remove Wikidata identifier(s); other schemes are kept.
                inst['identifiers'] = [
                    i for i in identifiers if i.get('identifier_scheme') != 'Wikidata'
                ]
                removed_count += 1

        # QUESTIONABLE: Museum of Islamic Art Cairo
        elif name == 'Museum of Islamic Art Cairo':
            if wd_qid == 'Q6940902':  # Museum of Islamic Ceramics
                print(f"⚠️ FLAGGED: {name}")
                print(f" Match: {wd_qid} (Museum of Islamic Ceramics)")
                print(" Score: 0.755")
                print(" Action: Keeping match but needs manual verification")
                print(" Note: Are 'Museum of Islamic Art' and 'Museum of Islamic Ceramics' the same?\n")

                # Add note to provenance. A missing key and an explicit null
                # (``provenance:`` with no value) are both replaced by a dict.
                if inst.get('provenance') is None:
                    inst['provenance'] = {}

                inst['provenance']['wikidata_match_note'] = (
                    "Fuzzy match to 'Museum of Islamic Ceramics' (Q6940902) with score 0.755. "
                    "Manual verification required to confirm these are the same institution."
                )
                flagged_count += 1

    return removed_count, flagged_count


def wikidata_coverage(institutions: list) -> tuple:
    """Compute how many institutions carry at least one Wikidata identifier.

    Args:
        institutions: list of institution dicts.

    Returns:
        ``(total, with_wikidata, coverage)`` where ``coverage`` is a percentage
        (0 when the list is empty).
    """
    total = len(institutions)
    with_wikidata = sum(
        1 for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in _identifiers_of(inst))
    )
    coverage = (with_wikidata / total * 100) if total > 0 else 0
    return total, with_wikidata, coverage


def main() -> None:
    """Load the enriched dataset, correct it, write it out and print a summary."""
    # Load enriched dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # safe_load yields None for an empty document; treat that as "no records".
    if institutions is None:
        institutions = []
    if not isinstance(institutions, list):
        raise TypeError(
            f"Expected a list of institutions in {input_file}, "
            f"got {type(institutions).__name__}"
        )

    print("🔍 Reviewing Wikidata matches for false positives...\n")

    removed_count, flagged_count = review_matches(institutions)

    # Write corrected dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)

    # Calculate corrected coverage
    total, with_wikidata, coverage = wikidata_coverage(institutions)

    print("="*80)
    print("📊 CORRECTION RESULTS")
    print("="*80)
    print(f"❌ False positives removed: {removed_count}")
    print(f"⚠️ Matches flagged for review: {flagged_count}")
    print(f"✅ Confirmed matches: {with_wikidata - flagged_count}")
    print(f"\n📈 Corrected Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})")
    print(f"\n💾 Output: {output_file}")
    print("="*80)


if __name__ == "__main__":
    main()
|
|
|