- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
119 lines
3.7 KiB
Python
119 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate comprehensive Egypt enrichment summary."""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
# Dataset summarised by this script (path is relative to the working directory).
input_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")

# Names of major institutions to call out in the "missing" section. A name is
# only listed in the report when an institution with exactly this name has no
# Wikidata identifier in the dataset.
priority_missing = [
    "Bibliotheca Alexandrina",
    "Egyptian National Library and Archives (Dar al-Kutub)",
    "National Archives of Egypt",
    "Egyptian Museum Cairo (EMC)",
    "Grand Egyptian Museum (GEM)",
]

_RULE = "=" * 100

# Fixed narrative describing how the enrichment was carried out. This is a
# record of a past process, so its figures are intentionally not recomputed.
_METHODOLOGY_LINES = (
    "1. ✅ Automated fuzzy matching (threshold: 0.75)",
    " - 6 initial matches found",
    "",
    "2. ❌ Manual false positive removal",
    " - Removed: Al-Azhar University Library → October 6 University (wrong institution)",
    " - Removed: Nile University Library → October 6 University (wrong institution)",
    "",
    "3. ✅ Manual SPARQL verification",
    " - Corrected: Museum of Islamic Art Cairo",
    " Q6940902 (Museum of Islamic Ceramics) → Q3330629 (Museum of Islamic Art) ✓",
    "",
    "4. ✅ Final validated matches: 4",
    "",
)

# Fixed list of suggested follow-up actions.
_RECOMMENDATION_LINES = (
    "Next steps to improve coverage:",
    "",
    "A. Create missing Wikidata entities for major institutions:",
    " - Egyptian Museum Cairo (EMC)",
    " - Grand Egyptian Museum (GEM)",
    " - Bibliotheca Alexandrina (as library, not museum)",
    " - Egyptian National Library and Archives",
    " - National Archives of Egypt",
    "",
    "B. Alternative enrichment sources:",
    " - VIAF (Virtual International Authority File)",
    " - ISIL codes (International Standard Identifier for Libraries)",
    " - GeoNames for location data",
    " - Institutional websites (crawl4ai extraction)",
    "",
    "C. Manual research:",
    " - University library websites for system details",
    " - Regional museum registries",
    " - UNESCO heritage databases",
    "",
)


def load_institutions(path):
    """Load the institution records from the YAML file at *path*.

    Returns an empty list when the document is empty, because
    ``yaml.safe_load`` yields ``None`` in that case.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []


def split_by_wikidata(institutions):
    """Partition institution records by presence of a Wikidata identifier.

    Each record is a mapping that may carry ``name``, ``institution_type`` and
    ``identifiers`` (a list of mappings with ``identifier_scheme`` and
    ``identifier_value``). A missing or null ``identifiers`` entry is treated
    as "no identifiers".

    Returns a ``(with_wikidata, without_wikidata)`` tuple of lists of dicts.
    Entries in the first list have the keys ``name``, ``qid`` and ``type``;
    entries in the second have ``name`` and ``type``. When a record has
    several Wikidata identifiers, the first one is used as ``qid``.
    """
    with_wikidata = []
    without_wikidata = []

    for inst in institutions:
        # `or []` also covers an explicit `identifiers: null` in the YAML.
        identifiers = inst.get('identifiers') or []
        wd_ids = [i for i in identifiers if i.get('identifier_scheme') == 'Wikidata']

        if wd_ids:
            with_wikidata.append({
                'name': inst.get('name'),
                'qid': wd_ids[0].get('identifier_value'),
                'type': inst.get('institution_type'),
            })
        else:
            without_wikidata.append({
                'name': inst.get('name'),
                'type': inst.get('institution_type'),
            })

    return with_wikidata, without_wikidata


def build_report(with_wikidata, without_wikidata):
    """Return the enrichment report as a list of output lines.

    *with_wikidata* and *without_wikidata* are the lists produced by
    :func:`split_by_wikidata`. The counts shown in the section headers are
    derived from these lists so they always agree with the statistics block.
    """
    matched = len(with_wikidata)
    unmatched = len(without_wikidata)
    total = matched + unmatched

    if total > 0:
        coverage = matched / total * 100
        missing_pct = 100 - coverage
    else:
        # Avoid division by zero, and avoid reporting "100%" of nothing.
        coverage = 0
        missing_pct = 0

    lines = [
        _RULE,
        "🇪🇬 EGYPT WIKIDATA ENRICHMENT - FINAL REPORT",
        _RULE,
        "",
        "📊 Overall Statistics:",
        f" Total institutions: {total}",
        f" With Wikidata identifiers: {matched} ({coverage:.1f}%)",
        f" Without Wikidata identifiers: {unmatched} ({missing_pct:.1f}%)",
        "",
        _RULE,
        f"✅ INSTITUTIONS WITH WIKIDATA ({matched} institutions)",
        _RULE,
        "",
    ]

    for idx, inst in enumerate(with_wikidata, 1):
        lines.extend([
            f"{idx}. {inst['name']}",
            f" Q-number: {inst['qid']}",
            f" Type: {inst['type']}",
            f" Link: https://www.wikidata.org/wiki/{inst['qid']}",
            "",
        ])

    lines.extend([
        _RULE,
        f"❌ INSTITUTIONS WITHOUT WIKIDATA ({unmatched} institutions)",
        _RULE,
        "",
        "Major institutions missing from Wikidata:",
        "",
    ])

    for name in priority_missing:
        # Only the first record with a given name is reported.
        inst = next((i for i in without_wikidata if i['name'] == name), None)
        if inst is not None:
            lines.append(f" • {inst['name']} ({inst['type']})")

    lines.extend(["", _RULE, "🔍 ENRICHMENT METHODOLOGY", _RULE, ""])
    lines.extend(_METHODOLOGY_LINES)
    lines.extend([_RULE, "💡 RECOMMENDATIONS", _RULE, ""])
    lines.extend(_RECOMMENDATION_LINES)
    lines.append(_RULE)
    return lines


def main():
    """Load the dataset from ``input_file`` and print the report to stdout."""
    institutions = load_institutions(input_file)
    with_wikidata, without_wikidata = split_by_wikidata(institutions)
    for line in build_report(with_wikidata, without_wikidata):
        print(line)


if __name__ == "__main__":
    main()
|
|
|