- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
103 lines
3.2 KiB
Python
103 lines
3.2 KiB
Python
#!/usr/bin/env python3
"""
Add manually discovered Wikidata identifiers from direct SPARQL searches.

FOUND:
- Egyptian Museum Cairo (EMC) → Q201219 (Egyptian Museum)
- Grand Egyptian Museum (GEM) → Q2583681 (Grand Egyptian Museum)
- Bibliotheca Alexandrina → Q501851 (Bibliotheca Alexandrina)

The dataset file is read, enriched in place, and written back to the same
path. Each added identifier also records a provenance note with the
discovery method and an UTC timestamp.
"""

from datetime import datetime, timezone
from pathlib import Path

# Dataset that is both read and overwritten (enriched in place).
input_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")

# Hand-verified institution name → Wikidata match. Keys must equal the
# 'name' field in the dataset exactly for the match to apply.
manual_matches = {
    "Egyptian Museum Cairo (EMC)": {
        "qid": "Q201219",
        "label": "Egyptian Museum",
        "method": "Direct SPARQL search"
    },
    "Grand Egyptian Museum (GEM)": {
        "qid": "Q2583681",
        "label": "Grand Egyptian Museum",
        "method": "Direct SPARQL search"
    },
    "Bibliotheca Alexandrina": {
        "qid": "Q501851",
        "label": "Bibliotheca Alexandrina",
        "method": "Direct SPARQL search"
    }
}


def has_wikidata_identifier(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    return any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))


def enrich_institution(inst, match):
    """Add a Wikidata identifier and provenance note to *inst* in place.

    Args:
        inst: institution record (mutable dict from the YAML dataset).
        match: dict with 'qid', 'label' and 'method' keys.

    Returns:
        True if an identifier was added; False if the record already had
        a Wikidata identifier (in which case nothing is modified).
    """
    if has_wikidata_identifier(inst):
        return False

    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': match['qid'],
        'identifier_url': f"https://www.wikidata.org/wiki/{match['qid']}",
    })

    # Record how/when this identifier was discovered so the enrichment
    # is auditable later.
    inst.setdefault('provenance', {})['wikidata_enrichment'] = {
        'method': match['method'],
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'wikidata_label': match['label'],
        'verified': True,
    }
    return True


def wikidata_coverage(institutions):
    """Return (count_with_wikidata, percentage) for *institutions*.

    Percentage is 0.0 for an empty list (avoids division by zero).
    """
    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions
                        if has_wikidata_identifier(inst))
    coverage = (with_wikidata / total * 100) if total > 0 else 0.0
    return with_wikidata, coverage


def main():
    # Local import: PyYAML is only needed for the file I/O in this driver,
    # so importing this module for testing does not require it.
    import yaml

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print("="*80)
    print("🔍 Adding manually discovered Wikidata identifiers")
    print("="*80)
    print()

    added_count = 0
    for inst in institutions:
        name = inst.get('name', '')
        match = manual_matches.get(name)
        if match is not None and enrich_institution(inst, match):
            print(f"✅ ADDED: {name}")
            print(f"   Q-number: {match['qid']} ({match['label']})")
            print(f"   Method: {match['method']}\n")
            added_count += 1

    # Write the updated dataset back to the same file.
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  width=120)

    with_wikidata, coverage = wikidata_coverage(institutions)
    total = len(institutions)

    print("="*80)
    print("📊 MANUAL ENRICHMENT RESULTS")
    print("="*80)
    print(f"✅ Institutions added: {added_count}")
    print(f"📈 Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})")
    print(f"\n💾 Updated: {input_file}")
    print("="*80)


if __name__ == "__main__":
    main()