glam/scripts/add_manual_wikidata.py
#!/usr/bin/env python3
"""
Add manually discovered Wikidata identifiers from direct SPARQL searches.

FOUND:
- Egyptian Museum Cairo (EMC) → Q201219 (Egyptian Museum)
- Grand Egyptian Museum (GEM) → Q2583681 (Grand Egyptian Museum)
- Bibliotheca Alexandrina → Q501851 (Bibliotheca Alexandrina)
"""

import yaml
from pathlib import Path
from datetime import datetime, timezone

input_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml")

with open(input_file, 'r', encoding='utf-8') as f:
    institutions = yaml.safe_load(f)
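
# Each record in the input YAML is expected to look roughly like this,
# a sketch inferred from the field names used below (not a documented schema):
#
#   - name: "Egyptian Museum Cairo (EMC)"
#     identifiers:
#       - identifier_scheme: Wikidata
#         identifier_value: Q201219
#         identifier_url: https://www.wikidata.org/wiki/Q201219
#     provenance:
#       wikidata_enrichment: {...}
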
print("="*80)
print("🔍 Adding manually discovered Wikidata identifiers")
print("="*80)
print()

manual_matches = {
    "Egyptian Museum Cairo (EMC)": {
        "qid": "Q201219",
        "label": "Egyptian Museum",
        "method": "Direct SPARQL search",
    },
    "Grand Egyptian Museum (GEM)": {
        "qid": "Q2583681",
        "label": "Grand Egyptian Museum",
        "method": "Direct SPARQL search",
    },
    "Bibliotheca Alexandrina": {
        "qid": "Q501851",
        "label": "Bibliotheca Alexandrina",
        "method": "Direct SPARQL search",
    },
}

added_count = 0
for inst in institutions:
    name = inst.get('name', '')
    if name in manual_matches:
        match = manual_matches[name]

        # Skip records that already carry a Wikidata identifier
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers)
        if not has_wikidata:
            # Add the Wikidata identifier
            new_identifier = {
                'identifier_scheme': 'Wikidata',
                'identifier_value': match['qid'],
                'identifier_url': f"https://www.wikidata.org/wiki/{match['qid']}",
            }
            inst.setdefault('identifiers', []).append(new_identifier)

            # Add a provenance note documenting how the match was found
            inst.setdefault('provenance', {})['wikidata_enrichment'] = {
                'method': match['method'],
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'wikidata_label': match['label'],
                'verified': True,
            }

            print(f"✅ ADDED: {name}")
            print(f"   Q-number: {match['qid']} ({match['label']})")
            print(f"   Method: {match['method']}\n")
            added_count += 1

# Write the updated dataset back in place
with open(input_file, 'w', encoding='utf-8') as f:
    yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)

# Calculate final Wikidata coverage across the dataset
total = len(institutions)
with_wikidata = sum(
    1 for inst in institutions
    if any(i.get('identifier_scheme') == 'Wikidata'
           for i in inst.get('identifiers', []))
)
coverage = (with_wikidata / total * 100) if total > 0 else 0

print("="*80)
print("📊 MANUAL ENRICHMENT RESULTS")
print("="*80)
print(f"✅ Institutions enriched: {added_count}")
print(f"📈 Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})")
print(f"\n💾 Updated: {input_file}")
print("="*80)