- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
120 lines
3.8 KiB
Python
120 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Verify Phase 1 Wikidata enrichment coverage.

Phase 1 countries:
- Georgia (GE)
- Great Britain (GB)
- Belgium (BE)
- United States (US)
- Luxembourg (LU)
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
def verify_phase1_enrichment(unified_path=None):
    """Verify Wikidata coverage for Phase 1 countries.

    Loads the unified institution dataset (a YAML list of institution
    records), tallies per-country totals for the Phase 1 countries, prints a
    coverage report to stdout, and returns the collected statistics.

    An institution is attributed to the country of its first location;
    records without any location are skipped. An institution counts as
    enriched only when it has an identifier whose ``identifier_scheme`` is
    ``'Wikidata'`` and whose ``identifier_value`` is non-empty, so the
    per-country counts always agree with the "Missing Wikidata" listing.

    Args:
        unified_path: Optional path (``str`` or ``Path``) to the unified YAML
            dataset. Defaults to
            ``data/instances/all/globalglam-20251111.yaml``.

    Returns:
        A ``defaultdict`` keyed by country code. Each value is a dict with
        ``'total'`` (int), ``'with_wikidata'`` (int) and ``'institutions'``
        (a list of ``{'name', 'id', 'wikidata'}`` dicts, where ``'wikidata'``
        is ``None`` for institutions lacking a Wikidata identifier).

    Raises:
        OSError: If the dataset file cannot be opened.
    """
    if unified_path is None:
        unified_path = Path('data/instances/all/globalglam-20251111.yaml')

    # Load unified dataset
    with open(unified_path, 'r', encoding='utf-8') as f:
        # An empty YAML document parses to None; treat it as "no institutions".
        unified = yaml.safe_load(f) or []

    print(f"Loaded {len(unified)} institutions from unified dataset\n")

    # Phase 1 countries
    phase1_countries = ['GE', 'GB', 'BE', 'US', 'LU']

    # Statistics by country; entries are created lazily on first access.
    stats = defaultdict(
        lambda: {'total': 0, 'with_wikidata': 0, 'institutions': []}
    )

    for inst in unified:
        # `or []` also covers keys that are present but null in the YAML.
        locations = inst.get('locations') or []
        if not locations:
            continue

        country = locations[0].get('country')
        if country not in phase1_countries:
            continue

        country_stats = stats[country]
        country_stats['total'] += 1

        # Single pass: first Wikidata identifier that actually has a value.
        wikidata_id = next(
            (
                ident['identifier_value']
                for ident in inst.get('identifiers') or []
                if ident.get('identifier_scheme') == 'Wikidata'
                and ident.get('identifier_value')
            ),
            None,
        )

        if wikidata_id is not None:
            country_stats['with_wikidata'] += 1

        country_stats['institutions'].append({
            'name': inst.get('name'),
            'id': inst.get('id'),
            'wikidata': wikidata_id,
        })

    # Print detailed results
    print("=" * 70)
    print("PHASE 1 WIKIDATA ENRICHMENT VERIFICATION")
    print("=" * 70)
    print()

    total_institutions = 0
    total_with_wikidata = 0

    for country in sorted(phase1_countries):
        # Looking up a country with no records creates an empty entry, so the
        # returned mapping always contains every Phase 1 country.
        country_stats = stats[country]
        total = country_stats['total']
        with_wikidata = country_stats['with_wikidata']
        percentage = (with_wikidata / total * 100) if total > 0 else 0

        total_institutions += total
        total_with_wikidata += with_wikidata

        # Compare the integer counts rather than the float percentage; a
        # country with no institutions is reported as not complete.
        complete = total > 0 and with_wikidata == total
        status = "✅" if complete else "❌"

        print(f"{status} {country}: {with_wikidata}/{total} with Wikidata ({percentage:.1f}%)")

        # Show institutions without Wikidata
        missing = [
            entry for entry in country_stats['institutions']
            if entry['wikidata'] is None
        ]
        if missing:
            print(" Missing Wikidata:")
            for entry in missing:
                print(f" - {entry['name']}")
                print(f" {entry['id']}")
        print()

    # Overall summary
    overall_percentage = (
        (total_with_wikidata / total_institutions * 100)
        if total_institutions > 0 else 0
    )
    overall_complete = (
        total_institutions > 0 and total_with_wikidata == total_institutions
    )
    overall_status = "✅" if overall_complete else "❌"

    print("=" * 70)
    print(f"{overall_status} PHASE 1 TOTAL: {total_with_wikidata}/{total_institutions} with Wikidata ({overall_percentage:.1f}%)")
    print("=" * 70)

    if overall_complete:
        print("\n🎉 Phase 1 enrichment complete! All institutions have Wikidata identifiers.")
    else:
        print(f"\n⚠ Phase 1 incomplete: {total_institutions - total_with_wikidata} institutions need Wikidata enrichment.")

    return stats
|
|
|
|
# Script entry point: run the Phase 1 coverage report when executed directly.
if __name__ == "__main__":
    verify_phase1_enrichment()
|