glam/scripts/verify_phase1_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

120 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""
Verify Phase 1 Wikidata enrichment coverage.
Phase 1 countries:
- Georgia (GE)
- Great Britain (GB)
- Belgium (BE)
- United States (US)
- Luxembourg (LU)
"""
import yaml
from pathlib import Path
from collections import defaultdict
def verify_phase1_enrichment(unified_path='data/instances/all/globalglam-20251111.yaml'):
    """Verify Wikidata identifier coverage for Phase 1 countries.

    Loads the unified institution dataset (a YAML list of institution
    records), tallies how many institutions per Phase 1 country carry a
    Wikidata identifier, prints a per-country and overall report, and
    returns the collected statistics.

    Args:
        unified_path: Path to the unified YAML dataset. Defaults to the
            Phase 1 snapshot this script was written against, so existing
            callers are unaffected.

    Returns:
        defaultdict mapping country code to
        {'total': int, 'with_wikidata': int, 'institutions': [dict, ...]},
        where each institution dict has 'name', 'id', and 'wikidata'
        (the Wikidata identifier value, or None if absent).
    """
    # Load unified dataset
    unified_path = Path(unified_path)
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f)
    print(f"Loaded {len(unified)} institutions from unified dataset\n")

    # Phase 1 countries
    phase1_countries = ['GE', 'GB', 'BE', 'US', 'LU']

    # Statistics by country (module-level factory so the defaultdict is
    # picklable, matching the original's intent of avoiding a lambda).
    def make_stats_dict():
        return {'total': 0, 'with_wikidata': 0, 'institutions': []}

    stats = defaultdict(make_stats_dict)

    for inst in unified:
        locations = inst.get('locations', [])
        if not locations:
            continue
        country = locations[0].get('country')
        if country not in phase1_countries:
            continue
        stats[country]['total'] += 1

        # Single pass over identifiers for the first Wikidata entry
        # (the original scanned twice with any() + next(), shadowing the
        # builtin `id` in both comprehensions).
        wikidata_ident = next(
            (ident for ident in inst.get('identifiers', [])
             if ident.get('identifier_scheme') == 'Wikidata'),
            None,
        )
        if wikidata_ident is not None:
            # Counted as enriched even if 'identifier_value' is missing,
            # matching the original branch logic.
            stats[country]['with_wikidata'] += 1
        stats[country]['institutions'].append({
            'name': inst.get('name'),
            'id': inst.get('id'),
            'wikidata': wikidata_ident.get('identifier_value') if wikidata_ident else None,
        })

    # Print detailed results
    print("=" * 70)
    print("PHASE 1 WIKIDATA ENRICHMENT VERIFICATION")
    print("=" * 70)
    print()

    total_institutions = 0
    total_with_wikidata = 0
    for country in sorted(phase1_countries):
        country_stats = stats[country]
        total = country_stats['total']
        with_wikidata = country_stats['with_wikidata']
        # Exact 100% check is safe: with_wikidata == total gives exactly 100.0
        percentage = (with_wikidata / total * 100) if total > 0 else 0
        total_institutions += total
        total_with_wikidata += with_wikidata
        # NOTE(review): both status markers are empty strings in the source —
        # they look like stripped ✓/✗ glyphs; confirm against the original file.
        status = "" if percentage == 100 else ""
        print(f"{status} {country}: {with_wikidata}/{total} with Wikidata ({percentage:.1f}%)")

        # Show institutions without Wikidata
        missing = [entry for entry in country_stats['institutions'] if entry['wikidata'] is None]
        if missing:
            print(f"  Missing Wikidata:")
            for entry in missing:
                print(f"    - {entry['name']}")
                print(f"      {entry['id']}")
        print()

    # Overall summary
    overall_percentage = (total_with_wikidata / total_institutions * 100) if total_institutions > 0 else 0
    # NOTE(review): same stripped-glyph suspicion as the per-country status.
    overall_status = "" if overall_percentage == 100 else ""
    print("=" * 70)
    print(f"{overall_status} PHASE 1 TOTAL: {total_with_wikidata}/{total_institutions} with Wikidata ({overall_percentage:.1f}%)")
    print("=" * 70)
    if overall_percentage == 100:
        print("\n🎉 Phase 1 enrichment complete! All institutions have Wikidata identifiers.")
    else:
        print(f"\n⚠ Phase 1 incomplete: {total_institutions - total_with_wikidata} institutions need Wikidata enrichment.")

    return stats
# Script entry point: run the verification report when executed directly.
if __name__ == '__main__':
    verify_phase1_enrichment()