glam/scripts/final_enrichment_validation_report.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Final validation report for enrichment_history backfill project.
Checks ALL institution files for completeness.
"""
import yaml
from pathlib import Path
from collections import defaultdict
# Root directory holding all per-region institution instance files.
BASE_DIR = Path('/Users/kempersc/apps/glam/data/instances')

# (relative path under BASE_DIR, human-readable report label) pairs,
# one per dataset included in the final validation report.
FILES_TO_CHECK = [
    ('latin_american_institutions_AUTHORITATIVE.yaml', 'Latin America (AUTHORITATIVE)'),
    ('georgia_glam_institutions_enriched.yaml', 'Georgia'),
    ('tunisia/tunisian_institutions.yaml', 'Tunisia'),
    ('algeria/algerian_institutions.yaml', 'Algeria'),
    ('libya/libyan_institutions.yaml', 'Libya'),
    ('brazil/brazilian_institutions_batch6_enriched.yaml', 'Brazil (Batch 6)'),
    ('belgium/be_institutions_enriched_manual.yaml', 'Belgium'),
    ('great_britain/gb_institutions_enriched_manual.yaml', 'Great Britain'),
    ('united_states/us_institutions_enriched_manual.yaml', 'United States'),
]
def check_file(filepath: Path, label: str):
    """Check enrichment_history completeness for one institutions YAML file.

    Args:
        filepath: Path to the YAML file to inspect.
        label: Human-readable dataset name, echoed back in the result.

    Returns:
        A dict with keys 'label', 'total', 'with_wikidata', 'with_enrichment',
        'gap' (Wikidata count minus enrichment count) and 'by_country'
        (per-country totals), or None when the file does not exist.
    """
    if not filepath.exists():
        return None
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # An empty YAML file parses to None — treat it as zero institutions
    # instead of crashing on the membership test below.
    if data is None:
        data = []
    # Files either wrap records under an 'institutions' key or are a bare list.
    if isinstance(data, dict) and 'institutions' in data:
        institutions = data['institutions']
    else:
        institutions = data
    total = len(institutions)
    with_wikidata = 0
    with_enrichment = 0
    by_country = defaultdict(lambda: {'total': 0, 'wikidata': 0, 'enrichment': 0})
    for inst in institutions:
        # Country of the first listed location determines the bucket.
        locations = inst.get('locations', [])
        country = locations[0].get('country', 'UNKNOWN') if locations else 'UNKNOWN'
        by_country[country]['total'] += 1
        # Wikidata presence: any identifier entry using the 'Wikidata' scheme.
        # (Loop variable renamed from `id` to avoid shadowing the builtin.)
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(
            ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
        )
        if has_wikidata:
            with_wikidata += 1
            by_country[country]['wikidata'] += 1
        # enrichment_history counts only when present AND non-empty.
        provenance = inst.get('provenance', {})
        if provenance.get('enrichment_history'):
            with_enrichment += 1
            by_country[country]['enrichment'] += 1
    return {
        'label': label,
        'total': total,
        'with_wikidata': with_wikidata,
        'with_enrichment': with_enrichment,
        'gap': with_wikidata - with_enrichment,
        'by_country': dict(by_country),
    }
def main():
    """Generate the final validation report across all configured files.

    Prints per-file stats, a project-wide summary, and a per-country
    breakdown aggregated over every dataset in FILES_TO_CHECK.
    """
    print("=" * 80)
    print("ENRICHMENT HISTORY BACKFILL PROJECT - FINAL VALIDATION REPORT")
    print("=" * 80)
    print()
    results = []
    total_institutions = 0
    total_wikidata = 0
    total_enrichment = 0
    for filename, label in FILES_TO_CHECK:
        filepath = BASE_DIR / filename
        result = check_file(filepath, label)
        if result is None:
            print(f"⚠️ {label}: File not found")
            print()
            continue
        results.append(result)
        total_institutions += result['total']
        total_wikidata += result['with_wikidata']
        total_enrichment += result['with_enrichment']
        # NOTE(review): both branches are empty strings — the status icons
        # (presumably "✅"/"❌") look lost in transit; confirm against the
        # original script before restoring them.
        status = "" if result['gap'] == 0 else ""
        print(f"{status} {result['label']}")
        print(f" Total institutions: {result['total']}")
        print(f" With Wikidata IDs: {result['with_wikidata']}")
        print(f" With enrichment_history: {result['with_enrichment']}")
        print(f" Gap: {result['gap']}")
        if result['by_country']:
            print(" By country:")
            # Only countries with at least one Wikidata ID are reported.
            for country, stats in sorted(result['by_country'].items()):
                if stats['wikidata'] > 0:
                    gap = stats['wikidata'] - stats['enrichment']
                    status_icon = "" if gap == 0 else ""
                    print(f" {status_icon} {country}: {stats['wikidata']} Wikidata, "
                          f"{stats['enrichment']} enrichment, gap: {gap}")
        print()

    # ---- Project-wide summary ----
    total_gap = total_wikidata - total_enrichment
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total institutions processed: {total_institutions}")
    print(f"Total with Wikidata IDs: {total_wikidata}")
    print(f"Total with enrichment_history: {total_enrichment}")
    print(f"Total gap: {total_gap}")
    print()
    if total_gap == 0:
        print("🎉 " + "=" * 76)
        print(" 100% COMPLETENESS ACHIEVED - PROJECT SUCCESS!")
        print(" " + "=" * 76)
        print()
        print(" ✅ All institutions with Wikidata IDs have enrichment_history")
        print(" ✅ Provenance tracking complete across all datasets")
        # Bug fix: this line was missing its f-prefix, so the literal text
        # "{total_wikidata}" was printed instead of the count.
        print(f" ✅ Data quality metadata documented for {total_wikidata} institutions")
        print()
    else:
        print(f"{total_gap} institutions still missing enrichment_history")
        print()
        for result in results:
            if result['gap'] > 0:
                print(f" - {result['label']}: {result['gap']} institutions")

    # ---- Breakdown aggregated by country across all files ----
    print("=" * 80)
    print("BREAKDOWN BY COUNTRY")
    print("=" * 80)
    country_totals = defaultdict(lambda: {'wikidata': 0, 'enrichment': 0})
    for result in results:
        for country, stats in result['by_country'].items():
            country_totals[country]['wikidata'] += stats['wikidata']
            country_totals[country]['enrichment'] += stats['enrichment']
    for country in sorted(country_totals.keys()):
        stats = country_totals[country]
        gap = stats['wikidata'] - stats['enrichment']
        status = "" if gap == 0 else ""
        if stats['wikidata'] > 0:
            print(f"{status} {country}: {stats['wikidata']} Wikidata, "
                  f"{stats['enrichment']} enrichment, gap: {gap}")


if __name__ == '__main__':
    main()