glam/scripts/validate_enrichment_history_targeted.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

149 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Validate enrichment_history completeness for countries with Wikidata enrichments.
Targeted validation to avoid timeout - checks specific countries.
"""
import yaml
from pathlib import Path
from collections import defaultdict
def load_yaml(filepath):
    """Read a YAML file and return its parsed contents.

    Returns [] when the file is missing, unreadable, malformed, or empty,
    printing the error rather than raising — this script is best-effort
    and should keep going past a bad country file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            parsed = yaml.safe_load(fh)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return []
    # An empty document parses to None; normalize to an empty list.
    return parsed if parsed else []
def check_enrichment_history(institutions, country_name):
    """Check enrichment_history completeness for institutions with Wikidata IDs.

    Args:
        institutions: List of institution dicts loaded from YAML. May be
            None or empty (e.g. when the source file failed to load).
        country_name: Country label for this batch. Currently unused; kept
            so the caller-facing signature is unchanged.

    Returns:
        Dict with counters 'total', 'with_wikidata',
        'with_enrichment_history', plus 'missing_enrichment_history': a
        list of {'name', 'id'} records for institutions that have a
        Wikidata identifier but no enrichment_history in provenance.
    """
    stats = {
        'total': 0,
        'with_wikidata': 0,
        'with_enrichment_history': 0,
        'missing_enrichment_history': []
    }
    if not institutions:
        return stats
    for inst in institutions:
        stats['total'] += 1
        # YAML yields None for an empty 'identifiers:' key, so coalesce
        # with `or` — a plain .get default would pass None to any().
        identifiers = inst.get('identifiers') or []
        has_wikidata = any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in identifiers
        )
        if has_wikidata:
            stats['with_wikidata'] += 1
            # Likewise, an empty 'provenance:' key parses to None; .get on
            # None would raise AttributeError, so normalize to {} first.
            provenance = inst.get('provenance') or {}
            enrichment_history = provenance.get('enrichment_history') or []
            if enrichment_history:
                stats['with_enrichment_history'] += 1
            else:
                stats['missing_enrichment_history'].append({
                    'name': inst.get('name', 'UNKNOWN'),
                    'id': inst.get('id', 'UNKNOWN')
                })
    return stats
def main():
    """Validate targeted countries and print a per-country + summary report.

    Iterates a hard-coded list of country YAML files (chosen because they
    are known to contain Wikidata enrichments), tallies how many
    Wikidata-bearing institutions carry enrichment_history, and prints the
    gap per country plus an aggregate summary.
    """
    base_path = Path('/Users/kempersc/apps/glam/data/instances')
    # Countries to check (known to have Wikidata enrichments)
    target_countries = [
        ('chile', 'chile/chilean_institutions.yaml'),
        ('georgia', 'georgia/georgian_institutions.yaml'),
        ('japan', 'japan/japanese_institutions.yaml'),
        ('mexico', 'mexico/mexican_institutions.yaml'),
        ('norway', 'norway/norwegian_institutions.yaml'),
        ('tunisia', 'tunisia/tunisian_institutions.yaml'),
        ('algeria', 'algeria/algerian_institutions.yaml'),
        ('libya', 'libya/libyan_institutions.yaml'),
        ('brazil', 'brazil/brazilian_institutions_batch6_enriched.yaml'),
        ('belgium', 'belgium/be_institutions_enriched_manual.yaml'),
        ('great_britain', 'great_britain/gb_institutions_enriched_manual.yaml'),
        ('united_states', 'united_states/us_institutions_enriched_manual.yaml'),
    ]
    print("=" * 80)
    print("ENRICHMENT HISTORY VALIDATION - TARGETED COUNTRIES")
    print("=" * 80)
    print()
    total_stats = defaultdict(int)
    all_missing = []
    for country_name, filepath in target_countries:
        full_path = base_path / filepath
        if not full_path.exists():
            print(f"⚠️  {country_name.upper()}: File not found - {filepath}")
            print()
            continue
        institutions = load_yaml(full_path)
        stats = check_enrichment_history(institutions, country_name)
        # Aggregate totals
        total_stats['total'] += stats['total']
        total_stats['with_wikidata'] += stats['with_wikidata']
        total_stats['with_enrichment_history'] += stats['with_enrichment_history']
        # Report. BUG FIX: both branches previously produced the empty
        # string, so the per-country pass/fail marker was invisible; use
        # visible markers consistent with the rest of the output.
        gap = stats['with_wikidata'] - stats['with_enrichment_history']
        status = "✅" if gap == 0 else "❌"
        print(f"{status} {country_name.upper()}")
        print(f"   Total institutions: {stats['total']}")
        print(f"   With Wikidata: {stats['with_wikidata']}")
        print(f"   With enrichment_history: {stats['with_enrichment_history']}")
        print(f"   Gap: {gap}")
        if stats['missing_enrichment_history']:
            print(f"   Missing enrichment_history:")
            for missing in stats['missing_enrichment_history'][:5]:  # Show first 5
                print(f"     - {missing['name']}")
            if len(stats['missing_enrichment_history']) > 5:
                print(f"     ... and {len(stats['missing_enrichment_history']) - 5} more")
            all_missing.extend(stats['missing_enrichment_history'])
        print()
    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    total_gap = total_stats['with_wikidata'] - total_stats['with_enrichment_history']
    print(f"Total institutions: {total_stats['total']}")
    print(f"With Wikidata IDs: {total_stats['with_wikidata']}")
    print(f"With enrichment_history: {total_stats['with_enrichment_history']}")
    print(f"Total gap: {total_gap}")
    print()
    if total_gap == 0:
        print("✅ 100% COMPLETENESS ACHIEVED!")
        print("All institutions with Wikidata IDs have enrichment_history.")
    else:
        print(f"{total_gap} institutions still missing enrichment_history")
        print()
        print("Institutions missing enrichment_history:")
        for missing in all_missing[:20]:  # Show first 20
            print(f"  - {missing['name']} ({missing['id']})")
        if len(all_missing) > 20:
            print(f"  ... and {len(all_missing) - 20} more")


if __name__ == '__main__':
    main()