- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
149 lines
5.3 KiB
Python
#!/usr/bin/env python3
"""
Validate enrichment_history completeness for countries with Wikidata enrichments.

Targeted validation to avoid timeout - checks specific countries.
"""

from collections import defaultdict
from pathlib import Path

import yaml
|
def load_yaml(filepath):
    """Parse a YAML file and return its content.

    Returns an empty list when the file is empty, unparseable, or
    unreadable, so callers can always iterate the result.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            parsed = yaml.safe_load(handle)
    except Exception as e:  # best-effort loader: report and carry on
        print(f"Error loading {filepath}: {e}")
        return []
    # yaml.safe_load yields None for an empty document; normalize to [].
    return parsed if parsed else []
|
|
|
|
def check_enrichment_history(institutions, country_name):
    """Check enrichment_history completeness for institutions with Wikidata IDs.

    Args:
        institutions: List of institution dicts loaded from a country YAML
            file (may be None or empty).
        country_name: Country label; currently unused but kept for
            interface compatibility with existing callers.

    Returns:
        dict: Counters ('total', 'with_wikidata', 'with_enrichment_history')
        plus 'missing_enrichment_history', a list of {'name', 'id'} dicts for
        Wikidata-enriched institutions lacking provenance history.
    """
    stats = {
        'total': 0,
        'with_wikidata': 0,
        'with_enrichment_history': 0,
        'missing_enrichment_history': [],
    }

    if not institutions:
        return stats

    for inst in institutions:
        stats['total'] += 1

        # YAML may store an explicit null for these keys, in which case
        # .get() returns None instead of the default — guard with `or`.
        identifiers = inst.get('identifiers') or []
        has_wikidata = any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in identifiers
        )

        if not has_wikidata:
            continue

        stats['with_wikidata'] += 1

        # Check for enrichment_history under the provenance block.
        provenance = inst.get('provenance') or {}
        enrichment_history = provenance.get('enrichment_history') or []

        if enrichment_history:
            stats['with_enrichment_history'] += 1
        else:
            stats['missing_enrichment_history'].append({
                'name': inst.get('name', 'UNKNOWN'),
                'id': inst.get('id', 'UNKNOWN'),
            })

    return stats
|
|
|
|
def main(base_path='/Users/kempersc/apps/glam/data/instances'):
    """Validate enrichment_history completeness for a fixed set of countries.

    Args:
        base_path: Root directory containing per-country instance YAML files.
            Parameterized (with the original hard-coded default) so the check
            can be pointed at another checkout or a test fixture.

    Returns:
        int: Total number of institutions that have a Wikidata ID but no
        enrichment_history entry across all checked countries; 0 means
        100% completeness.
    """
    base_path = Path(base_path)

    # Countries to check (known to have Wikidata enrichments)
    target_countries = [
        ('chile', 'chile/chilean_institutions.yaml'),
        ('georgia', 'georgia/georgian_institutions.yaml'),
        ('japan', 'japan/japanese_institutions.yaml'),
        ('mexico', 'mexico/mexican_institutions.yaml'),
        ('norway', 'norway/norwegian_institutions.yaml'),
        ('tunisia', 'tunisia/tunisian_institutions.yaml'),
        ('algeria', 'algeria/algerian_institutions.yaml'),
        ('libya', 'libya/libyan_institutions.yaml'),
        ('brazil', 'brazil/brazilian_institutions_batch6_enriched.yaml'),
        ('belgium', 'belgium/be_institutions_enriched_manual.yaml'),
        ('great_britain', 'great_britain/gb_institutions_enriched_manual.yaml'),
        ('united_states', 'united_states/us_institutions_enriched_manual.yaml'),
    ]

    print("=" * 80)
    print("ENRICHMENT HISTORY VALIDATION - TARGETED COUNTRIES")
    print("=" * 80)
    print()

    total_stats = defaultdict(int)
    all_missing = []

    for country_name, filepath in target_countries:
        full_path = base_path / filepath

        if not full_path.exists():
            print(f"⚠️ {country_name.upper()}: File not found - {filepath}")
            print()
            continue

        institutions = load_yaml(full_path)
        stats = check_enrichment_history(institutions, country_name)

        # Aggregate totals
        total_stats['total'] += stats['total']
        total_stats['with_wikidata'] += stats['with_wikidata']
        total_stats['with_enrichment_history'] += stats['with_enrichment_history']

        # Per-country report: gap == 0 means every Wikidata-enriched
        # institution carries provenance history.
        gap = stats['with_wikidata'] - stats['with_enrichment_history']
        status = "✅" if gap == 0 else "❌"

        print(f"{status} {country_name.upper()}")
        print(f"   Total institutions: {stats['total']}")
        print(f"   With Wikidata: {stats['with_wikidata']}")
        print(f"   With enrichment_history: {stats['with_enrichment_history']}")
        print(f"   Gap: {gap}")

        if stats['missing_enrichment_history']:
            print("   Missing enrichment_history:")
            for missing in stats['missing_enrichment_history'][:5]:  # Show first 5
                print(f"      - {missing['name']}")
            if len(stats['missing_enrichment_history']) > 5:
                print(f"      ... and {len(stats['missing_enrichment_history']) - 5} more")
            all_missing.extend(stats['missing_enrichment_history'])

        print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    total_gap = total_stats['with_wikidata'] - total_stats['with_enrichment_history']
    print(f"Total institutions: {total_stats['total']}")
    print(f"With Wikidata IDs: {total_stats['with_wikidata']}")
    print(f"With enrichment_history: {total_stats['with_enrichment_history']}")
    print(f"Total gap: {total_gap}")
    print()

    if total_gap == 0:
        print("✅ 100% COMPLETENESS ACHIEVED!")
        print("All institutions with Wikidata IDs have enrichment_history.")
    else:
        print(f"❌ {total_gap} institutions still missing enrichment_history")
        print()
        print("Institutions missing enrichment_history:")
        for missing in all_missing[:20]:  # Show first 20
            print(f"   - {missing['name']} ({missing['id']})")
        if len(all_missing) > 20:
            print(f"   ... and {len(all_missing) - 20} more")

    return total_gap
|
# Run the validation only when executed as a script (not on import).
if __name__ == '__main__':
    main()
|