- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
167 lines
5.8 KiB
Python
167 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Final validation report for enrichment_history backfill project.

Checks ALL institution files for completeness.
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# Root directory that every entry in FILES_TO_CHECK is resolved against.
# NOTE(review): this is an absolute path on one developer's machine - confirm
# whether it should be made configurable before running elsewhere.
BASE_DIR = Path('/Users/kempersc/apps/glam/data/instances')

# Files to check: (path relative to BASE_DIR, label printed in the report).
FILES_TO_CHECK = [
    ('latin_american_institutions_AUTHORITATIVE.yaml', 'Latin America (AUTHORITATIVE)'),
    ('georgia_glam_institutions_enriched.yaml', 'Georgia'),
    ('tunisia/tunisian_institutions.yaml', 'Tunisia'),
    ('algeria/algerian_institutions.yaml', 'Algeria'),
    ('libya/libyan_institutions.yaml', 'Libya'),
    ('brazil/brazilian_institutions_batch6_enriched.yaml', 'Brazil (Batch 6)'),
    ('belgium/be_institutions_enriched_manual.yaml', 'Belgium'),
    ('great_britain/gb_institutions_enriched_manual.yaml', 'Great Britain'),
    ('united_states/us_institutions_enriched_manual.yaml', 'United States'),
]
|
|
|
|
def check_file(filepath: Path, label: str):
    """Check enrichment_history completeness for one institution file.

    Args:
        filepath: Path of the YAML file to inspect.
        label: Display name for the dataset; returned unchanged in the result.

    Returns:
        ``None`` if ``filepath`` does not exist. Otherwise a dict with:

        * ``label`` - the ``label`` argument.
        * ``total`` - number of institution records in the file.
        * ``with_wikidata`` - records having an identifier whose
          ``identifier_scheme`` is ``'Wikidata'``.
        * ``with_enrichment`` - records with a non-empty
          ``provenance.enrichment_history``.
        * ``gap`` - ``with_wikidata - with_enrichment``.
        * ``by_country`` - plain dict mapping the first location's country
          (``'UNKNOWN'`` when absent) to ``total`` / ``wikidata`` /
          ``enrichment`` counters.
    """
    if not filepath.exists():
        return None

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle structure: either a mapping with an 'institutions' key or the
    # record list itself.
    if isinstance(data, dict) and 'institutions' in data:
        institutions = data['institutions']
    else:
        institutions = data
    # An empty YAML document (or a null 'institutions' value) loads as None;
    # report it as a file with zero institutions instead of raising.
    if institutions is None:
        institutions = []

    total = len(institutions)
    with_wikidata = 0
    with_enrichment = 0
    by_country = defaultdict(lambda: {'total': 0, 'wikidata': 0, 'enrichment': 0})

    for inst in institutions:
        # Get country from the first location. `or` also covers keys that are
        # present but null/empty, which `.get(key, default)` lets through.
        locations = inst.get('locations') or []
        country = (locations[0].get('country') or 'UNKNOWN') if locations else 'UNKNOWN'

        by_country[country]['total'] += 1

        # Check Wikidata
        identifiers = inst.get('identifiers') or []
        has_wikidata = any(
            ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers
        )

        if has_wikidata:
            with_wikidata += 1
            by_country[country]['wikidata'] += 1

        # Check enrichment_history
        provenance = inst.get('provenance') or {}
        if provenance.get('enrichment_history'):
            with_enrichment += 1
            by_country[country]['enrichment'] += 1

    return {
        'label': label,
        'total': total,
        'with_wikidata': with_wikidata,
        'with_enrichment': with_enrichment,
        # NOTE(review): this is a difference of two independent counts, so a
        # record with enrichment_history but no Wikidata ID lowers the gap.
        # Confirm that is the intended definition of "gap".
        'gap': with_wikidata - with_enrichment,
        'by_country': dict(by_country)
    }
|
|
|
|
def main():
    """Generate final validation report.

    Runs ``check_file`` on every entry of ``FILES_TO_CHECK`` (resolved against
    ``BASE_DIR``) and prints, in order: a per-file section with a per-country
    breakdown, an overall summary, and a country breakdown aggregated across
    all files. Files that do not exist are reported and skipped. Everything is
    written to stdout; nothing is returned.
    """
    print("=" * 80)
    print("ENRICHMENT HISTORY BACKFILL PROJECT - FINAL VALIDATION REPORT")
    print("=" * 80)
    print()

    results = []
    total_institutions = 0
    total_wikidata = 0
    total_enrichment = 0

    for filename, label in FILES_TO_CHECK:
        filepath = BASE_DIR / filename
        result = check_file(filepath, label)

        # check_file returns None for a missing file.
        if result is None:
            print(f"⚠️ {label}: File not found")
            print()
            continue

        results.append(result)
        total_institutions += result['total']
        total_wikidata += result['with_wikidata']
        total_enrichment += result['with_enrichment']

        status = "✅" if result['gap'] == 0 else "❌"
        print(f"{status} {result['label']}")
        print(f" Total institutions: {result['total']}")
        print(f" With Wikidata IDs: {result['with_wikidata']}")
        print(f" With enrichment_history: {result['with_enrichment']}")
        print(f" Gap: {result['gap']}")

        if result['by_country']:
            print(" By country:")
            for country, stats in sorted(result['by_country'].items()):
                # Only countries with at least one Wikidata-linked record are listed.
                if stats['wikidata'] > 0:
                    gap = stats['wikidata'] - stats['enrichment']
                    status_icon = "✅" if gap == 0 else "❌"
                    print(f" {status_icon} {country}: {stats['wikidata']} Wikidata, "
                          f"{stats['enrichment']} enrichment, gap: {gap}")

        print()

    # Summary
    total_gap = total_wikidata - total_enrichment

    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total institutions processed: {total_institutions}")
    print(f"Total with Wikidata IDs: {total_wikidata}")
    print(f"Total with enrichment_history: {total_enrichment}")
    print(f"Total gap: {total_gap}")
    print()

    if total_gap == 0:
        print("🎉 " + "=" * 76)
        print(" 100% COMPLETENESS ACHIEVED - PROJECT SUCCESS!")
        print(" " + "=" * 76)
        print()
        print(" ✅ All institutions with Wikidata IDs have enrichment_history")
        print(" ✅ Provenance tracking complete across all datasets")
        # Must be an f-string: without the prefix the placeholder text
        # "{total_wikidata}" was printed literally instead of the count.
        print(f" ✅ Data quality metadata documented for {total_wikidata} institutions")
        print()
    else:
        print(f"❌ {total_gap} institutions still missing enrichment_history")
        print()
        for result in results:
            if result['gap'] > 0:
                print(f" - {result['label']}: {result['gap']} institutions")

    # Country breakdown
    print("=" * 80)
    print("BREAKDOWN BY COUNTRY")
    print("=" * 80)

    # Sum the per-file country counters so a country that appears in several
    # files is reported once.
    country_totals = defaultdict(lambda: {'wikidata': 0, 'enrichment': 0})

    for result in results:
        for country, stats in result['by_country'].items():
            country_totals[country]['wikidata'] += stats['wikidata']
            country_totals[country]['enrichment'] += stats['enrichment']

    for country in sorted(country_totals):
        stats = country_totals[country]
        gap = stats['wikidata'] - stats['enrichment']
        status = "✅" if gap == 0 else "❌"

        if stats['wikidata'] > 0:
            print(f"{status} {country}: {stats['wikidata']} Wikidata, "
                  f"{stats['enrichment']} enrichment, gap: {gap}")
|
|
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|