glam/scripts/validate_kb_libraries_report.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

496 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Validate KB library entries and generate a comprehensive summary report.
This script:
1. Validates all KB library YAML files for required fields
2. Analyzes enrichment coverage and data quality
3. Generates statistics and a summary report
"""
import os
import yaml
import json
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
# Paths
# NOTE(review): these are hardcoded absolute paths to one developer's machine;
# consider deriving them from an environment variable or a --data-dir argument.
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")  # input: per-library YAML entries
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")  # output: markdown report + JSON stats
def load_kb_library_files():
    """Load and parse all KB library YAML entry files.

    Scans ENTRIES_DIR for files matching ``*_kb_isil.yaml`` (sorted by
    filename) and parses each with ``yaml.safe_load``.

    Returns:
        tuple[list[dict], list[dict]]: ``(entries, errors)``. Each entry
        is the parsed YAML mapping augmented with ``_filepath`` and
        ``_filename`` keys; each error dict has ``file`` and ``error``
        keys for files that failed to load or parse.
    """
    entries = []
    errors = []
    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # An empty or scalar YAML document parses to None/str/etc.;
            # record a clear error instead of failing on item assignment.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a YAML mapping, got {type(data).__name__}"
                )
            data['_filepath'] = str(filepath)
            data['_filename'] = filepath.name
            entries.append(data)
        except Exception as e:
            errors.append({
                'file': str(filepath),
                'error': str(e)
            })
    return entries, errors
def validate_entry(entry):
    """Check one parsed entry for required fields and data quality.

    Args:
        entry: Parsed YAML mapping for a single library.

    Returns:
        dict with keys ``valid`` (bool), ``issues`` (list[str], hard
        failures), ``warnings`` (list[str], quality concerns), and
        ``wikidata_status`` / ``google_status`` (each one of
        ``"success"``, ``"not_found"``, ``"not_attempted"``).
    """
    issues = []
    warnings = []

    # Top-level structure: all three keys must exist.
    for field_name in ['original_entry', 'entry_index', 'kb_enrichment']:
        if field_name not in entry:
            issues.append(f"Missing required field: {field_name}")

    # Source record from the KB spreadsheet.
    if 'original_entry' in entry:
        source = entry['original_entry']
        if not source.get('organisatie'):
            issues.append("Missing organisation name in original_entry")
        if not source.get('isil_code_kb'):
            issues.append("Missing ISIL code in original_entry")

    # KB enrichment block: ISIL is mandatory, name/city are soft checks.
    if 'kb_enrichment' in entry:
        kb_data = entry['kb_enrichment']
        if not kb_data.get('isil_code'):
            issues.append("Missing ISIL code in kb_enrichment")
        if not kb_data.get('name'):
            warnings.append("Missing name in kb_enrichment")
        if not kb_data.get('city'):
            warnings.append("Missing city in kb_enrichment")

    # Wikidata enrichment status.
    if 'wikidata_enrichment' in entry:
        wikidata_status = "success"
        if not entry['wikidata_enrichment'].get('wikidata_entity_id'):
            issues.append("Wikidata enrichment present but missing entity ID")
    elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
        wikidata_status = "not_found"
    else:
        wikidata_status = "not_attempted"

    # Google Maps enrichment status.
    if 'google_maps_enrichment' in entry:
        google_status = "success"
        maps_data = entry['google_maps_enrichment']
        if not maps_data.get('place_id'):
            issues.append("Google Maps enrichment present but missing place_id")
        if not maps_data.get('coordinates'):
            warnings.append("Google Maps enrichment missing coordinates")
    elif entry.get('google_maps_status') == 'NOT_FOUND':
        google_status = "not_found"
    else:
        google_status = "not_attempted"

    return {
        'valid': not issues,
        'issues': issues,
        'warnings': warnings,
        'wikidata_status': wikidata_status,
        'google_status': google_status
    }
def analyze_enrichment_quality(entries):
    """Aggregate enrichment coverage and quality statistics.

    Args:
        entries: List of parsed entry dicts (see load_kb_library_files).

    Returns:
        dict with ``total_entries``, per-source coverage counters under
        ``wikidata`` and ``google_maps`` (counts plus ``match_methods``,
        ``business_statuses`` and ``provinces`` defaultdicts), and
        ``cities`` / ``isil_prefixes`` frequency defaultdicts.
    """
    stats = {
        'total_entries': len(entries),
        'wikidata': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_inception': 0,
            'with_viaf': 0,
            'with_website': 0,
            'match_methods': defaultdict(int)
        },
        'google_maps': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_address': 0,
            'with_phone': 0,
            'with_website': 0,
            'with_opening_hours': 0,
            'with_rating': 0,
            'business_statuses': defaultdict(int),
            'provinces': defaultdict(int)
        },
        'cities': defaultdict(int),
        'isil_prefixes': defaultdict(int)
    }
    for entry in entries:
        # `kb_enrichment` (or its fields) may be present but null in the
        # YAML; coerce to safe defaults instead of crashing on None.
        kb = entry.get('kb_enrichment') or {}

        # City statistics
        city = kb.get('city') or 'Unknown'
        stats['cities'][city] += 1

        # ISIL prefix statistics
        isil = kb.get('isil_code') or ''
        if isil.startswith('NL-'):
            # Extract prefix pattern (first 4 digits after NL-)
            prefix = isil[:7] if len(isil) >= 7 else isil
            stats['isil_prefixes'][prefix] += 1

        # Wikidata statistics
        if 'wikidata_enrichment' in entry:
            stats['wikidata']['success'] += 1
            wd = entry['wikidata_enrichment'] or {}
            if wd.get('wikidata_coordinates'):
                stats['wikidata']['with_coordinates'] += 1
            if wd.get('wikidata_inception'):
                stats['wikidata']['with_inception'] += 1
            identifiers = wd.get('wikidata_identifiers') or {}
            if identifiers.get('VIAF'):
                stats['wikidata']['with_viaf'] += 1
            if identifiers.get('Website'):
                stats['wikidata']['with_website'] += 1
            method = wd.get('match_method', 'unknown')
            stats['wikidata']['match_methods'][method] += 1
        elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
            stats['wikidata']['not_found'] += 1
        else:
            stats['wikidata']['not_attempted'] += 1

        # Google Maps statistics
        if 'google_maps_enrichment' in entry:
            stats['google_maps']['success'] += 1
            gm = entry['google_maps_enrichment'] or {}
            if gm.get('coordinates'):
                stats['google_maps']['with_coordinates'] += 1
            if gm.get('formatted_address'):
                stats['google_maps']['with_address'] += 1
            if gm.get('phone_international') or gm.get('phone_local'):
                stats['google_maps']['with_phone'] += 1
            if gm.get('website'):
                stats['google_maps']['with_website'] += 1
            if gm.get('opening_hours'):
                stats['google_maps']['with_opening_hours'] += 1
            if gm.get('rating'):
                stats['google_maps']['with_rating'] += 1
            status = gm.get('business_status', 'UNKNOWN')
            stats['google_maps']['business_statuses'][status] += 1
            # Extract province from address components
            for component in gm.get('address_components') or []:
                if 'administrative_area_level_1' in component.get('types', []):
                    province = component.get('long_name', 'Unknown')
                    stats['google_maps']['provinces'][province] += 1
                    break
        elif entry.get('google_maps_status') == 'NOT_FOUND':
            stats['google_maps']['not_found'] += 1
        else:
            stats['google_maps']['not_attempted'] += 1
    return stats
def generate_report(entries, validation_results, stats, file_errors):
    """Render the enrichment summary as a markdown document.

    Args:
        entries: Parsed entry dicts (kept for interface compatibility;
            filenames are read from validation_results instead).
        validation_results: Per-entry dicts from validate_entry(),
            augmented with 'entry_index'/'filename' by main().
        stats: Aggregates from analyze_enrichment_quality().
        file_errors: Load failures from load_kb_library_files().

    Returns:
        str: Complete markdown report text.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    # Guard the percentage math: an empty dataset must not divide by zero.
    total = stats['total_entries'] or 1

    # Count validation issues
    valid_count = sum(1 for v in validation_results if v['valid'])
    with_issues = [v for v in validation_results if not v['valid']]
    with_warnings = [v for v in validation_results if v['warnings']]

    report = f"""# KB Netherlands Public Libraries - Enrichment Report

**Generated**: {timestamp}
**Total Entries**: {stats['total_entries']}

## Executive Summary

The KB Netherlands library ISIL data has been successfully integrated and enriched with external data sources.

| Metric | Count | Percentage |
|--------|-------|------------|
| Total KB Library Entries | {stats['total_entries']} | 100% |
| Valid Entries | {valid_count} | {valid_count/total*100:.1f}% |
| Wikidata Enriched | {stats['wikidata']['success']} | {stats['wikidata']['success']/total*100:.1f}% |
| Google Maps Enriched | {stats['google_maps']['success']} | {stats['google_maps']['success']/total*100:.1f}% |

---

## Wikidata Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {stats['wikidata']['success']} | {stats['wikidata']['success']/total*100:.1f}% |
| Not found in Wikidata | {stats['wikidata']['not_found']} | {stats['wikidata']['not_found']/total*100:.1f}% |
| Not attempted | {stats['wikidata']['not_attempted']} | {stats['wikidata']['not_attempted']/total*100:.1f}% |

### Match Methods

| Method | Count |
|--------|-------|
"""
    for method, count in sorted(stats['wikidata']['match_methods'].items(), key=lambda x: -x[1]):
        report += f"| {method} | {count} |\n"

    report += f"""
### Data Completeness (of {stats['wikidata']['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {stats['wikidata']['with_coordinates']} | {stats['wikidata']['with_coordinates']/max(stats['wikidata']['success'],1)*100:.1f}% |
| Inception Date | {stats['wikidata']['with_inception']} | {stats['wikidata']['with_inception']/max(stats['wikidata']['success'],1)*100:.1f}% |
| VIAF ID | {stats['wikidata']['with_viaf']} | {stats['wikidata']['with_viaf']/max(stats['wikidata']['success'],1)*100:.1f}% |
| Website | {stats['wikidata']['with_website']} | {stats['wikidata']['with_website']/max(stats['wikidata']['success'],1)*100:.1f}% |

---

## Google Maps Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {stats['google_maps']['success']} | {stats['google_maps']['success']/total*100:.1f}% |
| Not found | {stats['google_maps']['not_found']} | {stats['google_maps']['not_found']/total*100:.1f}% |
| Not attempted | {stats['google_maps']['not_attempted']} | {stats['google_maps']['not_attempted']/total*100:.1f}% |

### Data Completeness (of {stats['google_maps']['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {stats['google_maps']['with_coordinates']} | {stats['google_maps']['with_coordinates']/max(stats['google_maps']['success'],1)*100:.1f}% |
| Full Address | {stats['google_maps']['with_address']} | {stats['google_maps']['with_address']/max(stats['google_maps']['success'],1)*100:.1f}% |
| Phone Number | {stats['google_maps']['with_phone']} | {stats['google_maps']['with_phone']/max(stats['google_maps']['success'],1)*100:.1f}% |
| Website | {stats['google_maps']['with_website']} | {stats['google_maps']['with_website']/max(stats['google_maps']['success'],1)*100:.1f}% |
| Opening Hours | {stats['google_maps']['with_opening_hours']} | {stats['google_maps']['with_opening_hours']/max(stats['google_maps']['success'],1)*100:.1f}% |
| Rating | {stats['google_maps']['with_rating']} | {stats['google_maps']['with_rating']/max(stats['google_maps']['success'],1)*100:.1f}% |

### Business Status

| Status | Count |
|--------|-------|
"""
    for status, count in sorted(stats['google_maps']['business_statuses'].items(), key=lambda x: -x[1]):
        report += f"| {status} | {count} |\n"

    report += """
### Geographic Distribution by Province

| Province | Count |
|----------|-------|
"""
    for province, count in sorted(stats['google_maps']['provinces'].items(), key=lambda x: -x[1]):
        report += f"| {province} | {count} |\n"

    report += """
---

## Geographic Distribution by City

Top 20 cities with most library entries:

| City | Count |
|------|-------|
"""
    for city, count in sorted(stats['cities'].items(), key=lambda x: -x[1])[:20]:
        report += f"| {city} | {count} |\n"

    report += f"""
---

## Validation Results

### Summary

- **Valid entries**: {valid_count} ({valid_count/total*100:.1f}%)
- **Entries with issues**: {len(with_issues)}
- **Entries with warnings**: {len(with_warnings)}
- **File parsing errors**: {len(file_errors)}
"""
    if with_issues:
        report += """### Entries with Issues

| File | Issues |
|------|--------|
"""
        # BUG FIX: the previous version indexed `entries` with the position
        # in the *filtered* with_issues list (mismatched rows) and then
        # emitted the literal "(unknown)" anyway. Each validation result
        # carries its own 'filename' (set in main()), so use that.
        for v in with_issues[:10]:  # Show first 10
            filename = v.get('filename', 'unknown')
            issues_str = "; ".join(v['issues'])
            report += f"| {filename} | {issues_str} |\n"
        if len(with_issues) > 10:
            report += f"\n*...and {len(with_issues) - 10} more entries with issues*\n"

    if file_errors:
        report += """
### File Parsing Errors

| File | Error |
|------|-------|
"""
        for err in file_errors:
            report += f"| {Path(err['file']).name} | {err['error'][:50]}... |\n"

    report += f"""
---

## Data Sources

1. **KB Netherlands Library Network** (Primary)
   - Source file: `KB_Netherlands_ISIL_2025-04-01.xlsx`
   - URL: https://www.bibliotheeknetwerk.nl/
   - 149 library entries with ISIL codes

2. **Wikidata** (Enrichment)
   - SPARQL endpoint: https://query.wikidata.org/sparql
   - Match methods: ISIL code lookup, fuzzy name matching
   - Coverage: {stats['wikidata']['success']}/{stats['total_entries']} ({stats['wikidata']['success']/total*100:.1f}%)

3. **Google Maps Places API** (Enrichment)
   - API: Places API (New)
   - Coverage: {stats['google_maps']['success']}/{stats['total_entries']} ({stats['google_maps']['success']/total*100:.1f}%)

---

## Files Generated

- Entry files: `data/nde/enriched/entries/{{index}}_kb_isil.yaml` (149 files)
- This report: `reports/kb_libraries_enrichment_report.md`
- Statistics JSON: `reports/kb_libraries_enrichment_stats.json`

---

*Report generated by validate_kb_libraries_report.py*
"""
    return report
def main():
    """Run the full pipeline: load, validate, analyze, report.

    Writes the markdown report and a JSON statistics file into
    REPORTS_DIR, prints progress plus a final summary, and returns the
    statistics dict that was serialized to JSON.
    """
    print("=" * 60)
    print("KB Netherlands Libraries - Validation and Report Generator")
    print("=" * 60)

    # Ensure reports directory exists
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)

    # Load all entries
    print("\n1. Loading KB library entries...")
    entries, file_errors = load_kb_library_files()
    print(f" Loaded: {len(entries)} entries")
    print(f" File errors: {len(file_errors)}")

    # Validate entries
    print("\n2. Validating entries...")
    validation_results = []
    for entry in entries:
        result = validate_entry(entry)
        result['entry_index'] = entry.get('entry_index')
        result['filename'] = entry.get('_filename')
        validation_results.append(result)
    valid_count = sum(1 for v in validation_results if v['valid'])
    print(f" Valid: {valid_count}/{len(entries)}")

    # Analyze enrichment quality
    print("\n3. Analyzing enrichment quality...")
    stats = analyze_enrichment_quality(entries)
    # BUG FIX: an empty entries directory previously caused a
    # ZeroDivisionError in every percentage below; use a safe divisor.
    total = stats['total_entries'] or 1
    print(f" Wikidata enriched: {stats['wikidata']['success']}/{stats['total_entries']}")
    print(f" Google Maps enriched: {stats['google_maps']['success']}/{stats['total_entries']}")
    print(f" Unique cities: {len(stats['cities'])}")
    print(f" Provinces covered: {len(stats['google_maps']['provinces'])}")

    # Generate report
    print("\n4. Generating report...")
    report = generate_report(entries, validation_results, stats, file_errors)
    report_path = REPORTS_DIR / "kb_libraries_enrichment_report.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" Report saved: {report_path}")

    # Save statistics as JSON
    stats_json = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'total_entries': stats['total_entries'],
        'validation': {
            'valid': valid_count,
            'with_issues': len(entries) - valid_count,
            'file_errors': len(file_errors)
        },
        'wikidata': {
            'success': stats['wikidata']['success'],
            'not_found': stats['wikidata']['not_found'],
            'coverage_pct': round(stats['wikidata']['success'] / total * 100, 1),
            'match_methods': dict(stats['wikidata']['match_methods']),
            'with_coordinates': stats['wikidata']['with_coordinates'],
            'with_inception': stats['wikidata']['with_inception'],
            'with_viaf': stats['wikidata']['with_viaf'],
            'with_website': stats['wikidata']['with_website']
        },
        'google_maps': {
            'success': stats['google_maps']['success'],
            'not_found': stats['google_maps']['not_found'],
            'coverage_pct': round(stats['google_maps']['success'] / total * 100, 1),
            'with_coordinates': stats['google_maps']['with_coordinates'],
            'with_address': stats['google_maps']['with_address'],
            'with_phone': stats['google_maps']['with_phone'],
            'with_website': stats['google_maps']['with_website'],
            'with_opening_hours': stats['google_maps']['with_opening_hours'],
            'with_rating': stats['google_maps']['with_rating'],
            'business_statuses': dict(stats['google_maps']['business_statuses']),
            'provinces': dict(stats['google_maps']['provinces'])
        },
        'geographic': {
            'unique_cities': len(stats['cities']),
            'provinces_covered': len(stats['google_maps']['provinces']),
            'top_cities': dict(sorted(stats['cities'].items(), key=lambda x: -x[1])[:20])
        }
    }
    stats_path = REPORTS_DIR / "kb_libraries_enrichment_stats.json"
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats_json, f, indent=2)
    print(f" Stats saved: {stats_path}")

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total KB Library Entries: {stats['total_entries']}")
    print(f"Valid Entries: {valid_count} ({valid_count/total*100:.1f}%)")
    print(f"Wikidata Coverage: {stats['wikidata']['success']} ({stats['wikidata']['success']/total*100:.1f}%)")
    print(f"Google Maps Coverage: {stats['google_maps']['success']} ({stats['google_maps']['success']/total*100:.1f}%)")
    print(f"Unique Cities: {len(stats['cities'])}")
    print(f"Provinces Covered: {len(stats['google_maps']['provinces'])}")
    print("=" * 60)
    return stats_json
# Script entry point: run the full validate-and-report pipeline when
# executed directly (not on import).
if __name__ == "__main__":
    main()