- Implemented a Python script that validates KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation.
- Generated JSON statistics for further analysis.
496 lines
18 KiB
Python
496 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate KB library entries and generate a comprehensive summary report.
|
|
|
|
This script:
|
|
1. Validates all KB library YAML files for required fields
|
|
2. Analyzes enrichment coverage and data quality
|
|
3. Generates statistics and a summary report
|
|
"""
|
|
|
|
import os
|
|
import yaml
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
|
|
# Paths (hard-coded, user-specific absolute locations).
# NOTE(review): consider deriving these from the repository root or an
# environment variable so the script is portable to other machines.
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")
|
|
|
|
def load_kb_library_files(entries_dir=None):
    """Load all KB library YAML files.

    Args:
        entries_dir: Directory containing ``*_kb_isil.yaml`` files.
            Defaults to the module-level ``ENTRIES_DIR``.

    Returns:
        A ``(entries, errors)`` tuple. ``entries`` is a list of parsed
        YAML dicts, each annotated with ``_filepath`` and ``_filename``;
        ``errors`` is a list of ``{'file', 'error'}`` dicts for files
        that failed to load or did not contain a YAML mapping.
    """
    if entries_dir is None:
        entries_dir = ENTRIES_DIR
    entries_dir = Path(entries_dir)

    entries = []
    errors = []

    for filepath in sorted(entries_dir.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # yaml.safe_load returns None for empty files and may return
            # scalars/lists for malformed ones; record those explicitly
            # instead of crashing on the item assignments below.
            if not isinstance(data, dict):
                errors.append({
                    'file': str(filepath),
                    'error': f"Expected a YAML mapping, got {type(data).__name__}"
                })
                continue
            data['_filepath'] = str(filepath)
            data['_filename'] = filepath.name
            entries.append(data)
        except Exception as e:
            errors.append({
                'file': str(filepath),
                'error': str(e)
            })

    return entries, errors
|
|
|
|
|
|
def validate_entry(entry):
    """Validate a single entry for required fields and data quality.

    Args:
        entry: Parsed YAML dict for one KB library entry.

    Returns:
        Dict with:
            valid: True when no blocking issues were found.
            issues: List of blocking problems (missing required data).
            warnings: List of non-blocking data-quality notes.
            wikidata_status: 'success' | 'not_found' | 'not_attempted'.
            google_status: 'success' | 'not_found' | 'not_attempted'.
    """
    issues = []
    warnings = []

    # Required fields
    required_fields = ['original_entry', 'entry_index', 'kb_enrichment']
    for field in required_fields:
        if field not in entry:
            issues.append(f"Missing required field: {field}")

    # Validate original_entry.
    # An empty YAML mapping parses as None; coerce each section to a dict
    # before calling .get() so null sections don't raise AttributeError.
    if 'original_entry' in entry:
        orig = entry['original_entry'] or {}
        if not orig.get('organisatie'):
            issues.append("Missing organisation name in original_entry")
        if not orig.get('isil_code_kb'):
            issues.append("Missing ISIL code in original_entry")

    # Validate KB enrichment
    if 'kb_enrichment' in entry:
        kb = entry['kb_enrichment'] or {}
        if not kb.get('isil_code'):
            issues.append("Missing ISIL code in kb_enrichment")
        if not kb.get('name'):
            warnings.append("Missing name in kb_enrichment")
        if not kb.get('city'):
            warnings.append("Missing city in kb_enrichment")

    # Check Wikidata enrichment
    wikidata_status = "not_attempted"
    if 'wikidata_enrichment' in entry:
        wikidata_status = "success"
        wd = entry['wikidata_enrichment'] or {}
        if not wd.get('wikidata_entity_id'):
            issues.append("Wikidata enrichment present but missing entity ID")
    elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
        wikidata_status = "not_found"

    # Check Google Maps enrichment
    google_status = "not_attempted"
    if 'google_maps_enrichment' in entry:
        google_status = "success"
        gm = entry['google_maps_enrichment'] or {}
        if not gm.get('place_id'):
            issues.append("Google Maps enrichment present but missing place_id")
        if not gm.get('coordinates'):
            warnings.append("Google Maps enrichment missing coordinates")
    elif entry.get('google_maps_status') == 'NOT_FOUND':
        google_status = "not_found"

    return {
        'valid': len(issues) == 0,
        'issues': issues,
        'warnings': warnings,
        'wikidata_status': wikidata_status,
        'google_status': google_status
    }
|
|
|
|
|
|
def analyze_enrichment_quality(entries):
    """Analyze the quality and coverage of enrichments.

    Args:
        entries: List of parsed KB library entry dicts.

    Returns:
        Nested stats dict with per-source coverage counts ('success',
        'not_found', 'not_attempted'), field-completeness counters, and
        geographic distributions (cities, provinces, ISIL prefixes).
        Counter sub-dicts are ``defaultdict(int)``.
    """
    stats = {
        'total_entries': len(entries),
        'wikidata': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_inception': 0,
            'with_viaf': 0,
            'with_website': 0,
            'match_methods': defaultdict(int)
        },
        'google_maps': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_address': 0,
            'with_phone': 0,
            'with_website': 0,
            'with_opening_hours': 0,
            'with_rating': 0,
            'business_statuses': defaultdict(int),
            'provinces': defaultdict(int)
        },
        'cities': defaultdict(int),
        'isil_prefixes': defaultdict(int)
    }

    for entry in entries:
        # A null section (empty YAML mapping) is treated as {} so the
        # nested .get() calls below never hit AttributeError.
        kb = entry.get('kb_enrichment') or {}

        # City statistics (an explicit null city is counted as Unknown)
        city = kb.get('city') or 'Unknown'
        stats['cities'][city] += 1

        # ISIL prefix statistics. `or ''` guards against an explicit null
        # value, which `.get('isil_code', '')` alone would pass through
        # and crash `None.startswith`.
        isil = kb.get('isil_code') or ''
        if isil.startswith('NL-'):
            # Extract prefix pattern (first 4 digits after NL-)
            prefix = isil[:7] if len(isil) >= 7 else isil
            stats['isil_prefixes'][prefix] += 1

        # Wikidata statistics
        if 'wikidata_enrichment' in entry:
            stats['wikidata']['success'] += 1
            wd = entry['wikidata_enrichment'] or {}

            if wd.get('wikidata_coordinates'):
                stats['wikidata']['with_coordinates'] += 1
            if wd.get('wikidata_inception'):
                stats['wikidata']['with_inception'] += 1

            identifiers = wd.get('wikidata_identifiers') or {}
            if identifiers.get('VIAF'):
                stats['wikidata']['with_viaf'] += 1
            if identifiers.get('Website'):
                stats['wikidata']['with_website'] += 1

            method = wd.get('match_method', 'unknown')
            stats['wikidata']['match_methods'][method] += 1

        elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
            stats['wikidata']['not_found'] += 1
        else:
            stats['wikidata']['not_attempted'] += 1

        # Google Maps statistics
        if 'google_maps_enrichment' in entry:
            stats['google_maps']['success'] += 1
            gm = entry['google_maps_enrichment'] or {}

            if gm.get('coordinates'):
                stats['google_maps']['with_coordinates'] += 1
            if gm.get('formatted_address'):
                stats['google_maps']['with_address'] += 1
            if gm.get('phone_international') or gm.get('phone_local'):
                stats['google_maps']['with_phone'] += 1
            if gm.get('website'):
                stats['google_maps']['with_website'] += 1
            if gm.get('opening_hours'):
                stats['google_maps']['with_opening_hours'] += 1
            if gm.get('rating'):
                stats['google_maps']['with_rating'] += 1

            status = gm.get('business_status', 'UNKNOWN')
            stats['google_maps']['business_statuses'][status] += 1

            # Extract province from address components (first
            # administrative_area_level_1 wins)
            for component in gm.get('address_components') or []:
                if 'administrative_area_level_1' in component.get('types', []):
                    province = component.get('long_name', 'Unknown')
                    stats['google_maps']['provinces'][province] += 1
                    break

        elif entry.get('google_maps_status') == 'NOT_FOUND':
            stats['google_maps']['not_found'] += 1
        else:
            stats['google_maps']['not_attempted'] += 1

    return stats
|
|
|
|
|
|
def generate_report(entries, validation_results, stats, file_errors):
    """Generate a comprehensive markdown report.

    Args:
        entries: Parsed entry dicts (context only; may be empty).
        validation_results: Per-entry dicts from validate_entry(); each is
            expected to carry the 'filename' key attached by main().
        stats: Aggregate statistics from analyze_enrichment_quality().
        file_errors: List of {'file', 'error'} dicts for unparseable files.

    Returns:
        The complete markdown report as a single string.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    total = stats['total_entries']
    # Denominator guard: keep percentages defined even for an empty run.
    total_n = max(total, 1)

    # Count validation issues
    valid_count = sum(1 for v in validation_results if v['valid'])
    with_issues = [v for v in validation_results if not v['valid']]
    with_warnings = [v for v in validation_results if v['warnings']]

    report = f"""# KB Netherlands Public Libraries - Enrichment Report

**Generated**: {timestamp}
**Total Entries**: {total}

## Executive Summary

The KB Netherlands library ISIL data has been successfully integrated and enriched with external data sources.

| Metric | Count | Percentage |
|--------|-------|------------|
| Total KB Library Entries | {total} | 100% |
| Valid Entries | {valid_count} | {valid_count/total_n*100:.1f}% |
| Wikidata Enriched | {stats['wikidata']['success']} | {stats['wikidata']['success']/total_n*100:.1f}% |
| Google Maps Enriched | {stats['google_maps']['success']} | {stats['google_maps']['success']/total_n*100:.1f}% |

---

## Wikidata Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {stats['wikidata']['success']} | {stats['wikidata']['success']/total_n*100:.1f}% |
| Not found in Wikidata | {stats['wikidata']['not_found']} | {stats['wikidata']['not_found']/total_n*100:.1f}% |
| Not attempted | {stats['wikidata']['not_attempted']} | {stats['wikidata']['not_attempted']/total_n*100:.1f}% |

### Match Methods

| Method | Count |
|--------|-------|
"""

    for method, count in sorted(stats['wikidata']['match_methods'].items(), key=lambda x: -x[1]):
        report += f"| {method} | {count} |\n"

    # Completeness percentages are relative to the enriched subset;
    # max(...,1) avoids division by zero when nothing was enriched.
    wd_n = max(stats['wikidata']['success'], 1)
    report += f"""
### Data Completeness (of {stats['wikidata']['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {stats['wikidata']['with_coordinates']} | {stats['wikidata']['with_coordinates']/wd_n*100:.1f}% |
| Inception Date | {stats['wikidata']['with_inception']} | {stats['wikidata']['with_inception']/wd_n*100:.1f}% |
| VIAF ID | {stats['wikidata']['with_viaf']} | {stats['wikidata']['with_viaf']/wd_n*100:.1f}% |
| Website | {stats['wikidata']['with_website']} | {stats['wikidata']['with_website']/wd_n*100:.1f}% |

---

## Google Maps Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {stats['google_maps']['success']} | {stats['google_maps']['success']/total_n*100:.1f}% |
| Not found | {stats['google_maps']['not_found']} | {stats['google_maps']['not_found']/total_n*100:.1f}% |
| Not attempted | {stats['google_maps']['not_attempted']} | {stats['google_maps']['not_attempted']/total_n*100:.1f}% |
"""

    gm_n = max(stats['google_maps']['success'], 1)
    report += f"""
### Data Completeness (of {stats['google_maps']['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {stats['google_maps']['with_coordinates']} | {stats['google_maps']['with_coordinates']/gm_n*100:.1f}% |
| Full Address | {stats['google_maps']['with_address']} | {stats['google_maps']['with_address']/gm_n*100:.1f}% |
| Phone Number | {stats['google_maps']['with_phone']} | {stats['google_maps']['with_phone']/gm_n*100:.1f}% |
| Website | {stats['google_maps']['with_website']} | {stats['google_maps']['with_website']/gm_n*100:.1f}% |
| Opening Hours | {stats['google_maps']['with_opening_hours']} | {stats['google_maps']['with_opening_hours']/gm_n*100:.1f}% |
| Rating | {stats['google_maps']['with_rating']} | {stats['google_maps']['with_rating']/gm_n*100:.1f}% |

### Business Status

| Status | Count |
|--------|-------|
"""

    for status, count in sorted(stats['google_maps']['business_statuses'].items(), key=lambda x: -x[1]):
        report += f"| {status} | {count} |\n"

    report += """
### Geographic Distribution by Province

| Province | Count |
|----------|-------|
"""

    for province, count in sorted(stats['google_maps']['provinces'].items(), key=lambda x: -x[1]):
        report += f"| {province} | {count} |\n"

    report += """
---

## Geographic Distribution by City

Top 20 cities with most library entries:

| City | Count |
|------|-------|
"""

    for city, count in sorted(stats['cities'].items(), key=lambda x: -x[1])[:20]:
        report += f"| {city} | {count} |\n"

    report += f"""
---

## Validation Results

### Summary

- **Valid entries**: {valid_count} ({valid_count/total_n*100:.1f}%)
- **Entries with issues**: {len(with_issues)}
- **Entries with warnings**: {len(with_warnings)}
- **File parsing errors**: {len(file_errors)}

"""

    if with_issues:
        report += """### Entries with Issues

| File | Issues |
|------|--------|
"""
        for v in with_issues[:10]:  # Show first 10
            # Bug fix: the filename was previously looked up by positional
            # index into `entries` (the wrong entry) and then discarded,
            # so every row rendered the literal "(unknown)". Use the
            # filename that main() records on the validation result.
            filename = v.get('filename') or 'unknown'
            issues_str = "; ".join(v['issues'])
            report += f"| {filename} | {issues_str} |\n"

        if len(with_issues) > 10:
            report += f"\n*...and {len(with_issues) - 10} more entries with issues*\n"

    if file_errors:
        report += """
### File Parsing Errors

| File | Error |
|------|-------|
"""
        for err in file_errors:
            report += f"| {Path(err['file']).name} | {err['error'][:50]}... |\n"

    report += f"""
---

## Data Sources

1. **KB Netherlands Library Network** (Primary)
   - Source file: `KB_Netherlands_ISIL_2025-04-01.xlsx`
   - URL: https://www.bibliotheeknetwerk.nl/
   - 149 library entries with ISIL codes

2. **Wikidata** (Enrichment)
   - SPARQL endpoint: https://query.wikidata.org/sparql
   - Match methods: ISIL code lookup, fuzzy name matching
   - Coverage: {stats['wikidata']['success']}/{total} ({stats['wikidata']['success']/total_n*100:.1f}%)

3. **Google Maps Places API** (Enrichment)
   - API: Places API (New)
   - Coverage: {stats['google_maps']['success']}/{total} ({stats['google_maps']['success']/total_n*100:.1f}%)

---

## Files Generated

- Entry files: `data/nde/enriched/entries/{{index}}_kb_isil.yaml` (149 files)
- This report: `reports/kb_libraries_enrichment_report.md`
- Statistics JSON: `reports/kb_libraries_enrichment_stats.json`

---

*Report generated by validate_kb_libraries_report.py*
"""

    return report
|
|
|
|
|
|
def main():
    """Run the full pipeline: load entries, validate, analyze, and write
    the markdown report plus a JSON statistics file.

    Returns:
        The statistics dict that was serialized to JSON.
    """
    banner = "=" * 60
    print(banner)
    print("KB Netherlands Libraries - Validation and Report Generator")
    print(banner)

    # Make sure the output directory is present before writing anything.
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)

    # Step 1: read every entry file from disk.
    print("\n1. Loading KB library entries...")
    entries, file_errors = load_kb_library_files()
    print(f" Loaded: {len(entries)} entries")
    print(f" File errors: {len(file_errors)}")

    # Step 2: validate each entry, tagging results with provenance so the
    # report can point back at the source file.
    print("\n2. Validating entries...")

    def _validate_with_origin(entry):
        # One validation result per entry, annotated with index + filename.
        outcome = validate_entry(entry)
        outcome['entry_index'] = entry.get('entry_index')
        outcome['filename'] = entry.get('_filename')
        return outcome

    validation_results = [_validate_with_origin(e) for e in entries]

    valid_count = sum(1 for v in validation_results if v['valid'])
    print(f" Valid: {valid_count}/{len(entries)}")

    # Step 3: aggregate enrichment coverage statistics.
    print("\n3. Analyzing enrichment quality...")
    stats = analyze_enrichment_quality(entries)
    wd = stats['wikidata']
    gm = stats['google_maps']

    print(f" Wikidata enriched: {stats['wikidata']['success']}/{stats['total_entries']}")
    print(f" Google Maps enriched: {stats['google_maps']['success']}/{stats['total_entries']}")
    print(f" Unique cities: {len(stats['cities'])}")
    print(f" Provinces covered: {len(stats['google_maps']['provinces'])}")

    # Step 4: render the markdown report and persist it.
    print("\n4. Generating report...")
    report = generate_report(entries, validation_results, stats, file_errors)

    report_path = REPORTS_DIR / "kb_libraries_enrichment_report.md"
    report_path.write_text(report, encoding='utf-8')
    print(f" Report saved: {report_path}")

    # Persist machine-readable statistics alongside the report.
    stats_json = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'total_entries': stats['total_entries'],
        'validation': {
            'valid': valid_count,
            'with_issues': len(entries) - valid_count,
            'file_errors': len(file_errors)
        },
        'wikidata': {
            'success': wd['success'],
            'not_found': wd['not_found'],
            'coverage_pct': round(wd['success'] / stats['total_entries'] * 100, 1),
            'match_methods': dict(wd['match_methods']),
            'with_coordinates': wd['with_coordinates'],
            'with_inception': wd['with_inception'],
            'with_viaf': wd['with_viaf'],
            'with_website': wd['with_website']
        },
        'google_maps': {
            'success': gm['success'],
            'not_found': gm['not_found'],
            'coverage_pct': round(gm['success'] / stats['total_entries'] * 100, 1),
            'with_coordinates': gm['with_coordinates'],
            'with_address': gm['with_address'],
            'with_phone': gm['with_phone'],
            'with_website': gm['with_website'],
            'with_opening_hours': gm['with_opening_hours'],
            'with_rating': gm['with_rating'],
            'business_statuses': dict(gm['business_statuses']),
            'provinces': dict(gm['provinces'])
        },
        'geographic': {
            'unique_cities': len(stats['cities']),
            'provinces_covered': len(gm['provinces']),
            'top_cities': dict(sorted(stats['cities'].items(), key=lambda c: -c[1])[:20])
        }
    }

    stats_path = REPORTS_DIR / "kb_libraries_enrichment_stats.json"
    with open(stats_path, 'w', encoding='utf-8') as fh:
        json.dump(stats_json, fh, indent=2)
    print(f" Stats saved: {stats_path}")

    # Console summary.
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Total KB Library Entries: {stats['total_entries']}")
    print(f"Valid Entries: {valid_count} ({valid_count/stats['total_entries']*100:.1f}%)")
    print(f"Wikidata Coverage: {wd['success']} ({wd['success']/stats['total_entries']*100:.1f}%)")
    print(f"Google Maps Coverage: {gm['success']} ({gm['success']/stats['total_entries']*100:.1f}%)")
    print(f"Unique Cities: {len(stats['cities'])}")
    print(f"Provinces Covered: {len(gm['provinces'])}")
    print(banner)

    return stats_json
|
|
|
|
|
|
# Entry point guard: run the pipeline only when executed directly as a script.
if __name__ == "__main__":
    main()
|