#!/usr/bin/env python3
"""
Validate KB library entries and generate a comprehensive summary report.

This script:
1. Validates all KB library YAML files for required fields
2. Analyzes enrichment coverage and data quality
3. Generates statistics and a summary report
"""

import os
import json
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict

# Paths
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")


def _pct(numerator, denominator):
    """Return numerator/denominator as a percentage; 0.0 when denominator is 0.

    Centralizes the division guard so an empty entry set cannot raise
    ZeroDivisionError anywhere in the report/summary code.
    """
    return numerator / denominator * 100 if denominator else 0.0


def load_kb_library_files():
    """Load all KB library YAML files.

    Returns:
        (entries, errors): ``entries`` is a list of parsed YAML dicts, each
        augmented with '_filepath' and '_filename'; ``errors`` is a list of
        {'file': ..., 'error': ...} dicts for files that failed to load.
    """
    # Imported lazily so the validation/statistics functions in this module
    # can be used even when PyYAML is not installed.
    import yaml

    entries = []
    errors = []
    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            data['_filepath'] = str(filepath)
            data['_filename'] = filepath.name
            entries.append(data)
        except Exception as e:
            # Best-effort loader: record the failure and keep going so one
            # bad file does not abort the whole report.
            errors.append({
                'file': str(filepath),
                'error': str(e)
            })
    return entries, errors


def validate_entry(entry):
    """Validate a single entry for required fields and data quality.

    Args:
        entry: one parsed KB library YAML dict.

    Returns:
        dict with keys 'valid' (bool), 'issues' (blocking problems),
        'warnings' (non-blocking gaps), 'wikidata_status' and
        'google_status' (each one of "success"/"not_found"/"not_attempted").
    """
    issues = []
    warnings = []

    # Required top-level fields
    required_fields = ['original_entry', 'entry_index', 'kb_enrichment']
    for field in required_fields:
        if field not in entry:
            issues.append(f"Missing required field: {field}")

    # Validate original_entry (guard against an explicit null in the YAML)
    if 'original_entry' in entry:
        orig = entry['original_entry'] or {}
        if not orig.get('organisatie'):
            issues.append("Missing organisation name in original_entry")
        if not orig.get('isil_code_kb'):
            issues.append("Missing ISIL code in original_entry")

    # Validate KB enrichment
    if 'kb_enrichment' in entry:
        kb = entry['kb_enrichment'] or {}
        if not kb.get('isil_code'):
            issues.append("Missing ISIL code in kb_enrichment")
        if not kb.get('name'):
            warnings.append("Missing name in kb_enrichment")
        if not kb.get('city'):
            warnings.append("Missing city in kb_enrichment")

    # Check Wikidata enrichment
    wikidata_status = "not_attempted"
    if 'wikidata_enrichment' in entry:
        wikidata_status = "success"
        wd = entry['wikidata_enrichment'] or {}
        if not wd.get('wikidata_entity_id'):
            issues.append("Wikidata enrichment present but missing entity ID")
    elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
        wikidata_status = "not_found"

    # Check Google Maps enrichment
    google_status = "not_attempted"
    if 'google_maps_enrichment' in entry:
        google_status = "success"
        gm = entry['google_maps_enrichment'] or {}
        if not gm.get('place_id'):
            issues.append("Google Maps enrichment present but missing place_id")
        if not gm.get('coordinates'):
            warnings.append("Google Maps enrichment missing coordinates")
    elif entry.get('google_maps_status') == 'NOT_FOUND':
        google_status = "not_found"

    return {
        'valid': len(issues) == 0,
        'issues': issues,
        'warnings': warnings,
        'wikidata_status': wikidata_status,
        'google_status': google_status
    }


def analyze_enrichment_quality(entries):
    """Analyze the quality and coverage of enrichments.

    Args:
        entries: list of parsed KB library entry dicts.

    Returns:
        Nested stats dict: total count, Wikidata and Google Maps coverage
        and per-field completeness counters, plus city / ISIL-prefix /
        province distributions (defaultdict counters).
    """
    stats = {
        'total_entries': len(entries),
        'wikidata': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_inception': 0,
            'with_viaf': 0,
            'with_website': 0,
            'match_methods': defaultdict(int)
        },
        'google_maps': {
            'success': 0,
            'not_found': 0,
            'not_attempted': 0,
            'with_coordinates': 0,
            'with_address': 0,
            'with_phone': 0,
            'with_website': 0,
            'with_opening_hours': 0,
            'with_rating': 0,
            'business_statuses': defaultdict(int),
            'provinces': defaultdict(int)
        },
        'cities': defaultdict(int),
        'isil_prefixes': defaultdict(int)
    }

    for entry in entries:
        kb = entry.get('kb_enrichment') or {}

        # City statistics (a null city collapses into 'Unknown')
        city = kb.get('city') or 'Unknown'
        stats['cities'][city] += 1

        # ISIL prefix statistics; slicing is safe even for short codes.
        # Prefix pattern = "NL-" plus the first 4 digits.
        isil = kb.get('isil_code') or ''
        if isil.startswith('NL-'):
            stats['isil_prefixes'][isil[:7]] += 1

        # Wikidata statistics
        if 'wikidata_enrichment' in entry:
            stats['wikidata']['success'] += 1
            wd = entry['wikidata_enrichment'] or {}
            if wd.get('wikidata_coordinates'):
                stats['wikidata']['with_coordinates'] += 1
            if wd.get('wikidata_inception'):
                stats['wikidata']['with_inception'] += 1
            identifiers = wd.get('wikidata_identifiers') or {}
            if identifiers.get('VIAF'):
                stats['wikidata']['with_viaf'] += 1
            if identifiers.get('Website'):
                stats['wikidata']['with_website'] += 1
            method = wd.get('match_method', 'unknown')
            stats['wikidata']['match_methods'][method] += 1
        elif entry.get('wikidata_enrichment_status') == 'NOT_FOUND':
            stats['wikidata']['not_found'] += 1
        else:
            stats['wikidata']['not_attempted'] += 1

        # Google Maps statistics
        if 'google_maps_enrichment' in entry:
            stats['google_maps']['success'] += 1
            gm = entry['google_maps_enrichment'] or {}
            if gm.get('coordinates'):
                stats['google_maps']['with_coordinates'] += 1
            if gm.get('formatted_address'):
                stats['google_maps']['with_address'] += 1
            if gm.get('phone_international') or gm.get('phone_local'):
                stats['google_maps']['with_phone'] += 1
            if gm.get('website'):
                stats['google_maps']['with_website'] += 1
            if gm.get('opening_hours'):
                stats['google_maps']['with_opening_hours'] += 1
            if gm.get('rating'):
                stats['google_maps']['with_rating'] += 1
            status = gm.get('business_status', 'UNKNOWN')
            stats['google_maps']['business_statuses'][status] += 1

            # Extract province (first administrative_area_level_1 component)
            for component in gm.get('address_components') or []:
                if 'administrative_area_level_1' in component.get('types', []):
                    province = component.get('long_name', 'Unknown')
                    stats['google_maps']['provinces'][province] += 1
                    break
        elif entry.get('google_maps_status') == 'NOT_FOUND':
            stats['google_maps']['not_found'] += 1
        else:
            stats['google_maps']['not_attempted'] += 1

    return stats


def generate_report(entries, validation_results, stats, file_errors):
    """Generate a comprehensive markdown report.

    Args:
        entries: list of entry dicts (used only for context; filenames are
            read from ``validation_results``).
        validation_results: per-entry dicts from ``validate_entry`` with
            'filename' and 'entry_index' attached by the caller.
        stats: output of ``analyze_enrichment_quality``.
        file_errors: list of {'file', 'error'} parse failures.

    Returns:
        The full report as a markdown string.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    total = stats['total_entries']
    wd = stats['wikidata']
    gm = stats['google_maps']

    # Count validation issues
    valid_count = sum(1 for v in validation_results if v['valid'])
    with_issues = [v for v in validation_results if not v['valid']]
    with_warnings = [v for v in validation_results if v['warnings']]

    report = f"""# KB Netherlands Public Libraries - Enrichment Report

**Generated**: {timestamp}
**Total Entries**: {total}

## Executive Summary

The KB Netherlands library ISIL data has been successfully integrated and enriched with external data sources.

| Metric | Count | Percentage |
|--------|-------|------------|
| Total KB Library Entries | {total} | 100% |
| Valid Entries | {valid_count} | {_pct(valid_count, total):.1f}% |
| Wikidata Enriched | {wd['success']} | {_pct(wd['success'], total):.1f}% |
| Google Maps Enriched | {gm['success']} | {_pct(gm['success'], total):.1f}% |

---

## Wikidata Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {wd['success']} | {_pct(wd['success'], total):.1f}% |
| Not found in Wikidata | {wd['not_found']} | {_pct(wd['not_found'], total):.1f}% |
| Not attempted | {wd['not_attempted']} | {_pct(wd['not_attempted'], total):.1f}% |

### Match Methods

| Method | Count |
|--------|-------|
"""
    for method, count in sorted(wd['match_methods'].items(), key=lambda x: -x[1]):
        report += f"| {method} | {count} |\n"

    # Completeness percentages are relative to the enriched subset.
    wd_base = max(wd['success'], 1)
    report += f"""
### Data Completeness (of {wd['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {wd['with_coordinates']} | {_pct(wd['with_coordinates'], wd_base):.1f}% |
| Inception Date | {wd['with_inception']} | {_pct(wd['with_inception'], wd_base):.1f}% |
| VIAF ID | {wd['with_viaf']} | {_pct(wd['with_viaf'], wd_base):.1f}% |
| Website | {wd['with_website']} | {_pct(wd['with_website'], wd_base):.1f}% |

---

## Google Maps Enrichment

### Coverage

| Status | Count | Percentage |
|--------|-------|------------|
| Successfully enriched | {gm['success']} | {_pct(gm['success'], total):.1f}% |
| Not found | {gm['not_found']} | {_pct(gm['not_found'], total):.1f}% |
| Not attempted | {gm['not_attempted']} | {_pct(gm['not_attempted'], total):.1f}% |
"""

    gm_base = max(gm['success'], 1)
    report += f"""
### Data Completeness (of {gm['success']} enriched)

| Field | Count | Percentage |
|-------|-------|------------|
| Coordinates | {gm['with_coordinates']} | {_pct(gm['with_coordinates'], gm_base):.1f}% |
| Full Address | {gm['with_address']} | {_pct(gm['with_address'], gm_base):.1f}% |
| Phone Number | {gm['with_phone']} | {_pct(gm['with_phone'], gm_base):.1f}% |
| Website | {gm['with_website']} | {_pct(gm['with_website'], gm_base):.1f}% |
| Opening Hours | {gm['with_opening_hours']} | {_pct(gm['with_opening_hours'], gm_base):.1f}% |
| Rating | {gm['with_rating']} | {_pct(gm['with_rating'], gm_base):.1f}% |

### Business Status

| Status | Count |
|--------|-------|
"""
    for status, count in sorted(gm['business_statuses'].items(), key=lambda x: -x[1]):
        report += f"| {status} | {count} |\n"

    report += """
### Geographic Distribution by Province

| Province | Count |
|----------|-------|
"""
    for province, count in sorted(gm['provinces'].items(), key=lambda x: -x[1]):
        report += f"| {province} | {count} |\n"

    report += """
---

## Geographic Distribution by City

Top 20 cities with most library entries:

| City | Count |
|------|-------|
"""
    for city, count in sorted(stats['cities'].items(), key=lambda x: -x[1])[:20]:
        report += f"| {city} | {count} |\n"

    report += f"""
---

## Validation Results

### Summary

- **Valid entries**: {valid_count} ({_pct(valid_count, total):.1f}%)
- **Entries with issues**: {len(with_issues)}
- **Entries with warnings**: {len(with_warnings)}
- **File parsing errors**: {len(file_errors)}
"""

    if with_issues:
        report += """### Entries with Issues

| File | Issues |
|------|--------|
"""
        for v in with_issues[:10]:  # Show first 10
            # The filename travels on the validation result itself; the old
            # code indexed `entries` with the position in the *filtered*
            # list (wrong entry) and then printed a literal placeholder.
            filename = v.get('filename') or 'unknown'
            issues_str = "; ".join(v['issues'])
            report += f"| {filename} | {issues_str} |\n"
        if len(with_issues) > 10:
            report += f"\n*...and {len(with_issues) - 10} more entries with issues*\n"

    if file_errors:
        report += """
### File Parsing Errors

| File | Error |
|------|-------|
"""
        for err in file_errors:
            # Truncate long messages; only add an ellipsis when truncating.
            msg = err['error']
            if len(msg) > 50:
                msg = msg[:50] + "..."
            report += f"| {Path(err['file']).name} | {msg} |\n"

    report += f"""
---

## Data Sources

1. **KB Netherlands Library Network** (Primary)
   - Source file: `KB_Netherlands_ISIL_2025-04-01.xlsx`
   - URL: https://www.bibliotheeknetwerk.nl/
   - 149 library entries with ISIL codes

2. **Wikidata** (Enrichment)
   - SPARQL endpoint: https://query.wikidata.org/sparql
   - Match methods: ISIL code lookup, fuzzy name matching
   - Coverage: {wd['success']}/{total} ({_pct(wd['success'], total):.1f}%)

3. **Google Maps Places API** (Enrichment)
   - API: Places API (New)
   - Coverage: {gm['success']}/{total} ({_pct(gm['success'], total):.1f}%)

---

## Files Generated

- Entry files: `data/nde/enriched/entries/{{index}}_kb_isil.yaml` (149 files)
- This report: `reports/kb_libraries_enrichment_report.md`
- Statistics JSON: `reports/kb_libraries_enrichment_stats.json`

---

*Report generated by validate_kb_libraries_report.py*
"""
    return report


def main():
    """Run the full pipeline: load, validate, analyze, report, save JSON."""
    print("=" * 60)
    print("KB Netherlands Libraries - Validation and Report Generator")
    print("=" * 60)

    # Ensure reports directory exists
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)

    # Load all entries
    print("\n1. Loading KB library entries...")
    entries, file_errors = load_kb_library_files()
    print(f"   Loaded: {len(entries)} entries")
    print(f"   File errors: {len(file_errors)}")

    # Validate entries
    print("\n2. Validating entries...")
    validation_results = []
    for entry in entries:
        result = validate_entry(entry)
        result['entry_index'] = entry.get('entry_index')
        result['filename'] = entry.get('_filename')
        validation_results.append(result)
    valid_count = sum(1 for v in validation_results if v['valid'])
    print(f"   Valid: {valid_count}/{len(entries)}")

    # Analyze enrichment quality
    print("\n3. Analyzing enrichment quality...")
    stats = analyze_enrichment_quality(entries)
    print(f"   Wikidata enriched: {stats['wikidata']['success']}/{stats['total_entries']}")
    print(f"   Google Maps enriched: {stats['google_maps']['success']}/{stats['total_entries']}")
    print(f"   Unique cities: {len(stats['cities'])}")
    print(f"   Provinces covered: {len(stats['google_maps']['provinces'])}")

    # Generate report
    print("\n4. Generating report...")
    report = generate_report(entries, validation_results, stats, file_errors)
    report_path = REPORTS_DIR / "kb_libraries_enrichment_report.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"   Report saved: {report_path}")

    # Save statistics as JSON
    total = stats['total_entries']
    stats_json = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'total_entries': total,
        'validation': {
            'valid': valid_count,
            'with_issues': len(entries) - valid_count,
            'file_errors': len(file_errors)
        },
        'wikidata': {
            'success': stats['wikidata']['success'],
            'not_found': stats['wikidata']['not_found'],
            'coverage_pct': round(_pct(stats['wikidata']['success'], total), 1),
            'match_methods': dict(stats['wikidata']['match_methods']),
            'with_coordinates': stats['wikidata']['with_coordinates'],
            'with_inception': stats['wikidata']['with_inception'],
            'with_viaf': stats['wikidata']['with_viaf'],
            'with_website': stats['wikidata']['with_website']
        },
        'google_maps': {
            'success': stats['google_maps']['success'],
            'not_found': stats['google_maps']['not_found'],
            'coverage_pct': round(_pct(stats['google_maps']['success'], total), 1),
            'with_coordinates': stats['google_maps']['with_coordinates'],
            'with_address': stats['google_maps']['with_address'],
            'with_phone': stats['google_maps']['with_phone'],
            'with_website': stats['google_maps']['with_website'],
            'with_opening_hours': stats['google_maps']['with_opening_hours'],
            'with_rating': stats['google_maps']['with_rating'],
            'business_statuses': dict(stats['google_maps']['business_statuses']),
            'provinces': dict(stats['google_maps']['provinces'])
        },
        'geographic': {
            'unique_cities': len(stats['cities']),
            'provinces_covered': len(stats['google_maps']['provinces']),
            'top_cities': dict(sorted(stats['cities'].items(), key=lambda x: -x[1])[:20])
        }
    }
    stats_path = REPORTS_DIR / "kb_libraries_enrichment_stats.json"
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats_json, f, indent=2)
    print(f"   Stats saved: {stats_path}")

    # Print summary (all percentages guarded against zero entries)
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total KB Library Entries: {total}")
    print(f"Valid Entries: {valid_count} ({_pct(valid_count, total):.1f}%)")
    print(f"Wikidata Coverage: {stats['wikidata']['success']} ({_pct(stats['wikidata']['success'], total):.1f}%)")
    print(f"Google Maps Coverage: {stats['google_maps']['success']} ({_pct(stats['google_maps']['success'], total):.1f}%)")
    print(f"Unique Cities: {len(stats['cities'])}")
    print(f"Provinces Covered: {len(stats['google_maps']['provinces'])}")
    print("=" * 60)

    return stats_json


if __name__ == "__main__":
    main()