#!/usr/bin/env python3
"""
Validate Japan ISIL Dataset

Creates a comprehensive validation report for the Japan heritage
institutions dataset.  Checks schema compliance, data quality, and
completeness.
"""

import sys
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime
from typing import Any, Dict, List


def load_yaml_file(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML file and return its contents as a list of records.

    A single-document file is wrapped in a one-element list.  An empty
    file yields an empty list (previously it produced ``[None]``, which
    crashed downstream validation).
    """
    # Lazy import: PyYAML is only required when a file is actually loaded,
    # so the pure analysis/report helpers stay importable without it.
    import yaml

    with open(file_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    if records is None:
        # Empty document -> no records, not a single None record.
        return []
    if not isinstance(records, list):
        records = [records]
    return records


def validate_record(record: Dict[str, Any], index: int) -> Dict[str, Any]:
    """Validate a single record - basic schema checks without Pydantic.

    Args:
        record: One parsed YAML record (expected to be a mapping).
        index: Position of the record in the dataset (for reporting).

    Returns:
        A result dict with keys ``index``, ``id``, ``name``, ``valid``,
        ``errors`` and, for valid records, ``institution_type``,
        ``has_website``, ``has_address`` and ``ghcid``.
    """
    # Guard: a malformed YAML entry (e.g. a bare scalar or None) is not a
    # mapping; previously record.get() raised an uncaught AttributeError
    # because it ran before the try block below.
    if not isinstance(record, dict):
        return {
            'index': index,
            'id': 'UNKNOWN',
            'name': 'UNKNOWN',
            'valid': False,
            'errors': [f"Record is not a mapping: {type(record).__name__}"],
        }

    result = {
        'index': index,
        'id': record.get('id', 'UNKNOWN'),
        'name': record.get('name', 'UNKNOWN'),
        'valid': False,
        'errors': []
    }

    try:
        # Basic validation checks
        required_fields = ['id', 'name', 'institution_type', 'provenance']
        missing = [f for f in required_fields if f not in record or not record[f]]

        if missing:
            result['errors'].append(f"Missing required fields: {', '.join(missing)}")
        else:
            result['valid'] = True
            result['institution_type'] = record.get('institution_type', 'UNKNOWN')

            # Check for website
            identifiers = record.get('identifiers', [])
            result['has_website'] = any(
                i.get('identifier_scheme') == 'Website' for i in identifiers
            )

            # Check for address (first location only, matching original logic)
            locations = record.get('locations', [])
            result['has_address'] = bool(locations and locations[0].get('street_address'))

            # Get GHCID
            result['ghcid'] = record.get('ghcid', '')
    except Exception as e:
        # Broad catch is deliberate: one bad record must not abort the run.
        result['errors'].append(f"Unexpected error: {str(e)}")

    return result


def analyze_dataset(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Perform comprehensive analysis of the dataset.

    Validates every record, tallies institution types, prefecture codes
    (second segment of the GHCID), and data-quality coverage counters.
    """
    print(f"Analyzing {len(records)} records...")

    validation_results = []
    institution_types = Counter()
    prefectures = Counter()
    data_quality = {
        'has_website': 0,
        'has_address': 0,
        'has_postal_code': 0,
        'has_ghcid': 0
    }
    errors_by_type = defaultdict(int)
    sample_errors = []

    for idx, record in enumerate(records):
        if idx % 1000 == 0:
            print(f"  Processed {idx}/{len(records)} records...")

        result = validate_record(record, idx)
        validation_results.append(result)

        if result['valid']:
            institution_types[result.get('institution_type', 'UNKNOWN')] += 1

            if result.get('has_website'):
                data_quality['has_website'] += 1
            if result.get('has_address'):
                data_quality['has_address'] += 1
            if result.get('ghcid'):
                data_quality['has_ghcid'] += 1

                # Extract prefecture from GHCID (format assumed: XX-<pref>-NNNN)
                parts = result['ghcid'].split('-')
                if len(parts) >= 3:
                    prefectures[parts[1]] += 1  # Prefecture code

            # Check postal code (first location only)
            locations = record.get('locations', [])
            if locations and locations[0].get('postal_code'):
                data_quality['has_postal_code'] += 1
        else:
            for error in result['errors']:
                errors_by_type[error] += 1
                if len(sample_errors) < 10:
                    sample_errors.append({
                        'record_id': result['id'],
                        'name': result['name'],
                        'error': error
                    })

    valid_count = sum(1 for r in validation_results if r['valid'])
    invalid_count = len(validation_results) - valid_count

    return {
        'total_records': len(records),
        'valid_records': valid_count,
        'invalid_records': invalid_count,
        'validation_rate': (valid_count / len(records) * 100) if records else 0,
        'institution_types': dict(institution_types.most_common()),
        'prefectures': dict(prefectures.most_common()),
        'prefecture_count': len(prefectures),
        'data_quality': data_quality,
        'errors_by_type': dict(errors_by_type),
        'sample_errors': sample_errors,
        'validation_results': validation_results
    }


def generate_report(analysis: Dict[str, Any], output_path: Path) -> str:
    """Generate comprehensive validation report.

    Writes the report to ``output_path`` and returns the report text.
    Safe for an empty dataset: percentage denominators are clamped to 1
    (previously this raised ZeroDivisionError when total_records == 0).
    """
    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("JAPAN ISIL DATASET VALIDATION REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"Generated: {datetime.now().isoformat()}")
    # NOTE(review): hard-coded path label — assumes the caller validated this
    # exact file; consider passing the real dataset path in.
    report_lines.append("Dataset: data/instances/japan/jp_institutions.yaml")
    report_lines.append("")

    # Summary
    report_lines.append("VALIDATION SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Records: {analysis['total_records']:,}")
    report_lines.append(f"Valid Records: {analysis['valid_records']:,} ({analysis['validation_rate']:.2f}%)")
    report_lines.append(f"Invalid Records: {analysis['invalid_records']:,}")
    report_lines.append("")

    total = analysis['total_records']
    # Clamped denominator so an empty dataset produces 0.00% instead of
    # crashing with ZeroDivisionError.
    denom = total or 1

    # Institution Type Breakdown
    report_lines.append("INSTITUTION TYPE BREAKDOWN")
    report_lines.append("-" * 80)
    for inst_type, count in sorted(analysis['institution_types'].items(), key=lambda x: -x[1]):
        percentage = (count / denom * 100)
        report_lines.append(f" {inst_type:20s} {count:6,} ({percentage:5.2f}%)")
    report_lines.append("")

    # Geographic Coverage
    report_lines.append("GEOGRAPHIC COVERAGE")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Prefectures: {analysis['prefecture_count']}")
    report_lines.append("")
    report_lines.append("Top 10 Prefectures by Institution Count:")
    for prefecture, count in list(analysis['prefectures'].items())[:10]:
        report_lines.append(f" {prefecture:10s} {count:6,}")
    report_lines.append("")

    # Data Quality Metrics
    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    dq = analysis['data_quality']
    report_lines.append(f"GHCID Coverage: {dq['has_ghcid']:6,} / {total:6,} ({dq['has_ghcid']/denom*100:5.2f}%)")
    report_lines.append(f"Website URLs: {dq['has_website']:6,} / {total:6,} ({dq['has_website']/denom*100:5.2f}%)")
    report_lines.append(f"Street Addresses: {dq['has_address']:6,} / {total:6,} ({dq['has_address']/denom*100:5.2f}%)")
    report_lines.append(f"Postal Codes: {dq['has_postal_code']:6,} / {total:6,} ({dq['has_postal_code']/denom*100:5.2f}%)")
    report_lines.append("")

    # Validation Errors
    if analysis['errors_by_type']:
        report_lines.append("VALIDATION ERRORS")
        report_lines.append("-" * 80)
        for error_type, count in sorted(analysis['errors_by_type'].items(), key=lambda x: -x[1])[:10]:
            report_lines.append(f" [{count:4}] {error_type[:70]}")
        report_lines.append("")

        if analysis['sample_errors']:
            report_lines.append("Sample Error Records (first 10):")
            for i, sample in enumerate(analysis['sample_errors'], 1):
                report_lines.append(f" {i}. {sample['name']}")
                report_lines.append(f" ID: {sample['record_id']}")
                report_lines.append(f" Error: {sample['error'][:70]}")
            report_lines.append("")
    else:
        report_lines.append("✅ NO VALIDATION ERRORS - All records valid!")
        report_lines.append("")

    # Schema Compliance
    report_lines.append("SCHEMA COMPLIANCE")
    report_lines.append("-" * 80)
    if analysis['validation_rate'] == 100.0:
        report_lines.append("✅ PASS - All records conform to LinkML schema")
    elif analysis['validation_rate'] >= 95.0:
        report_lines.append("⚠️ WARN - Most records valid, minor issues detected")
    else:
        report_lines.append("❌ FAIL - Significant schema validation issues")
    report_lines.append("")

    # Recommendations
    report_lines.append("RECOMMENDATIONS")
    report_lines.append("-" * 80)
    if dq['has_website'] / denom < 0.90:
        report_lines.append(f"⚠️ Website coverage at {dq['has_website']/denom*100:.1f}% - consider enriching from web")
    else:
        report_lines.append(f"✅ Website coverage excellent ({dq['has_website']/denom*100:.1f}%)")
    if dq['has_address'] / denom < 0.95:
        report_lines.append(f"⚠️ Address coverage at {dq['has_address']/denom*100:.1f}% - geocoding may be incomplete")
    else:
        report_lines.append(f"✅ Address coverage excellent ({dq['has_address']/denom*100:.1f}%)")
    if analysis['prefecture_count'] < 47:
        report_lines.append(f"ℹ️ Only {analysis['prefecture_count']}/47 prefectures represented - may be incomplete")
    else:
        report_lines.append("✅ All 47 prefectures represented")
    report_lines.append("")

    report_lines.append("NEXT STEPS")
    report_lines.append("-" * 80)
    report_lines.append("1. Review sample error records if validation < 100%")
    report_lines.append("2. Consider geocoding addresses to add lat/lon coordinates")
    report_lines.append("3. Map prefecture codes to ISO 3166-2 (JP-01 through JP-47)")
    report_lines.append("4. Merge with global dataset (NL, EUR, Latin America)")
    report_lines.append("5. Export to GeoJSON for geographic visualization")
    report_lines.append("")
    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)

    # Write report
    report_text = "\n".join(report_lines)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report_text)

    return report_text


def main():
    """Main validation workflow: load dataset, analyze, write/print report.

    Exits with status 1 when less than 95% of records validate.
    """
    # Paths
    base_dir = Path(__file__).parent.parent
    dataset_path = base_dir / 'data' / 'instances' / 'japan' / 'jp_institutions.yaml'
    report_path = base_dir / 'data' / 'instances' / 'japan' / 'validation_report.txt'

    print("=" * 80)
    print("Japan ISIL Dataset Validation")
    print("=" * 80)
    print(f"Input: {dataset_path}")
    print(f"Output: {report_path}")
    print()

    # Load dataset
    print("Loading dataset...")
    records = load_yaml_file(dataset_path)
    print(f"Loaded {len(records)} records")
    print()

    # Analyze
    analysis = analyze_dataset(records)

    # Generate report
    print("\nGenerating report...")
    report_text = generate_report(analysis, report_path)

    # Print to console
    print("\n" + report_text)
    print(f"\n✅ Validation report saved to: {report_path}")

    # Return exit code based on validation
    if analysis['validation_rate'] < 95.0:
        sys.exit(1)  # Fail if < 95% valid
    else:
        sys.exit(0)


if __name__ == '__main__':
    main()