glam/scripts/validate_japan_dataset.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

302 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Validate Japan ISIL Dataset
Creates a comprehensive validation report for the Japan heritage institutions dataset.
Checks schema compliance, data quality, and completeness.
"""
import sys
import yaml
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any
def load_yaml_file(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML file and return its contents as a list of records.

    Args:
        file_path: Path to the YAML file to read.

    Returns:
        A list of record dicts. A single-document file yields a one-element
        list; an empty file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    # An empty file parses to None -- treat that as "no records" instead of
    # returning [None], which would crash downstream validation.
    if records is None:
        return []
    # Handle both list and single document
    if not isinstance(records, list):
        records = [records]
    return records
def validate_record(record: Dict[str, Any], index: int) -> Dict[str, Any]:
    """Validate a single record - basic schema checks without Pydantic."""
    outcome: Dict[str, Any] = {
        'index': index,
        'id': record.get('id', 'UNKNOWN'),
        'name': record.get('name', 'UNKNOWN'),
        'valid': False,
        'errors': [],
    }
    try:
        # A record is valid only when every required field is present and truthy.
        absent = [
            field
            for field in ('id', 'name', 'institution_type', 'provenance')
            if not record.get(field)
        ]
        if absent:
            outcome['errors'].append(f"Missing required fields: {', '.join(absent)}")
        else:
            outcome['valid'] = True
            outcome['institution_type'] = record.get('institution_type', 'UNKNOWN')
            # Website presence is signalled by an identifier using the 'Website' scheme.
            outcome['has_website'] = any(
                ident.get('identifier_scheme') == 'Website'
                for ident in record.get('identifiers', [])
            )
            # Address presence is judged from the first location entry only.
            locs = record.get('locations', [])
            outcome['has_address'] = bool(locs and locs[0].get('street_address'))
            outcome['ghcid'] = record.get('ghcid', '')
    except Exception as e:
        outcome['errors'].append(f"Unexpected error: {str(e)}")
    return outcome
def analyze_dataset(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Perform comprehensive analysis of the dataset."""
    print(f"Analyzing {len(records)} records...")
    results: List[Dict[str, Any]] = []
    type_counts: Counter = Counter()
    prefecture_counts: Counter = Counter()
    quality = {
        'has_website': 0,
        'has_address': 0,
        'has_postal_code': 0,
        'has_ghcid': 0,
    }
    error_tally = defaultdict(int)
    error_samples: List[Dict[str, Any]] = []
    for position, record in enumerate(records):
        # Periodic progress output for large datasets.
        if position % 1000 == 0:
            print(f" Processed {position}/{len(records)} records...")
        checked = validate_record(record, position)
        results.append(checked)
        if not checked['valid']:
            # Tally each error message and keep the first 10 offending records.
            for message in checked['errors']:
                error_tally[message] += 1
                if len(error_samples) < 10:
                    error_samples.append({
                        'record_id': checked['id'],
                        'name': checked['name'],
                        'error': message,
                    })
            continue
        type_counts[checked.get('institution_type', 'UNKNOWN')] += 1
        if checked.get('has_website'):
            quality['has_website'] += 1
        if checked.get('has_address'):
            quality['has_address'] += 1
        ghcid = checked.get('ghcid')
        if ghcid:
            quality['has_ghcid'] += 1
            # Prefecture code is the second dash-separated GHCID component.
            segments = ghcid.split('-')
            if len(segments) >= 3:
                prefecture_counts[segments[1]] += 1
        # Postal-code presence is checked on the raw record's first location.
        locations = record.get('locations', [])
        if locations and locations[0].get('postal_code'):
            quality['has_postal_code'] += 1
    valid_total = sum(1 for entry in results if entry['valid'])
    return {
        'total_records': len(records),
        'valid_records': valid_total,
        'invalid_records': len(results) - valid_total,
        'validation_rate': (valid_total / len(records) * 100) if records else 0,
        'institution_types': dict(type_counts.most_common()),
        'prefectures': dict(prefecture_counts.most_common()),
        'prefecture_count': len(prefecture_counts),
        'data_quality': quality,
        'errors_by_type': dict(error_tally),
        'sample_errors': error_samples,
        'validation_results': results,
    }
def generate_report(analysis: Dict[str, Any], output_path: Path):
    """Generate comprehensive validation report.

    Builds a human-readable plain-text report from the metrics produced by
    analyze_dataset(), writes it to *output_path* (UTF-8), and returns it.

    Args:
        analysis: Aggregated metrics dict (counts, quality flags, errors).
        output_path: File path the report text is written to.

    Returns:
        The full report as a single string.
    """
    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("JAPAN ISIL DATASET VALIDATION REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"Generated: {datetime.now().isoformat()}")
    report_lines.append("Dataset: data/instances/japan/jp_institutions.yaml")
    report_lines.append("")
    # Summary
    report_lines.append("VALIDATION SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Records: {analysis['total_records']:,}")
    report_lines.append(f"Valid Records: {analysis['valid_records']:,} ({analysis['validation_rate']:.2f}%)")
    report_lines.append(f"Invalid Records: {analysis['invalid_records']:,}")
    report_lines.append("")
    total = analysis['total_records']
    # Guard all percentage math against an empty dataset (total == 0 would
    # otherwise raise ZeroDivisionError in the quality/recommendation sections).
    denom = total or 1
    # Institution Type Breakdown
    report_lines.append("INSTITUTION TYPE BREAKDOWN")
    report_lines.append("-" * 80)
    for inst_type, count in sorted(analysis['institution_types'].items(), key=lambda x: -x[1]):
        percentage = (count / denom * 100)
        report_lines.append(f" {inst_type:20s} {count:6,} ({percentage:5.2f}%)")
    report_lines.append("")
    # Geographic Coverage
    report_lines.append("GEOGRAPHIC COVERAGE")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Prefectures: {analysis['prefecture_count']}")
    report_lines.append("")
    report_lines.append("Top 10 Prefectures by Institution Count:")
    for prefecture, count in list(analysis['prefectures'].items())[:10]:
        report_lines.append(f" {prefecture:10s} {count:6,}")
    report_lines.append("")
    # Data Quality Metrics
    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    dq = analysis['data_quality']
    report_lines.append(f"GHCID Coverage: {dq['has_ghcid']:6,} / {total:6,} ({dq['has_ghcid']/denom*100:5.2f}%)")
    report_lines.append(f"Website URLs: {dq['has_website']:6,} / {total:6,} ({dq['has_website']/denom*100:5.2f}%)")
    report_lines.append(f"Street Addresses: {dq['has_address']:6,} / {total:6,} ({dq['has_address']/denom*100:5.2f}%)")
    report_lines.append(f"Postal Codes: {dq['has_postal_code']:6,} / {total:6,} ({dq['has_postal_code']/denom*100:5.2f}%)")
    report_lines.append("")
    # Validation Errors (top 10 error types plus up to 10 sample records)
    if analysis['errors_by_type']:
        report_lines.append("VALIDATION ERRORS")
        report_lines.append("-" * 80)
        for error_type, count in sorted(analysis['errors_by_type'].items(), key=lambda x: -x[1])[:10]:
            report_lines.append(f" [{count:4}] {error_type[:70]}")
        report_lines.append("")
        if analysis['sample_errors']:
            report_lines.append("Sample Error Records (first 10):")
            for i, sample in enumerate(analysis['sample_errors'], 1):
                report_lines.append(f" {i}. {sample['name']}")
                report_lines.append(f" ID: {sample['record_id']}")
                report_lines.append(f" Error: {sample['error'][:70]}")
            report_lines.append("")
    else:
        report_lines.append("✅ NO VALIDATION ERRORS - All records valid!")
        report_lines.append("")
    # Schema Compliance (pass/warn/fail thresholds at 100% and 95%)
    report_lines.append("SCHEMA COMPLIANCE")
    report_lines.append("-" * 80)
    if analysis['validation_rate'] == 100.0:
        report_lines.append("✅ PASS - All records conform to LinkML schema")
    elif analysis['validation_rate'] >= 95.0:
        report_lines.append("⚠️ WARN - Most records valid, minor issues detected")
    else:
        report_lines.append("❌ FAIL - Significant schema validation issues")
    report_lines.append("")
    # Recommendations (coverage thresholds: 90% websites, 95% addresses)
    report_lines.append("RECOMMENDATIONS")
    report_lines.append("-" * 80)
    if dq['has_website'] / denom < 0.90:
        report_lines.append(f"⚠️ Website coverage at {dq['has_website']/denom*100:.1f}% - consider enriching from web")
    else:
        report_lines.append(f"✅ Website coverage excellent ({dq['has_website']/denom*100:.1f}%)")
    if dq['has_address'] / denom < 0.95:
        report_lines.append(f"⚠️ Address coverage at {dq['has_address']/denom*100:.1f}% - geocoding may be incomplete")
    else:
        report_lines.append(f"✅ Address coverage excellent ({dq['has_address']/denom*100:.1f}%)")
    if analysis['prefecture_count'] < 47:
        report_lines.append(f" Only {analysis['prefecture_count']}/47 prefectures represented - may be incomplete")
    else:
        report_lines.append("✅ All 47 prefectures represented")
    report_lines.append("")
    report_lines.append("NEXT STEPS")
    report_lines.append("-" * 80)
    report_lines.append("1. Review sample error records if validation < 100%")
    report_lines.append("2. Consider geocoding addresses to add lat/lon coordinates")
    report_lines.append("3. Map prefecture codes to ISO 3166-2 (JP-01 through JP-47)")
    report_lines.append("4. Merge with global dataset (NL, EUR, Latin America)")
    report_lines.append("5. Export to GeoJSON for geographic visualization")
    report_lines.append("")
    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)
    # Write report
    report_text = "\n".join(report_lines)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report_text)
    return report_text
def main():
    """Main validation workflow."""
    # Resolve dataset and report locations relative to the repository root
    # (this script lives in a subdirectory, hence parent.parent).
    base_dir = Path(__file__).parent.parent
    dataset_path = base_dir / 'data' / 'instances' / 'japan' / 'jp_institutions.yaml'
    report_path = base_dir / 'data' / 'instances' / 'japan' / 'validation_report.txt'
    banner = "=" * 80
    print(banner)
    print("Japan ISIL Dataset Validation")
    print(banner)
    print(f"Input: {dataset_path}")
    print(f"Output: {report_path}")
    print()
    # Load dataset
    print("Loading dataset...")
    records = load_yaml_file(dataset_path)
    print(f"Loaded {len(records)} records")
    print()
    # Analyze
    analysis = analyze_dataset(records)
    # Generate report
    print("\nGenerating report...")
    report_text = generate_report(analysis, report_path)
    # Echo the full report to the console as well.
    print("\n" + report_text)
    print(f"\n✅ Validation report saved to: {report_path}")
    # Exit non-zero when fewer than 95% of records validated.
    sys.exit(0 if analysis['validation_rate'] >= 95.0 else 1)
if __name__ == '__main__':
main()