glam/scripts/export_japanese_isil_to_linkml.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

284 lines
10 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Export ALL Japanese ISIL registry institutions to LinkML format.

This script performs a COMPLETE export of all Japanese heritage institutions
from the National Diet Library ISIL registry (12,065+ institutions) into a
single consolidated LinkML-compliant YAML file.

Data Sources:
- Public libraries (libraries_public.csv)
- Academic/specialized libraries (libraries_other.csv)
- Museums (museums.csv)
- Archives (archives.csv)

Output: data/instances/japan/jp_institutions.yaml (COMPLETE dataset)
Data Quality: TIER_1_AUTHORITATIVE (official National Diet Library registry)
Coverage: All Japanese prefectures (47 prefectures)
"""
# Standard-library imports.
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List
from collections import Counter
# Add project root to Python path
# NOTE(review): the script resolves two levels up from its own location and
# assumes the importable package lives under <project_root>/src — confirm
# this matches the repository layout if the script is relocated.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "src"))
# Project imports become resolvable only after the sys.path tweak above.
from glam_extractor.parsers.japanese_isil import JapaneseISILParser
from glam_extractor.models import InstitutionType, HeritageCustodian
import yaml
def serialize_custodian(custodian: "HeritageCustodian") -> dict:
    """
    Serialize a HeritageCustodian to a YAML-safe plain dictionary.

    Round-trips through JSON so that nested Pydantic models, datetime
    objects, and HttpUrl values are all reduced to plain Python types
    that yaml.dump can emit without custom representers.

    Args:
        custodian: The institution record to serialize.

    Returns:
        A dict containing only JSON-native types; keys whose values are
        None are omitted (exclude_none=True).
    """
    import json

    # Pydantic v2 renamed .json() to .model_dump_json(); on v2 the old
    # name emits a DeprecationWarning.  Prefer the v2 API when present
    # and fall back to the v1 method otherwise.
    if hasattr(custodian, "model_dump_json"):
        json_str = custodian.model_dump_json(exclude_none=True)
    else:
        json_str = custodian.json(exclude_none=True)
    return json.loads(json_str)
def analyze_coverage(custodians: List["HeritageCustodian"]) -> Dict:
    """
    Generate comprehensive statistics about the dataset.

    Args:
        custodians: Parsed institution records (may be empty).

    Returns:
        A dict with the total count, per-type counts, the top-10
        prefectures, field-coverage counts/percentages, and the top-10
        ISIL code prefixes.  An empty input yields zero counts instead
        of raising ZeroDivisionError.
    """
    total = len(custodians)

    def _pct(count: int) -> str:
        # Guard against an empty dataset: total == 0 would otherwise
        # raise ZeroDivisionError in every percentage below.
        return f"{count / total * 100:.1f}%" if total else "0.0%"

    # Count by institution type
    types = Counter(c.institution_type for c in custodians)
    # Count by prefecture; the first location's region is taken as primary.
    prefectures = Counter(
        c.locations[0].region if c.locations and c.locations[0].region else "UNKNOWN"
        for c in custodians
    )
    # Coverage metrics
    with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
    with_url = sum(
        1 for c in custodians
        if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
    )
    # Phone numbers live inside free-text descriptions ("Tel:" or the
    # Japanese "電話:" label), not in a structured field.
    with_phone = sum(
        1 for c in custodians
        if c.description and ("Tel:" in c.description or "電話:" in c.description)
    )
    with_postal = sum(
        1 for c in custodians
        if c.locations and c.locations[0].postal_code
    )
    with_street = sum(
        1 for c in custodians
        if c.locations and c.locations[0].street_address
    )
    # ISIL code patterns, e.g. "JP-1001234" -> prefix "10".
    # (c.identifiers or []) guards against None, matching the with_url check.
    isil_codes = [
        next((i.identifier_value for i in (c.identifiers or []) if i.identifier_scheme == "ISIL"), None)
        for c in custodians
    ]
    isil_prefixes = Counter(
        code.split('-')[1][:2] if code and '-' in code else "UNKNOWN"
        for code in isil_codes
    )
    return {
        "total_institutions": total,
        "by_type": dict(types),
        "by_prefecture": dict(prefectures.most_common(10)),  # Top 10
        "total_prefectures": len([p for p in prefectures if p != "UNKNOWN"]),
        "coverage": {
            "ghcid": {"count": with_ghcid, "percentage": _pct(with_ghcid)},
            "website": {"count": with_url, "percentage": _pct(with_url)},
            "phone": {"count": with_phone, "percentage": _pct(with_phone)},
            "postal_code": {"count": with_postal, "percentage": _pct(with_postal)},
            "street_address": {"count": with_street, "percentage": _pct(with_street)},
        },
        "isil_distribution": dict(isil_prefixes.most_common(10)),  # Top 10 ISIL prefixes
    }
def main():
    """
    Export all Japanese ISIL institutions to a consolidated LinkML YAML file.

    Workflow:
      1. Parse each registry CSV under data/isil/JP/ (libraries, museums,
         archives) into raw records.
      2. Convert records to HeritageCustodian instances.
      3. Aggregate dataset-wide statistics via analyze_coverage().
      4. Write the full dataset and a statistics summary to
         data/instances/japan/.

    Missing CSVs are skipped with a warning; a failure while processing one
    file is reported (with traceback) and the remaining files still run.
    """
    print("="*80)
    print("JAPANESE ISIL REGISTRY → LinkML EXPORT")
    print("="*80)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()
    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)
    # Output file for complete dataset
    output_path = output_dir / "jp_institutions.yaml"
    parser = JapaneseISILParser()
    # Source CSVs with the institution type each one maps to.
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Academic/specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Archives"),
    ]
    all_custodians: List[HeritageCustodian] = []
    file_stats: Dict[str, Dict] = {}
    # Process each CSV file independently so one bad file cannot sink the run.
    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename
        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            print(f" Skipping {description}")
            continue
        # BUG FIX: these two separators were f"\n{''*80}" / f"{''*80}" —
        # repeating the EMPTY string 80 times, which prints nothing.
        # Use a visible dash rule instead.
        print(f"\n{'-'*80}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type.value}")
        print(f"Description: {description}")
        print(f"{'-'*80}")
        try:
            # Parse CSV
            print(f" → Parsing CSV...")
            records = parser.parse_file(csv_path, inst_type)
            print(f" ✓ Parsed {len(records):,} raw records")
            # Convert to HeritageCustodian models
            print(f" → Converting to LinkML format...")
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f" ✓ Converted {len(custodians):,} HeritageCustodian instances")
            # Per-file quality metrics: GHCID/URL coverage and prefecture spread.
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
            )
            prefectures = len(set(
                c.locations[0].region
                for c in custodians
                if c.locations and c.locations[0].region
            ))
            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "prefectures_covered": prefectures,
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            file_stats[csv_filename] = stats
            print(f" → Statistics:")
            print(f" • GHCID coverage: {with_ghcid:,}/{len(records):,} ({stats['ghcid_coverage']})")
            print(f" • Website URLs: {with_url:,}/{len(records):,} ({stats['url_coverage']})")
            print(f" • Prefectures: {prefectures}")
            # Add to master list
            all_custodians.extend(custodians)
            print(f" ✓ Added to master dataset")
        except Exception as e:
            # Report but continue: the remaining CSVs should still be exported.
            print(f" ❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue
    # Generate comprehensive statistics over the merged dataset.
    print(f"\n{'='*80}")
    print("ANALYZING COMPLETE DATASET")
    print(f"{'='*80}")
    stats = analyze_coverage(all_custodians)
    print(f"\n📊 Dataset Overview:")
    print(f" • Total institutions: {stats['total_institutions']:,}")
    print(f" • Prefectures covered: {stats['total_prefectures']}/47")
    print(f"\n📚 By Institution Type:")
    for inst_type, count in stats["by_type"].items():
        print(f"{inst_type}: {count:,}")
    print(f"\n🗺️ Top 10 Prefectures:")
    for prefecture, count in stats["by_prefecture"].items():
        print(f"{prefecture}: {count:,}")
    print(f"\n✅ Data Coverage:")
    for field, coverage in stats["coverage"].items():
        print(f"{field.replace('_', ' ').title()}: {coverage['count']:,} ({coverage['percentage']})")
    # Export to YAML
    print(f"\n{'='*80}")
    print("EXPORTING TO LINKML YAML")
    print(f"{'='*80}")
    print(f" → Serializing {len(all_custodians):,} institutions...")
    serialized = [serialize_custodian(c) for c in all_custodians]
    print(f" → Writing to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            serialized,
            f,
            allow_unicode=True,        # keep Japanese text readable, not escaped
            sort_keys=False,           # preserve model field order
            default_flow_style=False,  # block style for readability
            width=120,
        )
    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✓ Export complete!")
    print(f" 📁 File: {output_path}")
    print(f" 📏 Size: {file_size_mb:.2f} MB")
    # Export statistics summary alongside the dataset.
    stats_path = output_dir / "jp_institutions_statistics.yaml"
    print(f"\n → Writing statistics to {stats_path}...")
    with open(stats_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            {
                "export_date": datetime.now().isoformat(),
                "source_files": file_stats,
                "dataset_statistics": stats,
            },
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )
    print(f" ✓ Statistics exported")
    # Final summary
    print(f"\n{'='*80}")
    print("✅ EXPORT COMPLETE")
    print(f"{'='*80}")
    print(f"\n📦 Exported Files:")
    print(f" • Main dataset: {output_path}")
    print(f" • Statistics: {stats_path}")
    print(f"\n📊 Summary:")
    print(f" • Total institutions: {len(all_custodians):,}")
    print(f" • Data tier: TIER_1_AUTHORITATIVE")
    print(f" • Source: National Diet Library ISIL Registry")
    print(f" • Geographic coverage: {stats['total_prefectures']}/47 Japanese prefectures")
    print(f"\n⏱️ Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()