# glam/parse_japan_isil.py

#!/usr/bin/env python3
"""Parse Japanese ISIL registries to LinkML format."""

import csv
import yaml
from datetime import datetime, timezone
from typing import List, Dict

def _field(row: Dict, key: str) -> str:
    """Return the stripped value of *key* in *row*, or '' when absent.

    ``csv.DictReader`` fills columns that are missing from a short row with
    ``None`` (its default ``restval``), so ``row.get(key, '')`` alone does
    not guarantee a string.
    """
    return (row.get(key) or '').strip()


def parse_japanese_csv(filepath: str, inst_type: str) -> List[Dict]:
    """Parse a Japanese ISIL registry CSV file into institution records.

    Rows are skipped when the ``ISIL`` column is empty or starts with
    ``DELETE``, or when ``Institution name in English`` is empty.

    Args:
        filepath: Path to a UTF-8 CSV file (with or without a BOM) whose
            header row names the columns read below.
        inst_type: Value stored as ``institution_type`` on every record.

    Returns:
        One dict per retained row with ``id``, ``name``,
        ``institution_type``, ``locations``, ``identifiers`` and
        ``provenance`` keys, plus ``homepage`` / ``contact_info`` when the
        row provides a URL / telephone number.

    Raises:
        OSError: If *filepath* cannot be opened.
    """
    institutions = []

    print(f"Parsing {filepath}...")

    # The timestamp does not depend on the row, so compute it once and give
    # every record from this file the same extraction date.
    extraction_date = datetime.now(timezone.utc).isoformat()

    # 'utf-8-sig' transparently drops a leading BOM (otherwise the first
    # header would be '\ufeffISIL' and every row would be skipped);
    # newline='' lets the csv module handle embedded newlines itself.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)

        for row in reader:
            isil = _field(row, 'ISIL')
            if not isil or isil.startswith('DELETE'):
                continue

            name_en = _field(row, 'Institution name in English')
            if not name_en:
                continue

            # Create institution ID
            inst_id = f"https://w3id.org/heritage/custodian/jp/{isil.lower().replace('-', '')}"

            # Parse location
            prefecture = _field(row, 'Prefecture').title()
            city = _field(row, 'City/Ward/Town/Village').title()
            postal_code = _field(row, 'Postal code')
            street = _field(row, 'Address (Street Level)')

            # Build a single-line address from whichever parts are present
            address_parts = [street, city, prefecture, postal_code]
            street_address = ', '.join(p for p in address_parts if p)

            # Only non-empty location fields are emitted
            location = {'country': 'JP'}
            for key, value in (
                ('city', city),
                ('region', prefecture),
                ('postal_code', postal_code),
                ('street_address', street_address),
            ):
                if value:
                    location[key] = value

            # Get additional fields
            phone = _field(row, 'Telephone number')
            website = _field(row, 'URL')

            # Create LinkML record
            institution = {
                'id': inst_id,
                'name': name_en,
                'institution_type': inst_type,
                'locations': [location],
                'identifiers': [
                    {
                        'identifier_scheme': 'ISIL',
                        'identifier_value': isil,
                        'identifier_url': f'https://isil.org/{isil}'
                    }
                ],
                'provenance': {
                    'data_source': 'CSV_REGISTRY',
                    'data_tier': 'TIER_1_AUTHORITATIVE',
                    'extraction_date': extraction_date,
                    'extraction_method': 'CSV parsing from Japanese ISIL registry',
                    'confidence_score': 0.98,
                    'source_url': 'https://www.ndl.go.jp/jp/aboutus/isil.html'
                }
            }

            # Add optional fields
            if website:
                institution['homepage'] = website
                institution['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website
                })

            if phone:
                institution['contact_info'] = {'phone': phone}

            institutions.append(institution)

    print(f"  ✓ Parsed {len(institutions)} institutions")
    return institutions

def main():
    """Parse every Japanese ISIL CSV and write combined and per-dataset YAML.

    Reads the four registry CSVs under ``data/isil/JP/``, writes all records
    to ``data/instances/japan_isil_all.yaml``, then writes one
    ``data/instances/japan_<label>.yaml`` file per non-empty dataset.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import os

    print("=" * 70)
    print("Japanese ISIL Registry Parsing")
    print("=" * 70)

    datasets = [
        ('data/isil/JP/archives.csv', 'ARCHIVE', 'archives'),
        ('data/isil/JP/museums.csv', 'MUSEUM', 'museums'),
        ('data/isil/JP/libraries_public.csv', 'LIBRARY', 'libraries_public'),
        ('data/isil/JP/libraries_other.csv', 'LIBRARY', 'libraries_other')
    ]

    all_institutions = []
    # Keep each dataset's own records. Two datasets share the 'LIBRARY'
    # type, so filtering the combined list by institution_type afterwards
    # cannot tell public and other libraries apart.
    per_dataset = {}

    for filepath, inst_type, label in datasets:
        institutions = parse_japanese_csv(filepath, inst_type)
        all_institutions.extend(institutions)
        per_dataset[label] = institutions

    stats = {label: len(records) for label, records in per_dataset.items()}

    print(f"\n✓ Total institutions parsed: {len(all_institutions)}")
    print("\nBreakdown:")
    for label, count in stats.items():
        print(f"  {label}: {count}")

    # Save combined file
    output_file = "data/instances/japan_isil_all.yaml"
    # Make sure the output directory exists before the first write.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write("# Japanese ISIL Registry - All Institutions\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Total institutions: {len(all_institutions)}\n")
        f.write(f"# Archives: {stats.get('archives', 0)}\n")
        f.write(f"# Museums: {stats.get('museums', 0)}\n")
        f.write(f"# Public Libraries: {stats.get('libraries_public', 0)}\n")
        f.write(f"# Other Libraries: {stats.get('libraries_other', 0)}\n\n")
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"\n✓ Saved combined dataset to {output_file}")

    # Also save individual files, each holding only its own dataset's records
    for label, institutions in per_dataset.items():
        if not institutions:
            continue

        output = f"data/instances/japan_{label}.yaml"
        with open(output, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write(f"# Japanese ISIL Registry - {label.replace('_', ' ').title()}\n")
            f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Total institutions: {len(institutions)}\n\n")
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f"  ✓ Saved {output}")

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL parsing complete!")
    print("=" * 70)

# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()