#!/usr/bin/env python3
"""
Export ALL Japanese ISIL registry institutions to LinkML format.

This script performs a COMPLETE export of all Japanese heritage institutions
from the National Diet Library ISIL registry (12,065+ institutions) into a
single consolidated LinkML-compliant YAML file.

Data Sources:
- Public libraries (libraries_public.csv)
- Academic/specialized libraries (libraries_other.csv)
- Museums (museums.csv)
- Archives (archives.csv)

Output: data/instances/japan/jp_institutions.yaml (COMPLETE dataset)

Data Quality: TIER_1_AUTHORITATIVE (official National Diet Library registry)
Coverage: All Japanese prefectures (47 prefectures)
"""

import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List
from collections import Counter

# Add project root to Python path
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.parsers.japanese_isil import JapaneseISILParser
from glam_extractor.models import InstitutionType, HeritageCustodian
import yaml


def serialize_custodian(custodian: HeritageCustodian) -> dict:
    """
    Serialize HeritageCustodian to YAML-safe dictionary.

    Uses JSON serialization for clean conversion of all Pydantic models,
    datetime objects, and HttpUrl objects to plain Python types.
    """
    import json

    # NOTE(review): `.json()` is the Pydantic v1 API; under Pydantic v2 this
    # would be `model_dump_json()` — confirm the pinned pydantic version.
    json_str = custodian.json(exclude_none=True)
    data = json.loads(json_str)
    return data


def analyze_coverage(custodians: List[HeritageCustodian]) -> Dict:
    """Generate comprehensive statistics about the dataset.

    Returns a dict with the institution total, counts by type and by
    prefecture (top 10), field-coverage percentages (GHCID, website, phone,
    postal code, street address), and the distribution of ISIL code prefixes.

    Safe on an empty input: percentages are reported as "0.0%" instead of
    raising ZeroDivisionError (previously the script crashed if every source
    CSV file was missing and the custodian list was empty).
    """
    total = len(custodians)

    def pct(count: int) -> str:
        # Guard the empty-dataset case: total == 0 must not divide.
        return f"{count/total*100:.1f}%" if total else "0.0%"

    # Count by institution type
    types = Counter(c.institution_type for c in custodians)

    # Count by prefecture (first location's region; "UNKNOWN" when absent)
    prefectures = Counter(
        c.locations[0].region if c.locations and c.locations[0].region else "UNKNOWN"
        for c in custodians
    )

    # Coverage metrics
    with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
    with_url = sum(
        1 for c in custodians
        if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
    )
    # Phone presence is inferred from description text (English or Japanese label).
    with_phone = sum(
        1 for c in custodians
        if c.description and ("Tel:" in c.description or "電話:" in c.description)
    )
    with_postal = sum(
        1 for c in custodians if c.locations and c.locations[0].postal_code
    )
    with_street = sum(
        1 for c in custodians if c.locations and c.locations[0].street_address
    )

    # ISIL code patterns: first two characters after the country prefix
    # (e.g. "JP-10..." -> "10"); "UNKNOWN" when no ISIL code is present.
    isil_codes = [
        next((i.identifier_value for i in c.identifiers if i.identifier_scheme == "ISIL"), None)
        for c in custodians
    ]
    isil_prefixes = Counter(
        code.split('-')[1][:2] if code and '-' in code else "UNKNOWN"
        for code in isil_codes
    )

    return {
        "total_institutions": total,
        "by_type": dict(types),
        "by_prefecture": dict(prefectures.most_common(10)),  # Top 10
        "total_prefectures": len([p for p in prefectures if p != "UNKNOWN"]),
        "coverage": {
            "ghcid": {"count": with_ghcid, "percentage": pct(with_ghcid)},
            "website": {"count": with_url, "percentage": pct(with_url)},
            "phone": {"count": with_phone, "percentage": pct(with_phone)},
            "postal_code": {"count": with_postal, "percentage": pct(with_postal)},
            "street_address": {"count": with_street, "percentage": pct(with_street)},
        },
        "isil_distribution": dict(isil_prefixes.most_common(10)),  # Top 10 ISIL prefixes
    }


def main():
    """Export all Japanese ISIL institutions to consolidated LinkML YAML.

    Reads the four source CSVs from data/isil/JP, converts each record to a
    HeritageCustodian, writes the full dataset to
    data/instances/japan/jp_institutions.yaml plus a statistics sidecar file,
    and prints a progress/summary report. Missing source files are skipped
    with a warning; per-file parse errors are reported and do not abort the
    remaining files.
    """
    print("="*80)
    print("JAPANESE ISIL REGISTRY → LinkML EXPORT")
    print("="*80)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Output file for complete dataset
    output_path = output_dir / "jp_institutions.yaml"

    parser = JapaneseISILParser()

    # Define CSV files and their institution types
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Academic/specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Archives"),
    ]

    all_custodians: List[HeritageCustodian] = []
    file_stats: Dict[str, Dict] = {}

    # Process each CSV file
    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename

        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            print(f" Skipping {description}")
            continue

        print(f"\n{'─'*80}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type.value}")
        print(f"Description: {description}")
        print(f"{'─'*80}")

        try:
            # Parse CSV
            print(f" → Parsing CSV...")
            records = parser.parse_file(csv_path, inst_type)
            print(f" ✓ Parsed {len(records):,} raw records")

            # Convert to HeritageCustodian models
            print(f" → Converting to LinkML format...")
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f" ✓ Converted {len(custodians):,} HeritageCustodian instances")

            # Calculate per-file statistics
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
            )
            prefectures = len(set(
                c.locations[0].region
                for c in custodians
                if c.locations and c.locations[0].region
            ))

            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "prefectures_covered": prefectures,
                # Guard against an empty (but existing) CSV file.
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            file_stats[csv_filename] = stats

            print(f" → Statistics:")
            print(f" • GHCID coverage: {with_ghcid:,}/{len(records):,} ({stats['ghcid_coverage']})")
            print(f" • Website URLs: {with_url:,}/{len(records):,} ({stats['url_coverage']})")
            print(f" • Prefectures: {prefectures}")

            # Add to master list
            all_custodians.extend(custodians)
            print(f" ✓ Added to master dataset")

        except Exception as e:
            # Best-effort per file: report the failure and move on to the
            # next CSV rather than aborting the whole export.
            print(f" ❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Generate comprehensive statistics
    print(f"\n{'='*80}")
    print("ANALYZING COMPLETE DATASET")
    print(f"{'='*80}")

    stats = analyze_coverage(all_custodians)

    print(f"\n📊 Dataset Overview:")
    print(f" • Total institutions: {stats['total_institutions']:,}")
    print(f" • Prefectures covered: {stats['total_prefectures']}/47")

    print(f"\n📚 By Institution Type:")
    for inst_type, count in stats["by_type"].items():
        print(f" • {inst_type}: {count:,}")

    print(f"\n🗺️ Top 10 Prefectures:")
    for prefecture, count in stats["by_prefecture"].items():
        print(f" • {prefecture}: {count:,}")

    print(f"\n✅ Data Coverage:")
    for field, coverage in stats["coverage"].items():
        print(f" • {field.replace('_', ' ').title()}: {coverage['count']:,} ({coverage['percentage']})")

    # Export to YAML
    print(f"\n{'='*80}")
    print("EXPORTING TO LINKML YAML")
    print(f"{'='*80}")

    print(f" → Serializing {len(all_custodians):,} institutions...")
    serialized = [serialize_custodian(c) for c in all_custodians]

    print(f" → Writing to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            serialized,
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=120,
        )

    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✓ Export complete!")
    print(f" 📁 File: {output_path}")
    print(f" 📏 Size: {file_size_mb:.2f} MB")

    # Export statistics summary
    stats_path = output_dir / "jp_institutions_statistics.yaml"
    print(f"\n → Writing statistics to {stats_path}...")
    # NOTE(review): stats["by_type"] keys are InstitutionType members, so
    # yaml.dump emits them with Python-specific tags — confirm downstream
    # consumers of the statistics file accept that representation.
    with open(stats_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            {
                "export_date": datetime.now().isoformat(),
                "source_files": file_stats,
                "dataset_statistics": stats,
            },
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )
    print(f" ✓ Statistics exported")

    # Final summary
    print(f"\n{'='*80}")
    print("✅ EXPORT COMPLETE")
    print(f"{'='*80}")
    print(f"\n📦 Exported Files:")
    print(f" • Main dataset: {output_path}")
    print(f" • Statistics: {stats_path}")
    print(f"\n📊 Summary:")
    print(f" • Total institutions: {len(all_custodians):,}")
    print(f" • Data tier: TIER_1_AUTHORITATIVE")
    print(f" • Source: National Diet Library ISIL Registry")
    print(f" • Geographic coverage: {stats['total_prefectures']}/47 Japanese prefectures")
    print(f"\n⏱️ Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()


if __name__ == "__main__":
    main()