#!/usr/bin/env python3
"""
Parse Japanese ISIL registry CSV files and generate LinkML-compliant instances.

This script processes the Japanese ISIL registry CSVs from the National Diet Library:
- Public libraries
- Other libraries (academic, specialized)
- Museums
- Archives

Output: YAML files in data/instances/japan/
"""

import sys
from pathlib import Path
from datetime import datetime

# Add project root to Python path so the src layout is importable when run directly
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.parsers.japanese_isil import JapaneseISILParser
from glam_extractor.models import InstitutionType
import yaml


def _serialize_custodian(custodian) -> dict:
    """Convert a HeritageCustodian model to a YAML-safe plain dict.

    Supports both Pydantic v2 (``model_dump``) and v1 (``dict``), and turns
    datetime values in ``provenance`` and ``ghcid_history`` into ISO-8601
    strings so ``yaml.dump`` emits portable scalars instead of Python tags.
    """
    try:
        # Pydantic v2 API
        data = custodian.model_dump(exclude_none=True)
    except AttributeError:
        # Pydantic v1 fallback
        data = custodian.dict(exclude_none=True)

    # Provenance timestamps -> ISO strings
    prov = data.get("provenance")
    if prov:
        for key in ("extraction_date", "verified_date"):
            if prov.get(key):
                prov[key] = prov[key].isoformat()

    # GHCID history validity ranges -> ISO strings
    for entry in data.get("ghcid_history") or []:
        for key in ("valid_from", "valid_to"):
            if entry.get(key):
                entry[key] = entry[key].isoformat()

    return data


def main(sample_size: int = 10):
    """Parse all Japanese ISIL CSV files and export YAML samples.

    For each known CSV in ``data/isil/JP``, parses the raw records, converts
    them to HeritageCustodian instances, prints coverage statistics, and
    writes the first ``sample_size`` records to a ``*_sample.yaml`` file in
    ``data/instances/japan``.

    Args:
        sample_size: Number of records per CSV to export as a YAML sample
            (default 10, matching the original behavior).
    """
    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)

    parser = JapaneseISILParser()

    # Define CSV files and their institution types
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Japanese public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Japanese academic and specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Japanese museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Japanese archives"),
    ]

    all_stats = {}

    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename

        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            continue

        print(f"\n{'='*70}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type}")
        print(f"Description: {description}")
        print(f"{'='*70}\n")

        try:
            # Parse CSV
            records = parser.parse_file(csv_path, inst_type)
            print(f"✓ Parsed {len(records)} raw records")

            # Convert to HeritageCustodian models
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f"✓ Converted to {len(custodians)} HeritageCustodian instances")

            # Statistics
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if any(i.identifier_scheme == "Website" for i in c.identifiers)
            )

            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                # Guard against division by zero when a CSV parses to no records
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            all_stats[csv_filename] = stats

            print(f" - Records with GHCID: {with_ghcid}/{len(records)} ({stats['ghcid_coverage']})")
            print(f" - Records with URL: {with_url}/{len(records)} ({stats['url_coverage']})")

            # Export to YAML (sample first `sample_size` records for testing)
            output_filename = csv_filename.replace(".csv", "_sample.yaml")
            output_path = output_dir / output_filename

            sample_dicts = [
                _serialize_custodian(custodian)
                for custodian in custodians[:sample_size]
            ]

            with open(output_path, 'w', encoding='utf-8') as f:
                yaml.dump(sample_dicts, f, allow_unicode=True, sort_keys=False,
                          default_flow_style=False)

            print(f"✓ Exported sample ({sample_size} records) to: {output_path}")

        except Exception as e:
            # Top-level per-file boundary: report and move on to the next CSV
            print(f"❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}\n")

    for filename, stats in all_stats.items():
        # BUGFIX: original printed the literal "(unknown):" (f-string with no
        # placeholder) instead of the CSV filename the stats belong to.
        print(f"{filename}:")
        print(f" Total records: {stats['total_records']}")
        print(f" GHCID coverage: {stats['ghcid_coverage']}")
        print(f" URL coverage: {stats['url_coverage']}")
        print()

    total_records = sum(s["total_records"] for s in all_stats.values())
    print(f"Grand total: {total_records} Japanese heritage institutions")
    print(f"\nSample YAML files written to: {output_dir}")


if __name__ == "__main__":
    main()