#!/usr/bin/env python3
"""
Export Belgian ISIL institutions to LinkML YAML instance files.

This script:
1. Parses Belgian ISIL registry CSV (421 institutions)
2. Converts to HeritageCustodian LinkML models
3. Exports to YAML instance files (one file with all institutions)

Output:
- data/instances/belgium_isil_institutions.yaml (all 421 institutions)

The output file is a multi-document YAML stream: a comment header followed by
one YAML document per institution, separated by ``---`` markers. Read it with
a multi-document loader (e.g. ``yaml.safe_load_all``).
"""

import sys
from collections import Counter
from pathlib import Path

# NOTE: 'src' and the data paths below are relative, so the script must be
# run from the directory that contains both 'src/' and 'data/'.
sys.path.insert(0, 'src')

from glam_extractor.parsers.belgian_isil import BelgianISILParser
from linkml_runtime.dumpers import yaml_dumper


def _write_yaml(output_file, custodians, type_counts):
    """Write the comment header and one YAML document per custodian.

    Args:
        output_file: Destination path; overwritten if it already exists.
        custodians: Sequence of parsed institution records to serialize with
            ``yaml_dumper.dumps``.
        type_counts: ``Counter`` of institution-type strings, echoed into the
            header comment as a distribution summary.
    """
    total = len(custodians)

    # Tolerate an empty registry, a record without provenance, or an unset
    # extraction date: the header then simply says 'unknown'.
    provenance = getattr(custodians[0], "provenance", None) if custodians else None
    generated = getattr(provenance, "extraction_date", None) or 'unknown'

    # Write YAML manually to preserve comments and structure
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write(f"# Generated: {generated}\n")
        f.write("#\n")
        f.write("# Distribution:\n")
        for inst_type, count in type_counts.most_common():
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n\n")

        # Export each institution
        for idx, custodian in enumerate(custodians, 1):
            if idx > 1:
                # Each dump is a top-level mapping. Without a document marker
                # between records, the file would be ONE document with every
                # key repeated, which loaders reject or silently collapse to
                # the last institution.
                f.write("---\n")

            f.write(f"# Institution {idx}/{total}\n")
            f.write(yaml_dumper.dumps(custodian))
            f.write("\n")

            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")


def main():
    """Parse Belgian ISIL registry and export to YAML"""
    # Input file
    csv_file = Path("data/isil/belgian_isil_detailed.csv")

    # Output file
    output_file = Path("data/instances/belgium_isil_institutions.yaml")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("Belgian ISIL Registry → LinkML Instance Export")
    print("=" * 60)

    # Parse CSV
    print(f"\n1. Parsing {csv_file}...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert(csv_file)
    total = len(custodians)
    print(f" ✓ Parsed {total} institutions")

    # Statistics
    type_counts = Counter(str(c.institution_type) for c in custodians)
    print("\n2. Institution types:")
    for inst_type, count in type_counts.most_common():
        print(f" - {inst_type}: {count}")

    # Quality check: count records whose optional fields are non-empty
    with_identifiers = sum(1 for c in custodians if c.identifiers)
    with_descriptions = sum(1 for c in custodians if c.description)
    with_alt_names = sum(1 for c in custodians if c.alternative_names)

    print("\n3. Data quality:")
    print(f" - With ISIL identifiers: {with_identifiers} / {total}")
    print(f" - With descriptions: {with_descriptions} / {total}")
    print(f" - With alternative names: {with_alt_names} / {total}")

    # Export to YAML
    print(f"\n4. Exporting to {output_file}...")
    _write_yaml(output_file, custodians, type_counts)
    print(f" ✓ Exported {total} institutions to YAML")

    # File size
    file_size_kb = output_file.stat().st_size / 1024
    print("\n5. Output file:")
    print(f" - Path: {output_file}")
    print(f" - Size: {file_size_kb:.1f} KB")
    print(" - Format: LinkML YAML")

    print("\n" + "=" * 60)
    print("✓ Export complete!")
    print("=" * 60)

    # Sample records
    print("\nSample records:\n")
    for i, c in enumerate(custodians[:3], 1):
        print(f"{i}. {c.name} ({c.id})")
        print(f" Type: {c.institution_type}")
        print(f" ISIL: {c.identifiers[0].identifier_value if c.identifiers else 'N/A'}")
        if c.alternative_names:
            print(f" Alt names: {', '.join(c.alternative_names)}")
        print()


if __name__ == "__main__":
    main()