glam/scripts/export_belgian_institutions.py
2025-11-19 23:25:22 +01:00

110 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
Export Belgian ISIL institutions to LinkML YAML instance files.
This script:
1. Parses Belgian ISIL registry CSV (421 institutions)
2. Converts to HeritageCustodian LinkML models
3. Exports to YAML instance files (one file with all institutions)
Output:
- data/instances/belgium_isil_institutions.yaml (all 421 institutions)
"""
import sys
from pathlib import Path
sys.path.insert(0, 'src')
from glam_extractor.parsers.belgian_isil import BelgianISILParser
from linkml_runtime.dumpers import yaml_dumper
def main():
    """Parse the Belgian ISIL registry CSV and export it as LinkML YAML.

    Steps, in order:

    1. Parse ``data/isil/belgian_isil_detailed.csv`` with
       ``BelgianISILParser.parse_and_convert``.
    2. Print a count of institutions per ``institution_type``.
    3. Print how many records have identifiers, a description and
       alternative names.
    4. Write every record to
       ``data/instances/belgium_isil_institutions.yaml`` (the parent
       directory is created if missing).
    5. Print the output size and the first three records as a sample.

    Both paths are relative to the current working directory.

    The output file is a multi-document YAML stream: a comment header,
    then one document per institution, each separated by a ``---`` marker.
    Without the separators the per-institution mappings would all land in
    a single document and repeat the same keys, which is not valid YAML.
    """
    from collections import Counter

    # Input file
    csv_file = Path("data/isil/belgian_isil_detailed.csv")

    # Output file
    output_file = Path("data/instances/belgium_isil_institutions.yaml")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("Belgian ISIL Registry → LinkML Instance Export")
    print("=" * 60)

    # Parse CSV
    print(f"\n1. Parsing {csv_file}...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert(csv_file)
    total = len(custodians)
    print(f" ✓ Parsed {total} institutions")

    # Statistics (computed once; reused for the console and the file header)
    type_counts = Counter(str(c.institution_type) for c in custodians)
    type_distribution = type_counts.most_common()
    print("\n2. Institution types:")
    for inst_type, count in type_distribution:
        print(f" - {inst_type}: {count}")

    # Quality check
    with_identifiers = sum(1 for c in custodians if c.identifiers)
    with_descriptions = sum(1 for c in custodians if c.description)
    with_alt_names = sum(1 for c in custodians if c.alternative_names)
    print("\n3. Data quality:")
    print(f" - With ISIL identifiers: {with_identifiers} / {total}")
    print(f" - With descriptions: {with_descriptions} / {total}")
    print(f" - With alternative names: {with_alt_names} / {total}")

    # Export to YAML
    print(f"\n4. Exporting to {output_file}...")

    # The file is written by hand (rather than dumping one big structure)
    # so that the comment header and per-institution comments are kept.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write(f"# Generated: {custodians[0].provenance.extraction_date if custodians else 'unknown'}\n")
        f.write("#\n")
        f.write("# Distribution:\n")
        for inst_type, count in type_distribution:
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n\n")

        # Export each institution as its own YAML document
        for idx, custodian in enumerate(custodians, 1):
            if idx > 1:
                # Start a new document; otherwise this mapping's keys would
                # collide with those of the previous institution.
                f.write("---\n")
            f.write(f"# Institution {idx}/{total}\n")
            f.write(yaml_dumper.dumps(custodian))
            f.write("\n")
            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    print(f" ✓ Exported {total} institutions to YAML")

    # File size
    file_size_kb = output_file.stat().st_size / 1024
    print("\n5. Output file:")
    print(f" - Path: {output_file}")
    print(f" - Size: {file_size_kb:.1f} KB")
    print(" - Format: LinkML YAML")
    print("\n" + "=" * 60)
    print("✓ Export complete!")
    print("=" * 60)

    # Sample records
    print("\nSample records:\n")
    for i, c in enumerate(custodians[:3], 1):
        print(f"{i}. {c.name} ({c.id})")
        print(f" Type: {c.institution_type}")
        print(f" ISIL: {c.identifiers[0].identifier_value if c.identifiers else 'N/A'}")
        if c.alternative_names:
            print(f" Alt names: {', '.join(c.alternative_names)}")
        print()
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()