110 lines
3.9 KiB
Python
110 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
Export Belgian ISIL institutions to a LinkML YAML instance file.

This script:
1. Parses the Belgian ISIL registry CSV (421 institutions)
2. Converts each row to a HeritageCustodian LinkML model
3. Exports all institutions to a single YAML instance file

Output:
- data/instances/belgium_isil_institutions.yaml (all 421 institutions)
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, 'src')
|
|
|
|
from glam_extractor.parsers.belgian_isil import BelgianISILParser
|
|
from linkml_runtime.dumpers import yaml_dumper
|
|
|
|
def main() -> None:
    """Parse the Belgian ISIL registry CSV and export it as LinkML YAML.

    Steps, in order:

    1. Parse ``data/isil/belgian_isil_detailed.csv`` with
       ``BelgianISILParser.parse_and_convert``.
    2. Print the number of records per institution type.
    3. Print how many records carry identifiers, a description and
       alternative names.
    4. Write every record to ``data/instances/belgium_isil_institutions.yaml``.
    5. Print the output file size and the first three records.

    Both paths are relative to the current working directory.

    The output is a multi-document YAML stream: each institution is its own
    document, introduced by ``---``.  Dumping the records back to back into a
    single document would repeat the same top-level keys for every
    institution, which is not valid YAML and makes loaders keep only the last
    record.  Read the file with a multi-document loader such as
    ``yaml.safe_load_all``.
    """
    # Kept function-local (as in the original) so this block does not depend
    # on a change to the file's import section.
    from collections import Counter

    # Input file
    csv_file = Path("data/isil/belgian_isil_detailed.csv")

    # Output file
    output_file = Path("data/instances/belgium_isil_institutions.yaml")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    print(banner)
    print("Belgian ISIL Registry → LinkML Instance Export")
    print(banner)

    # Parse CSV
    print(f"\n1. Parsing {csv_file}...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert(csv_file)
    total = len(custodians)
    print(f" ✓ Parsed {total} institutions")

    # Statistics
    type_counts = Counter(str(c.institution_type) for c in custodians)
    print("\n2. Institution types:")
    for inst_type, count in type_counts.most_common():
        print(f" - {inst_type}: {count}")

    # Quality check
    with_identifiers = sum(1 for c in custodians if c.identifiers)
    with_descriptions = sum(1 for c in custodians if c.description)
    with_alt_names = sum(1 for c in custodians if c.alternative_names)

    print("\n3. Data quality:")
    print(f" - With ISIL identifiers: {with_identifiers} / {total}")
    print(f" - With descriptions: {with_descriptions} / {total}")
    print(f" - With alternative names: {with_alt_names} / {total}")

    # Export to YAML
    print(f"\n4. Exporting to {output_file}...")

    # The "Generated" header line is taken from the first record; fall back
    # to 'unknown' when there are no records or the first has no provenance.
    first_provenance = custodians[0].provenance if custodians else None
    if first_provenance is not None:
        generated = first_provenance.extraction_date
    else:
        generated = "unknown"

    # The file is written by hand (not with a single dump call) so the
    # comment header and the per-record comments end up in the output.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write(f"# Generated: {generated}\n")
        f.write("#\n")
        f.write("# Distribution:\n")
        for inst_type, count in type_counts.most_common():
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")

        # Export each institution as its own YAML document.
        for idx, custodian in enumerate(custodians, 1):
            # A document marker per record keeps the top-level keys of
            # different institutions from colliding in one mapping.
            f.write("---\n")
            f.write(f"# Institution {idx}/{total}\n")
            f.write(yaml_dumper.dumps(custodian))
            f.write("\n")

            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    print(f" ✓ Exported {total} institutions to YAML")

    # File size
    file_size_kb = output_file.stat().st_size / 1024
    print("\n5. Output file:")
    print(f" - Path: {output_file}")
    print(f" - Size: {file_size_kb:.1f} KB")
    print(" - Format: LinkML YAML")

    print("\n" + banner)
    print("✓ Export complete!")
    print(banner)

    # Sample records
    print("\nSample records:\n")
    for i, c in enumerate(custodians[:3], 1):
        print(f"{i}. {c.name} ({c.id})")
        print(f" Type: {c.institution_type}")
        # Shows the first identifier on the record, whatever its scheme.
        print(f" ISIL: {c.identifiers[0].identifier_value if c.identifiers else 'N/A'}")
        if c.alternative_names:
            print(f" Alt names: {', '.join(c.alternative_names)}")
        print()
|
|
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|