glam/convert_canadian_to_linkml.py
2025-11-19 23:25:22 +01:00

101 lines
3.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Convert Canadian ISIL records to LinkML format.
Processes all Canadian library records and exports them to JSON (every record) and YAML (a sample of the first 100 records).
"""
import json
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
# Add src to Python path
sys.path.insert(0, str(Path(__file__).parent / "src"))
from glam_extractor.parsers.canadian_isil import CanadianISILParser
def main() -> None:
    """Convert Canadian ISIL records to LinkML format.

    Parses ``data/isil/canada/canadian_libraries_all.json`` with
    ``CanadianISILParser``, prints summary statistics (active/inactive
    counts, counts per institution type, the five most frequent regions),
    then writes every record to
    ``data/instances/canada/canadian_heritage_custodians.json``. When PyYAML
    can be imported, the first 100 records are additionally written to
    ``canadian_heritage_custodians_sample.yaml`` in the same directory.

    Both data paths are relative, so they resolve against the current
    working directory.
    """
    # Number of leading records written to the YAML sample file.
    sample_size = 100

    # Input/output paths
    input_file = Path("data/isil/canada/canadian_libraries_all.json")
    output_dir = Path("data/instances/canada")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse all records up front; they are iterated several times below.
    print(f"Parsing {input_file}...")
    parser = CanadianISILParser()
    custodians = list(parser.parse_file(input_file))
    print(f"\n✅ Successfully parsed {len(custodians)} institutions")

    # Statistics. Status values are compared with == (not used as dict keys)
    # so this works whatever type organization_status is.
    active = sum(1 for c in custodians if c.organization_status == "ACTIVE")
    inactive = sum(1 for c in custodians if c.organization_status == "INACTIVE")

    # Count by type; records without a type are grouped under "UNKNOWN".
    type_counts = Counter(
        str(c.institution_type) if c.institution_type else "UNKNOWN"
        for c in custodians
    )

    # Count by province, using the region of the first listed location.
    # Records with no locations are left out of this tally.
    province_counts = Counter(
        c.locations[0].region for c in custodians if c.locations
    )

    print("\n📊 Statistics:")
    print(f" Active institutions: {active}")
    print(f" Inactive/Closed: {inactive}")
    print("\n By Institution Type:")
    # most_common() orders by descending count and keeps first-seen order
    # for ties, matching a stable sort on the negated count.
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print("\n By Province (Top 5):")
    for province, count in province_counts.most_common(5):
        print(f" {province}: {count}")

    # Export to JSON
    json_output = output_dir / "canadian_heritage_custodians.json"
    print(f"\n💾 Exporting to {json_output}...")
    # Convert once; the YAML sample below reuses the same converted records.
    records = [custodian._as_json_obj() for custodian in custodians]
    with open(json_output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"✅ Exported {len(records)} records to JSON")

    # Export sample YAML (first `sample_size` records)
    yaml_output = output_dir / "canadian_heritage_custodians_sample.yaml"
    print(f"\n💾 Exporting sample ({sample_size} records) to {yaml_output}...")
    # Track whether *this run* wrote the file, so a stale sample left over
    # from an earlier run is not reported as output.
    yaml_written = False
    try:
        # PyYAML is optional; only the import itself is guarded so that an
        # ImportError from anywhere else is not mistaken for a missing PyYAML.
        import yaml
    except ImportError:
        print("⚠️ PyYAML not installed, skipping YAML export")
    else:
        with open(yaml_output, "w", encoding="utf-8") as f:
            yaml.dump(
                records[:sample_size],
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )
        print("✅ Exported sample to YAML")
        yaml_written = True

    print("\n✨ Conversion complete!")
    print("\n📁 Output files:")
    print(f" {json_output}")
    if yaml_written:
        print(f" {yaml_output}")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()