101 lines
3.3 KiB
Python
Executable file
101 lines
3.3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Convert Canadian ISIL records to LinkML format.
|
|
|
|
Processes all Canadian library records and exports to YAML/JSON.
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add src to Python path
|
|
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
|
|
|
from glam_extractor.parsers.canadian_isil import CanadianISILParser
|
|
|
|
|
|
def main():
    """Convert Canadian ISIL records to LinkML format.

    Parses ``data/isil/canada/canadian_libraries_all.json`` with
    ``CanadianISILParser``, prints summary statistics (status, institution
    type, top five provinces), writes every record to
    ``data/instances/canada/canadian_heritage_custodians.json`` and, when
    PyYAML is importable, writes the first 100 records to
    ``canadian_heritage_custodians_sample.yaml`` in the same directory.

    Both paths are relative to the current working directory, so the script
    is expected to be run from the directory that contains ``data/``.
    """
    # Imported locally (like the optional ``yaml`` import below) so this
    # function does not depend on edits to the module-level import block.
    from collections import Counter

    # Input/output paths
    input_file = Path("data/isil/canada/canadian_libraries_all.json")
    output_dir = Path("data/instances/canada")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse all records up front; the list is traversed several times below.
    print(f"Parsing {input_file}...")
    parser = CanadianISILParser()
    custodians = list(parser.parse_file(input_file))
    print(f"\n✅ Successfully parsed {len(custodians)} institutions")

    # Statistics
    active = sum(1 for c in custodians if c.organization_status == "ACTIVE")
    inactive = sum(1 for c in custodians if c.organization_status == "INACTIVE")

    # Count by type; records without a type are grouped under "UNKNOWN".
    type_counts = Counter(
        str(c.institution_type) if c.institution_type else "UNKNOWN"
        for c in custodians
    )

    # Count by province, using only the first location of each record.
    # Records without any location are left out of this tally.
    province_counts = Counter(
        c.locations[0].region for c in custodians if c.locations
    )

    print("\n📊 Statistics:")
    print(f" Active institutions: {active}")
    print(f" Inactive/Closed: {inactive}")
    print("\n By Institution Type:")
    # most_common() orders by descending count and keeps first-seen order
    # for ties, the same ordering as a stable sort on the negated count.
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print("\n By Province (Top 5):")
    for province, count in province_counts.most_common(5):
        print(f" {province}: {count}")

    # Export to JSON
    json_output = output_dir / "canadian_heritage_custodians.json"
    print(f"\n💾 Exporting to {json_output}...")

    # Convert every record to its JSON object form once; the YAML sample
    # below reuses the first 100 entries instead of converting them again.
    json_data = [custodian._as_json_obj() for custodian in custodians]

    with open(json_output, 'w', encoding='utf-8') as f:
        # default=str is a catch-all: any value json cannot encode natively
        # is written as its str() form.
        json.dump(json_data, f, indent=2, ensure_ascii=False, default=str)

    print(f"✅ Exported {len(json_data)} records to JSON")

    # Export sample YAML (first 100 records)
    yaml_output = output_dir / "canadian_heritage_custodians_sample.yaml"
    print(f"\n💾 Exporting sample (100 records) to {yaml_output}...")

    # Track whether this run wrote the YAML file. Checking for the file on
    # disk would also report a stale sample left behind by an earlier run.
    yaml_written = False
    try:
        # PyYAML is optional; only the import is guarded so that real export
        # errors are not mistaken for a missing dependency.
        import yaml
    except ImportError:
        print("⚠️ PyYAML not installed, skipping YAML export")
    else:
        sample_data = json_data[:100]

        with open(yaml_output, 'w', encoding='utf-8') as f:
            yaml.dump(
                sample_data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )

        print("✅ Exported sample to YAML")
        yaml_written = True

    print("\n✨ Conversion complete!")
    print("\n📁 Output files:")
    print(f" {json_output}")
    if yaml_written:
        print(f" {yaml_output}")
|
|
|
|
|
|
# Run the conversion only when this file is executed as a script;
# importing it as a module must not trigger any parsing or file writes.
if __name__ == "__main__":
    main()
|