glam/scripts/export_belgian_rdf.py
2025-11-19 23:25:22 +01:00

135 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Export Belgian ISIL institutions to RDF/Turtle format.
This script:
1. Loads Belgian institutions from Wikidata-enriched YAML
2. Converts to RDF using the LinkML RDF exporter
3. Exports to Turtle (.ttl) format
4. Generates statistics on exported triples
"""
import sys
from pathlib import Path
import yaml
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import HeritageCustodian
from linkml_runtime.loaders import yaml_loader
def load_belgian_institutions(yaml_file: Path) -> list[HeritageCustodian]:
    """Load Belgian ISIL institutions from a Wikidata-enriched YAML file.

    The file starts with comment headers followed by a ``---`` line;
    after that, records are concatenated YAML mappings that each begin
    with an ``id: BE-`` key rather than standard ``---`` document
    separators, so they are split apart with a regex before parsing.

    Args:
        yaml_file: Path to the enriched YAML file.

    Returns:
        The successfully parsed ``HeritageCustodian`` objects. Records
        that fail to parse are skipped with a printed warning instead of
        aborting the whole load (best-effort semantics).
    """
    with open(yaml_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Skip header comments up to (and including) the first '---' line.
    # Default to -1 so a file with no '---' keeps its first line intact;
    # the previous default of 0 silently dropped line 0 in that case.
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), -1)
    yaml_content = '\n'.join(lines[start_idx + 1:])

    # Split into individual YAML documents on each new 'id: BE-' key.
    records_text = re.split(r'\n(?=id: BE-)', yaml_content)
    records_text = [r.strip() for r in records_text if r.strip()]

    # Parse each record as a HeritageCustodian.
    institutions = []
    for record_text in records_text:
        try:
            inst_dict = yaml.safe_load(record_text)
            if inst_dict:
                # Convert dict to HeritageCustodian object
                institutions.append(HeritageCustodian(**inst_dict))
        except Exception as e:
            # Best-effort: report and continue so one bad record does not
            # sink the whole export.
            print(f"Warning: Skipping record due to error: {e}")
            continue
    return institutions
def main():
    """Main RDF export workflow: load YAML, convert to RDF, serialize, report.

    Reads the Wikidata-enriched Belgian institutions YAML, builds an RDF
    graph via ``RDFExporter``, writes it as Turtle, and prints summary
    statistics. Returns early (with a message) if the input file is
    missing or no records could be parsed.
    """
    print("=" * 70)
    print("Belgian Institutions RDF/Turtle Export")
    print("=" * 70)

    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    output_file = Path("data/rdf/belgium_isil_institutions.ttl")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print(" Run scripts/enrich_belgian_wikidata.py first")
        return

    # Load institutions
    print(f"\n1. Loading institutions from {input_file}...")
    institutions = load_belgian_institutions(input_file)
    print(f" ✓ Loaded {len(institutions)} institutions")
    if not institutions:
        # Guard: the statistics section below divides by len(institutions);
        # an empty load would otherwise raise ZeroDivisionError.
        print("\n❌ No institutions could be parsed; nothing to export")
        return

    # Initialize RDF exporter
    print(f"\n2. Initializing RDF exporter...")
    exporter = RDFExporter()

    # Export to RDF
    print(f"\n3. Converting to RDF/Turtle...")
    print(f" Processing institutions...")
    # Add all institutions to the RDF graph
    for inst in institutions:
        exporter.add_custodian(inst)

    # Count triples
    triple_count = len(exporter.graph)
    print(f" ✓ Generated {triple_count:,} RDF triples")

    # Serialize to Turtle
    print(f"\n4. Serializing to Turtle format...")
    exporter.graph.serialize(destination=output_file, format='turtle')
    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Read back for sample display
    with open(output_file, 'r', encoding='utf-8') as f:
        ttl_content = f.read()

    # Statistics
    print("\n" + "=" * 70)
    print("RDF Export Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"RDF triples generated: {triple_count:,}")
    print(f"Average triples per inst: {triple_count/len(institutions):.1f}")

    # Count institutions with key properties
    with_wikidata = sum(1 for inst in institutions if any(
        i.identifier_scheme == "Wikidata" for i in inst.identifiers or []
    ))
    with_locations = sum(1 for inst in institutions if inst.locations)
    # Compare against None explicitly: a latitude of exactly 0.0 (equator)
    # is falsy but is still a valid coordinate.
    with_coords = sum(1 for inst in institutions if inst.locations and any(
        loc.latitude is not None for loc in inst.locations
    ))
    print(f"With Wikidata Q-numbers: {with_wikidata} ({with_wikidata/len(institutions)*100:.1f}%)")
    print(f"With locations: {with_locations} ({with_locations/len(institutions)*100:.1f}%)")
    print(f"With coordinates: {with_coords} ({with_coords/len(institutions)*100:.1f}%)")

    # Sample triples
    print(f"\n5. Sample RDF triples (first 10 lines):")
    lines = ttl_content.split('\n')
    for line in lines[:10]:
        if line.strip():
            print(f" {line}")
    print("\n✓ RDF export complete!")
# Script entry point: run the export only when invoked directly,
# not when this module is imported.
if __name__ == "__main__":
    main()