#!/usr/bin/env python3
"""
Export Belgian ISIL institutions to RDF/Turtle format.

This script:
1. Loads Belgian institutions from Wikidata-enriched YAML
2. Converts to RDF using the LinkML RDF exporter
3. Exports to Turtle (.ttl) format
4. Generates statistics on exported triples
"""

import sys
from pathlib import Path
import yaml
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import HeritageCustodian
from linkml_runtime.loaders import yaml_loader


def load_belgian_institutions(yaml_file: Path) -> list[HeritageCustodian]:
    """Load Belgian institutions from YAML file."""
    with open(yaml_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Skip header comments and the first '---' document separator
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), 0)
    yaml_content = '\n'.join(lines[start_idx + 1:])

    # Split into individual records; each record begins with an 'id: BE-...' line
    records_text = re.split(r'\n(?=id: BE-)', yaml_content)
    records_text = [r.strip() for r in records_text if r.strip()]

    # Parse each record as a HeritageCustodian
    institutions = []
    for record_text in records_text:
        try:
            inst_dict = yaml.safe_load(record_text)
            if inst_dict:
                # Convert dict to HeritageCustodian object
                inst = HeritageCustodian(**inst_dict)
                institutions.append(inst)
        except Exception as e:
            print(f"Warning: Skipping record due to error: {e}")
            continue

    return institutions


def main():
    """Main RDF export workflow."""
    print("=" * 70)
    print("Belgian Institutions RDF/Turtle Export")
    print("=" * 70)

    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    output_file = Path("data/rdf/belgium_isil_institutions.ttl")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print("   Run scripts/enrich_belgian_wikidata.py first")
        return

    # Load institutions
    print(f"\n1. Loading institutions from {input_file}...")
    institutions = load_belgian_institutions(input_file)
    print(f"   ✓ Loaded {len(institutions)} institutions")

    if not institutions:
        print("   ❌ No institutions could be parsed; aborting export")
        return

    # Initialize RDF exporter
    print("\n2. Initializing RDF exporter...")
    exporter = RDFExporter()

    # Export to RDF
    print("\n3. Converting to RDF/Turtle...")
    print("   Processing institutions...")

    # Add all institutions to the RDF graph
    for inst in institutions:
        exporter.add_custodian(inst)

    # Count triples
    triple_count = len(exporter.graph)
    print(f"   ✓ Generated {triple_count:,} RDF triples")

    # Serialize to Turtle
    print("\n4. Serializing to Turtle format...")
    exporter.graph.serialize(destination=output_file, format='turtle')

    file_size_kb = output_file.stat().st_size / 1024
    print(f"   ✓ Exported to: {output_file}")
    print(f"   ✓ File size: {file_size_kb:.1f} KB")

    # Read the file back for a sample display
    with open(output_file, 'r', encoding='utf-8') as f:
        ttl_content = f.read()

    # Statistics
    print("\n" + "=" * 70)
    print("RDF Export Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"RDF triples generated: {triple_count:,}")
    print(f"Average triples per inst: {triple_count / len(institutions):.1f}")

    # Count institutions with key properties
    with_wikidata = sum(1 for inst in institutions if any(
        i.identifier_scheme == "Wikidata" for i in inst.identifiers or []
    ))
    with_locations = sum(1 for inst in institutions if inst.locations)
    with_coords = sum(1 for inst in institutions if inst.locations and any(
        loc.latitude is not None for loc in inst.locations
    ))

    print(f"With Wikidata Q-numbers: {with_wikidata} ({with_wikidata / len(institutions) * 100:.1f}%)")
    print(f"With locations: {with_locations} ({with_locations / len(institutions) * 100:.1f}%)")
    print(f"With coordinates: {with_coords} ({with_coords / len(institutions) * 100:.1f}%)")

    # Sample triples
    print("\n5. Sample RDF triples (first 10 lines):")
    lines = ttl_content.split('\n')
    for line in lines[:10]:
        if line.strip():
            print(f"   {line}")

    print("\n✓ RDF export complete!")


if __name__ == "__main__":
    main()
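

# ------------------------------------------------------------------------------
# Usage note (illustrative sketch, not executed by this script):
#
# Assuming this file lives in scripts/ alongside enrich_belgian_wikidata.py
# (the exact filename below is an assumption), a typical run looks like:
#
#   python scripts/export_belgian_rdf.py
#
# The resulting Turtle file can be sanity-checked with rdflib, e.g.:
#
#   from rdflib import Graph
#   g = Graph()
#   g.parse("data/rdf/belgium_isil_institutions.ttl", format="turtle")
#   print(len(g))  # should equal the triple count reported by this script
# ------------------------------------------------------------------------------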