135 lines
4.6 KiB
Python
135 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Export Belgian ISIL institutions to RDF/Turtle format.
|
|
|
|
This script:
|
|
1. Loads Belgian institutions from Wikidata-enriched YAML
|
|
2. Converts to RDF using the LinkML RDF exporter
|
|
3. Exports to Turtle (.ttl) format
|
|
4. Generates statistics on exported triples
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
import yaml
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.exporters.rdf_exporter import RDFExporter
|
|
from glam_extractor.models import HeritageCustodian
|
|
from linkml_runtime.loaders import yaml_loader
|
|
|
|
def load_belgian_institutions(yaml_file: Path) -> list[HeritageCustodian]:
    """Load Belgian institutions from YAML file.

    The file is treated as an optional comment header terminated by a line
    containing only ``---``, followed by a flat sequence of records that each
    begin with an ``id: BE-...`` line at column 0. The text is split on those
    ``id:`` lines and every chunk is parsed on its own with
    ``yaml.safe_load``.

    Args:
        yaml_file: Path of the YAML file to read (opened as UTF-8).

    Returns:
        The ``HeritageCustodian`` objects that could be built, in file order.
        Records that fail to parse or to construct are reported on stdout and
        skipped (best-effort load), so the list may be shorter than the
        number of records in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(yaml_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # Skip header comments and the first '---' marker. The fallback is -1
    # (not 0) so that a file without any marker is kept whole; a fallback of
    # 0 would silently drop the file's first line, which may be the first
    # record's 'id:' line.
    marker_idx = next(
        (i for i, line in enumerate(lines) if line.strip() == '---'),
        -1,
    )
    yaml_content = '\n'.join(lines[marker_idx + 1:])

    # Split into individual records: the lookahead keeps each 'id: BE-' line
    # attached to the record it starts.
    chunks = re.split(r'\n(?=id: BE-)', yaml_content)
    records_text = [chunk.strip() for chunk in chunks if chunk.strip()]

    # Parse each record as HeritageCustodian
    institutions = []
    for record_text in records_text:
        try:
            inst_dict = yaml.safe_load(record_text)
            if inst_dict:
                # Convert dict to HeritageCustodian object
                institutions.append(HeritageCustodian(**inst_dict))
        except Exception as e:
            # Deliberate best-effort: one malformed record (YAML syntax error,
            # unexpected keys, failed validation) must not abort the export.
            print(f"Warning: Skipping record due to error: {e}")
            continue

    return institutions
|
|
|
|
|
|
def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*, or 0.0 when *total* is 0."""
    return count / total * 100 if total else 0.0


def main():
    """Main RDF export workflow.

    Loads the Wikidata-enriched Belgian institutions, adds each one to an
    ``RDFExporter`` graph, serializes the graph to Turtle, and prints summary
    statistics plus the first lines of the generated file.

    Input and output paths are relative to the current working directory.
    If the input file is missing, a hint is printed and the function returns
    without exporting anything. An empty institution list is reported with
    zeroed averages and percentages instead of raising ZeroDivisionError.
    """
    banner = "=" * 70
    print(banner)
    print("Belgian Institutions RDF/Turtle Export")
    print(banner)

    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    output_file = Path("data/rdf/belgium_isil_institutions.ttl")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print(" Run scripts/enrich_belgian_wikidata.py first")
        return

    # Load institutions
    print(f"\n1. Loading institutions from {input_file}...")
    institutions = load_belgian_institutions(input_file)
    total = len(institutions)
    print(f" ✓ Loaded {total} institutions")

    # Initialize RDF exporter
    print("\n2. Initializing RDF exporter...")
    exporter = RDFExporter()

    # Export to RDF
    print("\n3. Converting to RDF/Turtle...")
    print(" Processing institutions...")

    # Add all institutions to the RDF graph
    for inst in institutions:
        exporter.add_custodian(inst)

    # Count triples (len() of the exporter's graph)
    triple_count = len(exporter.graph)
    print(f" ✓ Generated {triple_count:,} RDF triples")

    # Serialize to Turtle
    print("\n4. Serializing to Turtle format...")
    exporter.graph.serialize(destination=output_file, format='turtle')

    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Statistics
    print("\n" + banner)
    print("RDF Export Summary")
    print(banner)
    print(f"Total institutions: {total}")
    print(f"RDF triples generated: {triple_count:,}")
    # Guard the division: the list is empty when the input has no records or
    # every record was skipped during loading.
    avg_triples = triple_count / total if total else 0.0
    print(f"Average triples per inst: {avg_triples:.1f}")

    # Count institutions with key properties
    with_wikidata = sum(1 for inst in institutions if any(
        i.identifier_scheme == "Wikidata" for i in inst.identifiers or []
    ))
    with_locations = sum(1 for inst in institutions if inst.locations)
    # NOTE(review): the truthiness test treats a latitude of 0 as missing;
    # presumably harmless for Belgian coordinates - confirm before reuse.
    with_coords = sum(1 for inst in institutions if inst.locations and any(
        loc.latitude for loc in inst.locations
    ))

    print(f"With Wikidata Q-numbers: {with_wikidata} ({_percent(with_wikidata, total):.1f}%)")
    print(f"With locations: {with_locations} ({_percent(with_locations, total):.1f}%)")
    print(f"With coordinates: {with_coords} ({_percent(with_coords, total):.1f}%)")

    # Sample triples: read back only the head of the file instead of loading
    # the whole serialization into memory.
    print("\n5. Sample RDF triples (first 10 lines):")
    with open(output_file, 'r', encoding='utf-8') as ttl_file:
        for _, raw_line in zip(range(10), ttl_file):
            line = raw_line.rstrip('\n')
            if line.strip():
                print(f" {line}")

    print("\n✓ RDF export complete!")
|
|
|
|
|
|
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|