#!/usr/bin/env python3
"""
Export Libya Heritage Institutions to RDF/Turtle

Generates RDF serializations of the 52 Libyan heritage institutions
with 75% Wikidata coverage (39/52 institutions).

Outputs:
- data/exports/libya_institutions.ttl (Turtle format)
- data/exports/libya_institutions.rdf (RDF/XML format)
- data/exports/libya_institutions.jsonld (JSON-LD format)
- data/exports/libya_statistics.yaml (coverage statistics)
"""

import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.models import HeritageCustodian
from glam_extractor.exporters.rdf_exporter import RDFExporter


def load_libya_dataset(filepath: Path) -> list[HeritageCustodian]:
    """Load Libya institutions from YAML and convert to Pydantic models.

    Records that fail model validation are reported and skipped rather
    than aborting the whole export.

    Args:
        filepath: Path to the YAML file containing a list of institution
            records.

    Returns:
        The successfully validated ``HeritageCustodian`` instances.
    """
    print(f"Loading Libya dataset from: {filepath.name}")
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f" Loaded {len(data)} institutions")

    # Convert to Pydantic models, skipping (but reporting) invalid records.
    custodians: list[HeritageCustodian] = []
    for inst_data in data:
        try:
            custodians.append(HeritageCustodian(**inst_data))
        except Exception as e:
            print(f" āš ļø Skipping invalid record: {inst_data.get('name', 'UNKNOWN')}: {e}")

    print(f" āœ… Validated {len(custodians)} institutions")
    return custodians


def calculate_statistics(custodians: list[HeritageCustodian]) -> dict:
    """Calculate coverage statistics for Libya dataset.

    Args:
        custodians: Validated institution models.

    Returns:
        A dict with totals, Wikidata coverage, institution-type counts
        (descending), and per-city counts (descending). Safe for an
        empty input (coverage percentage reported as 0.0 instead of
        raising ZeroDivisionError).
    """
    total = len(custodians)
    stats = {
        'total_institutions': total,
        'country': 'Libya (LY)',
        'generated': datetime.now(timezone.utc).isoformat(),
    }

    # Wikidata coverage — an institution counts if any of its identifiers
    # uses the 'Wikidata' scheme. (`ident`, not `id`: don't shadow the builtin.)
    with_wikidata = sum(
        1 for c in custodians
        if any(ident.identifier_scheme == 'Wikidata' for ident in c.identifiers)
    )
    stats['wikidata_coverage'] = {
        'count': with_wikidata,
        # Guard the division: an empty dataset yields 0.0% rather than crashing.
        'percentage': round(with_wikidata / total * 100, 1) if total else 0.0,
    }

    # Institution types, most common first.
    type_counts = Counter(c.institution_type for c in custodians)
    stats['institution_types'] = dict(type_counts.most_common())

    # Geographic distribution by each institution's first-listed location.
    city_counts = Counter(c.locations[0].city for c in custodians if c.locations)
    stats['cities'] = dict(city_counts.most_common())

    return stats


def main() -> None:
    """Main export workflow: load, summarize, and serialize the dataset."""
    print("\n" + "=" * 80)
    print("Libya Heritage Institutions - RDF Export")
    print("=" * 80 + "\n")

    # Resolve paths relative to this script (same convention as the
    # sys.path insert above) instead of a hard-coded user-specific
    # absolute path, so the export works on any checkout.
    base_dir = Path(__file__).resolve().parent.parent
    input_file = base_dir / 'data/instances/libya/libyan_institutions.yaml'
    output_dir = base_dir / 'data/exports'
    # parents=True: create intermediate directories too if missing.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load dataset
    custodians = load_libya_dataset(input_file)

    # Calculate statistics
    print("\nšŸ“Š Calculating statistics...")
    stats = calculate_statistics(custodians)
    print(f" Total institutions: {stats['total_institutions']}")
    print(
        f" Wikidata coverage: {stats['wikidata_coverage']['count']}/"
        f"{stats['total_institutions']} ({stats['wikidata_coverage']['percentage']}%)"
    )
    print(f" Institution types: {len(stats['institution_types'])}")
    print(f" Cities covered: {len(stats['cities'])}")

    # Export to RDF formats — one (label, filename, rdflib format) per target.
    print("\nšŸ“¤ Exporting to RDF formats...")
    export_targets = [
        ('Turtle', 'libya_institutions.ttl', 'turtle'),
        ('RDF/XML', 'libya_institutions.rdf', 'xml'),
        ('JSON-LD', 'libya_institutions.jsonld', 'json-ld'),
    ]
    for label, filename, rdf_format in export_targets:
        # Fresh exporter per format so each serialization starts from a clean graph.
        exporter = RDFExporter()
        out_file = output_dir / filename
        print(f" Exporting {label}: {out_file.name}")
        exporter.export_to_file(custodians, str(out_file), format=rdf_format)
        print(f" āœ… Saved ({out_file.stat().st_size:,} bytes)")

    # Save statistics
    stats_file = output_dir / 'libya_statistics.yaml'
    print(f"\nšŸ“Š Saving statistics: {stats_file.name}")
    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats, f, allow_unicode=True, sort_keys=False)
    print(" āœ… Saved")

    print("\n" + "=" * 80)
    print("āœ… EXPORT COMPLETE!")
    print("=" * 80)
    print(f"\nšŸ“ Output files in: {output_dir}/")
    print(" - libya_institutions.ttl (Turtle)")
    print(" - libya_institutions.rdf (RDF/XML)")
    print(" - libya_institutions.jsonld (JSON-LD)")
    print(" - libya_statistics.yaml (Statistics)")
    print("\nšŸŽÆ Libya dataset ready for semantic web integration!")
    # Report the *computed* figures instead of hard-coded "52 / 39 / 75.0%"
    # so the summary can never drift from the actual dataset.
    print(
        f" {stats['total_institutions']} institutions, "
        f"{stats['wikidata_coverage']['count']} with Wikidata "
        f"({stats['wikidata_coverage']['percentage']}% coverage)"
    )


if __name__ == '__main__':
    main()