#!/usr/bin/env python3
"""
Export Latin American GLAM institutions dataset to multiple formats.

Exports:
1. JSON-LD (Linked Data with @context)
2. CSV (flat table for spreadsheet analysis)
3. GeoJSON (for geographic visualization)
4. Simple statistics JSON

This script generates all export formats from the validated combined YAML file.
"""

import json
import csv
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import Counter

# File paths
BASE_DIR = Path(__file__).parent.parent
INSTANCES_DIR = BASE_DIR / "data" / "instances"
EXPORTS_DIR = INSTANCES_DIR / "exports"

INPUT_FILE = INSTANCES_DIR / "latin_american_institutions_osm_enriched.yaml"
JSONLD_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.jsonld"
CSV_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.csv"
GEOJSON_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.geojson"
STATS_FILE = EXPORTS_DIR / "latin_american_osm_enriched_statistics.json"

# JSON-LD context: maps the YAML field names onto schema.org / W3C vocab terms
# so the exported document is valid Linked Data.
CONTEXT = {
    "@context": {
        "@vocab": "https://w3id.org/heritage/custodian/",
        "id": "@id",
        "type": "@type",
        "schema": "http://schema.org/",
        "dct": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "prov": "http://www.w3.org/ns/prov#",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "HeritageCustodian": "HeritageCustodian",
        "name": "schema:name",
        "description": "schema:description",
        "institution_type": "institutionType",
        "locations": "location",
        "identifiers": "identifier",
        "digital_platforms": "digitalPlatform",
        "collections": "collection",
        "provenance": "prov:hadDerivation",
        "city": "schema:addressLocality",
        "region": "schema:addressRegion",
        "country": "schema:addressCountry",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "street_address": "schema:streetAddress",
        "postal_code": "schema:postalCode"
    }
}


def load_institutions() -> List[Dict[str, Any]]:
    """Load institutions from YAML file.

    Returns:
        The list of institution dicts parsed from INPUT_FILE.
    """
    print(f"Loading institutions from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f"  Loaded {len(data)} institutions\n")
    return data


def export_jsonld(institutions: List[Dict[str, Any]]) -> None:
    """Export to JSON-LD format with @context.

    Writes JSONLD_FILE: a single document with the shared @context and
    all institutions under @graph, each typed as HeritageCustodian.
    """
    print("=" * 70)
    print("Exporting to JSON-LD")
    print("=" * 70)
    print()

    # Add @type to a shallow copy of each institution (leave input untouched)
    institutions_with_context = []
    for inst in institutions:
        inst_copy = inst.copy()
        inst_copy["@type"] = "HeritageCustodian"
        institutions_with_context.append(inst_copy)

    # Create document with graph
    jsonld_doc = {
        "@context": CONTEXT["@context"],
        "@graph": institutions_with_context
    }

    # default=str stringifies non-JSON values (e.g. dates parsed by YAML)
    with open(JSONLD_FILE, 'w', encoding='utf-8') as f:
        json.dump(jsonld_doc, f, indent=2, ensure_ascii=False, default=str)

    file_size = JSONLD_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(institutions)} institutions to JSON-LD")
    print(f"  File: {JSONLD_FILE}")
    print(f"  Size: {file_size:.1f} KB\n")


def export_csv(institutions: List[Dict[str, Any]]) -> None:
    """Export to CSV format (flattened structure).

    Writes CSV_FILE with one row per institution. Only the first location
    is flattened; identifiers are joined with '; '; nested platform and
    collection lists are reduced to counts.
    """
    print("=" * 70)
    print("Exporting to CSV")
    print("=" * 70)
    print()

    # CSV headers
    fieldnames = [
        'id', 'name', 'institution_type', 'description',
        'country', 'region', 'city', 'street_address', 'postal_code',
        'latitude', 'longitude', 'geonames_id',
        'identifier_schemes', 'identifier_values',
        'platform_count', 'collection_count',
        'data_source', 'data_tier', 'extraction_date',
        'confidence_score', 'conversation_id'
    ]

    rows = []
    for inst in institutions:
        # Get primary location (first entry, or empty dict if none)
        loc = inst.get('locations', [{}])[0] if inst.get('locations') else {}

        # Get identifiers
        identifiers = inst.get('identifiers', [])
        id_schemes = '; '.join(i.get('identifier_scheme', '') for i in identifiers)
        id_values = '; '.join(str(i.get('identifier_value', '')) for i in identifiers)

        # Provenance
        prov = inst.get('provenance', {})

        row = {
            'id': inst.get('id', ''),
            'name': inst.get('name', ''),
            'institution_type': inst.get('institution_type', ''),
            'description': inst.get('description', ''),
            'country': loc.get('country', ''),
            'region': loc.get('region', ''),
            'city': loc.get('city', ''),
            'street_address': loc.get('street_address', ''),
            'postal_code': loc.get('postal_code', ''),
            'latitude': loc.get('latitude', ''),
            'longitude': loc.get('longitude', ''),
            'geonames_id': loc.get('geonames_id', ''),
            'identifier_schemes': id_schemes,
            'identifier_values': id_values,
            'platform_count': len(inst.get('digital_platforms', [])),
            'collection_count': len(inst.get('collections', [])),
            'data_source': prov.get('data_source', ''),
            'data_tier': prov.get('data_tier', ''),
            'extraction_date': prov.get('extraction_date', ''),
            'confidence_score': prov.get('confidence_score', ''),
            'conversation_id': prov.get('conversation_id', '')
        }
        rows.append(row)

    with open(CSV_FILE, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    file_size = CSV_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(rows)} institutions to CSV")
    print(f"  File: {CSV_FILE}")
    print(f"  Size: {file_size:.1f} KB")
    print(f"  Columns: {len(fieldnames)}\n")


def export_geojson(institutions: List[Dict[str, Any]]) -> None:
    """Export to GeoJSON format for mapping.

    Writes GEOJSON_FILE as a FeatureCollection of Point features.
    Institutions without any geocoded location are skipped.
    """
    print("=" * 70)
    print("Exporting to GeoJSON")
    print("=" * 70)
    print()

    features = []
    geocoded_count = 0

    for inst in institutions:
        # Get primary location with coordinates (first one having both)
        primary_loc = None
        for loc in inst.get('locations', []):
            if loc.get('latitude') is not None and loc.get('longitude') is not None:
                primary_loc = loc
                break

        if not primary_loc:
            continue  # Skip institutions without coordinates

        geocoded_count += 1

        # Create GeoJSON feature (GeoJSON order is [longitude, latitude])
        feature = {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [
                    float(primary_loc['longitude']),
                    float(primary_loc['latitude'])
                ]
            },
            "properties": {
                "id": inst.get('id', ''),
                "name": inst.get('name', ''),
                "institution_type": inst.get('institution_type', ''),
                "description": inst.get('description', ''),
                "city": primary_loc.get('city', ''),
                "region": primary_loc.get('region', ''),
                "country": primary_loc.get('country', ''),
                "street_address": primary_loc.get('street_address', ''),
                "data_tier": inst.get('provenance', {}).get('data_tier', ''),
                "confidence_score": inst.get('provenance', {}).get('confidence_score', '')
            }
        }
        features.append(feature)

    # Guard against an empty dataset so the rate never divides by zero
    total = len(institutions)
    rate = (len(features) / total * 100) if total else 0.0

    geojson = {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "title": "Latin American GLAM Institutions",
            "description": "Geocoded heritage institutions from Brazil, Chile, and Mexico",
            "total_features": len(features),
            "total_institutions": total,
            "geocoding_rate": f"{rate:.1f}%",
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        }
    }

    with open(GEOJSON_FILE, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, indent=2, ensure_ascii=False)

    file_size = GEOJSON_FILE.stat().st_size / 1024
    print(f"✓ Exported {geocoded_count} geocoded institutions to GeoJSON")
    print(f"  File: {GEOJSON_FILE}")
    print(f"  Size: {file_size:.1f} KB")
    print(f"  Skipped: {total - geocoded_count} without coordinates\n")


def export_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Export dataset statistics as JSON.

    Writes STATS_FILE with totals, per-country and per-type counts,
    and the top-10 cities by institution count.
    """
    print("=" * 70)
    print("Generating Statistics")
    print("=" * 70)
    print()

    # Count by country
    country_counts = Counter()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('country'):
                country_counts[loc['country']] += 1
                break  # Only count first location

    # Count by institution type
    type_counts = Counter(
        inst.get('institution_type', 'UNKNOWN') for inst in institutions
    )

    # Count geocoded (any location with both coordinates present)
    geocoded = sum(
        1 for inst in institutions
        if any(loc.get('latitude') is not None and loc.get('longitude') is not None
               for loc in inst.get('locations', []))
    )

    # Count with identifiers/platforms
    with_identifiers = sum(1 for inst in institutions if inst.get('identifiers'))
    with_platforms = sum(1 for inst in institutions if inst.get('digital_platforms'))
    with_collections = sum(1 for inst in institutions if inst.get('collections'))

    # Count unique cities and regions
    cities = set()
    regions = set()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('city'):
                cities.add(loc['city'])
            if loc.get('region'):
                regions.add(loc['region'])

    # Guard against an empty dataset so the rate never divides by zero
    total = len(institutions)
    rate = (geocoded / total * 100) if total else 0.0

    stats = {
        "metadata": {
            "title": "Latin American GLAM Institutions Statistics",
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        },
        "totals": {
            "total_institutions": total,
            "geocoded_institutions": geocoded,
            "geocoding_rate": f"{rate:.1f}%",
            "with_identifiers": with_identifiers,
            "with_digital_platforms": with_platforms,
            "with_collections": with_collections,
            "unique_cities": len(cities),
            "unique_regions": len(regions)
        },
        "by_country": dict(country_counts),
        "by_institution_type": dict(type_counts),
        "top_10_cities": dict(Counter(
            loc.get('city')
            for inst in institutions
            for loc in inst.get('locations', [])
            if loc.get('city')
        ).most_common(10))
    }

    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    file_size = STATS_FILE.stat().st_size / 1024
    print(f"✓ Exported statistics to JSON")
    print(f"  File: {STATS_FILE}")
    print(f"  Size: {file_size:.1f} KB\n")

    # Print summary
    print("Summary Statistics:")
    print(f"  Total institutions: {total}")
    print(f"  Geocoded: {geocoded} ({rate:.1f}%)")
    print(f"  Countries: {len(country_counts)}")
    print(f"  Unique cities: {len(cities)}")
    print(f"  Unique regions: {len(regions)}")
    print()


def main():
    """Main export function: load data and run all four exporters."""
    print("=" * 70)
    print("Latin American GLAM Dataset - Multi-Format Export")
    print("=" * 70)
    print()

    # Create exports directory (parents=True so a fresh checkout works
    # even when data/instances/ does not exist yet)
    EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Exports directory: {EXPORTS_DIR}\n")

    # Load data
    institutions = load_institutions()

    # Export to all formats
    export_jsonld(institutions)
    export_csv(institutions)
    export_geojson(institutions)
    export_statistics(institutions)

    # Summary
    print("=" * 70)
    print("Export Complete!")
    print("=" * 70)
    print()
    print("Generated files:")
    print(f"  1. {JSONLD_FILE.name} - Linked Data (JSON-LD)")
    print(f"  2. {CSV_FILE.name} - Spreadsheet format")
    print(f"  3. {GEOJSON_FILE.name} - Geographic visualization")
    print(f"  4. {STATS_FILE.name} - Summary statistics")
    print()
    print("Next steps:")
    print("  - Import CSV into spreadsheet software (Excel, Google Sheets)")
    print("  - Visualize GeoJSON on map platforms (QGIS, Mapbox, Leaflet)")
    print("  - Use JSON-LD for Linked Data integration")
    print()


if __name__ == '__main__':
    main()