glam/scripts/export_latin_american_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

364 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Export Latin American GLAM institutions dataset to multiple formats.
Exports:
1. JSON-LD (Linked Data with @context)
2. CSV (flat table for spreadsheet analysis)
3. GeoJSON (for geographic visualization)
4. Simple statistics JSON
This script generates all export formats from the validated combined YAML file.
"""
import json
import csv
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import Counter
# File paths: all exports are derived from one validated, OSM-enriched YAML file
# and written next to it under data/instances/exports/.
BASE_DIR = Path(__file__).parent.parent
INSTANCES_DIR = BASE_DIR / "data" / "instances"
EXPORTS_DIR = INSTANCES_DIR / "exports"
INPUT_FILE = INSTANCES_DIR / "latin_american_institutions_osm_enriched.yaml"
JSONLD_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.jsonld"
CSV_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.csv"
GEOJSON_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.geojson"
STATS_FILE = EXPORTS_DIR / "latin_american_osm_enriched_statistics.json"

# JSON-LD context: maps the YAML field names used in the instance records to
# IRIs from schema.org, Dublin Core, FOAF, PROV-O, and WGS84 geo vocabularies.
# The default vocabulary is the project's own heritage/custodian namespace.
CONTEXT = {
    "@context": {
        "@vocab": "https://w3id.org/heritage/custodian/",
        "id": "@id",
        "type": "@type",
        "schema": "http://schema.org/",
        "dct": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "prov": "http://www.w3.org/ns/prov#",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        # Term maps to itself so it resolves under @vocab.
        "HeritageCustodian": "HeritageCustodian",
        "name": "schema:name",
        "description": "schema:description",
        "institution_type": "institutionType",
        "locations": "location",
        "identifiers": "identifier",
        "digital_platforms": "digitalPlatform",
        "collections": "collection",
        "provenance": "prov:hadDerivation",
        "city": "schema:addressLocality",
        "region": "schema:addressRegion",
        "country": "schema:addressCountry",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "street_address": "schema:streetAddress",
        "postal_code": "schema:postalCode"
    }
}
def load_institutions() -> List[Dict[str, Any]]:
    """Read and return the list of institution records from the input YAML file."""
    print(f"Loading institutions from: {INPUT_FILE}")
    with INPUT_FILE.open(encoding='utf-8') as handle:
        records = yaml.safe_load(handle)
    print(f" Loaded {len(records)} institutions\n")
    return records
def export_jsonld(institutions: List[Dict[str, Any]]) -> None:
    """Write all institutions as a JSON-LD document with a shared @context."""
    print("=" * 70)
    print("Exporting to JSON-LD")
    print("=" * 70)
    print()
    # Tag every record with its RDF type without mutating the caller's dicts.
    graph = [{**record, "@type": "HeritageCustodian"} for record in institutions]
    # Single document: shared @context plus a named graph of all records.
    jsonld_doc = {
        "@context": CONTEXT["@context"],
        "@graph": graph
    }
    with open(JSONLD_FILE, 'w', encoding='utf-8') as f:
        # default=str stringifies non-JSON values (e.g. dates) from the YAML.
        json.dump(jsonld_doc, f, indent=2, ensure_ascii=False, default=str)
    file_size = JSONLD_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(institutions)} institutions to JSON-LD")
    print(f" File: {JSONLD_FILE}")
    print(f" Size: {file_size:.1f} KB\n")
def export_csv(institutions: List[Dict[str, Any]]) -> None:
    """Flatten each institution to one CSV row (primary location only)."""
    print("=" * 70)
    print("Exporting to CSV")
    print("=" * 70)
    print()
    # Flat column layout for spreadsheet use.
    fieldnames = [
        'id', 'name', 'institution_type', 'description',
        'country', 'region', 'city', 'street_address', 'postal_code',
        'latitude', 'longitude', 'geonames_id',
        'identifier_schemes', 'identifier_values',
        'platform_count', 'collection_count',
        'data_source', 'data_tier', 'extraction_date',
        'confidence_score', 'conversation_id'
    ]
    rows = []
    for record in institutions:
        # Only the first location is exported; records may carry several.
        locations = record.get('locations') or []
        primary = locations[0] if locations else {}
        # Multi-valued identifiers are collapsed into ';'-joined columns.
        id_entries = record.get('identifiers', [])
        schemes = '; '.join(entry.get('identifier_scheme', '') for entry in id_entries)
        values = '; '.join(str(entry.get('identifier_value', '')) for entry in id_entries)
        prov = record.get('provenance', {})
        rows.append({
            'id': record.get('id', ''),
            'name': record.get('name', ''),
            'institution_type': record.get('institution_type', ''),
            'description': record.get('description', ''),
            'country': primary.get('country', ''),
            'region': primary.get('region', ''),
            'city': primary.get('city', ''),
            'street_address': primary.get('street_address', ''),
            'postal_code': primary.get('postal_code', ''),
            'latitude': primary.get('latitude', ''),
            'longitude': primary.get('longitude', ''),
            'geonames_id': primary.get('geonames_id', ''),
            'identifier_schemes': schemes,
            'identifier_values': values,
            'platform_count': len(record.get('digital_platforms', [])),
            'collection_count': len(record.get('collections', [])),
            'data_source': prov.get('data_source', ''),
            'data_tier': prov.get('data_tier', ''),
            'extraction_date': prov.get('extraction_date', ''),
            'confidence_score': prov.get('confidence_score', ''),
            'conversation_id': prov.get('conversation_id', '')
        })
    with open(CSV_FILE, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    file_size = CSV_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(rows)} institutions to CSV")
    print(f" File: {CSV_FILE}")
    print(f" Size: {file_size:.1f} KB")
    print(f" Columns: {len(fieldnames)}\n")
def export_geojson(institutions: List[Dict[str, Any]]) -> None:
    """Export geocoded institutions as a GeoJSON FeatureCollection.

    Institutions with no coordinate-bearing location are skipped (counted
    in the printed summary). The collection's metadata records the overall
    geocoding rate and generation timestamp.
    """
    print("=" * 70)
    print("Exporting to GeoJSON")
    print("=" * 70)
    print()
    features = []
    geocoded_count = 0
    for inst in institutions:
        # Primary location = first one carrying both coordinates.
        primary_loc = next(
            (loc for loc in inst.get('locations', [])
             if loc.get('latitude') is not None and loc.get('longitude') is not None),
            None
        )
        if not primary_loc:
            continue  # Skip institutions without coordinates
        geocoded_count += 1
        feature = {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                # GeoJSON (RFC 7946) orders positions [longitude, latitude].
                "coordinates": [
                    float(primary_loc['longitude']),
                    float(primary_loc['latitude'])
                ]
            },
            "properties": {
                "id": inst.get('id', ''),
                "name": inst.get('name', ''),
                "institution_type": inst.get('institution_type', ''),
                "description": inst.get('description', ''),
                "city": primary_loc.get('city', ''),
                "region": primary_loc.get('region', ''),
                "country": primary_loc.get('country', ''),
                "street_address": primary_loc.get('street_address', ''),
                "data_tier": inst.get('provenance', {}).get('data_tier', ''),
                "confidence_score": inst.get('provenance', {}).get('confidence_score', '')
            }
        }
        features.append(feature)
    total = len(institutions)
    # Guard against an empty dataset: avoids ZeroDivisionError in the rate.
    geocoding_rate = f"{len(features) / total * 100:.1f}%" if total else "0.0%"
    geojson = {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "title": "Latin American GLAM Institutions",
            "description": "Geocoded heritage institutions from Brazil, Chile, and Mexico",
            "total_features": len(features),
            "total_institutions": total,
            "geocoding_rate": geocoding_rate,
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        }
    }
    with open(GEOJSON_FILE, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, indent=2, ensure_ascii=False)
    file_size = GEOJSON_FILE.stat().st_size / 1024
    print(f"✓ Exported {geocoded_count} geocoded institutions to GeoJSON")
    print(f" File: {GEOJSON_FILE}")
    print(f" Size: {file_size:.1f} KB")
    print(f" Skipped: {len(institutions) - geocoded_count} without coordinates\n")
def export_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Export dataset summary statistics as JSON and print a console summary.

    Counts institutions by country (primary location only) and by type,
    plus geocoding coverage, identifier/platform/collection presence, and
    unique city/region counts across ALL locations.
    """
    print("=" * 70)
    print("Generating Statistics")
    print("=" * 70)
    print()
    # Count by country (only the first located country per institution,
    # so each institution contributes at most one country).
    country_counts = Counter()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('country'):
                country_counts[loc['country']] += 1
                break  # Only count first location
    # Count by institution type
    type_counts = Counter(inst.get('institution_type', 'UNKNOWN') for inst in institutions)
    # Count geocoded (any location with both coordinates)
    geocoded = sum(1 for inst in institutions
                   if any(loc.get('latitude') is not None and loc.get('longitude') is not None
                          for loc in inst.get('locations', [])))
    # Count with identifiers/platforms/collections
    with_identifiers = sum(1 for inst in institutions if inst.get('identifiers'))
    with_platforms = sum(1 for inst in institutions if inst.get('digital_platforms'))
    with_collections = sum(1 for inst in institutions if inst.get('collections'))
    # Count unique cities and regions across every location
    cities = set()
    regions = set()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('city'):
                cities.add(loc['city'])
            if loc.get('region'):
                regions.add(loc['region'])
    total = len(institutions)
    # Guard against an empty dataset: avoids ZeroDivisionError in the rate.
    geocoded_pct = geocoded / total * 100 if total else 0.0
    stats = {
        "metadata": {
            "title": "Latin American GLAM Institutions Statistics",
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        },
        "totals": {
            "total_institutions": total,
            "geocoded_institutions": geocoded,
            "geocoding_rate": f"{geocoded_pct:.1f}%",
            "with_identifiers": with_identifiers,
            "with_digital_platforms": with_platforms,
            "with_collections": with_collections,
            "unique_cities": len(cities),
            "unique_regions": len(regions)
        },
        "by_country": dict(country_counts),
        "by_institution_type": dict(type_counts),
        "top_10_cities": dict(Counter(
            loc.get('city') for inst in institutions
            for loc in inst.get('locations', [])
            if loc.get('city')
        ).most_common(10))
    }
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    file_size = STATS_FILE.stat().st_size / 1024
    print("✓ Exported statistics to JSON")
    print(f" File: {STATS_FILE}")
    print(f" Size: {file_size:.1f} KB\n")
    # Print summary
    print("Summary Statistics:")
    print(f" Total institutions: {total}")
    print(f" Geocoded: {geocoded} ({geocoded_pct:.1f}%)")
    print(f" Countries: {len(country_counts)}")
    print(f" Unique cities: {len(cities)}")
    print(f" Unique regions: {len(regions)}")
    print()
def main():
    """Run every export format in sequence and print a closing summary."""
    print("=" * 70)
    print("Latin American GLAM Dataset - Multi-Format Export")
    print("=" * 70)
    print()
    # Create exports directory (parents=True so a missing data/instances
    # tree does not make mkdir raise FileNotFoundError).
    EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Exports directory: {EXPORTS_DIR}\n")
    # Load data once; every exporter consumes the same list.
    institutions = load_institutions()
    # Export to all formats
    export_jsonld(institutions)
    export_csv(institutions)
    export_geojson(institutions)
    export_statistics(institutions)
    # Summary
    print("=" * 70)
    print("Export Complete!")
    print("=" * 70)
    print()
    print("Generated files:")
    print(f" 1. {JSONLD_FILE.name} - Linked Data (JSON-LD)")
    print(f" 2. {CSV_FILE.name} - Spreadsheet format")
    print(f" 3. {GEOJSON_FILE.name} - Geographic visualization")
    print(f" 4. {STATS_FILE.name} - Summary statistics")
    print()
    print("Next steps:")
    print(" - Import CSV into spreadsheet software (Excel, Google Sheets)")
    print(" - Visualize GeoJSON on map platforms (QGIS, Mapbox, Leaflet)")
    print(" - Use JSON-LD for Linked Data integration")
    print()
# Script entry point: run the full multi-format export when executed directly.
if __name__ == '__main__':
    main()