- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
364 lines · 13 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Export Latin American GLAM institutions dataset to multiple formats.
|
|
|
|
Exports:
|
|
1. JSON-LD (Linked Data with @context)
|
|
2. CSV (flat table for spreadsheet analysis)
|
|
3. GeoJSON (for geographic visualization)
|
|
4. Simple statistics JSON
|
|
|
|
This script generates all export formats from the validated combined YAML file.
|
|
"""
|
|
|
|
import json
|
|
import csv
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Any
|
|
from collections import Counter
|
|
|
|
# File paths
# All paths are resolved relative to the repository root (two directories
# above this script), so the script works from any working directory.
BASE_DIR = Path(__file__).parent.parent
INSTANCES_DIR = BASE_DIR / "data" / "instances"
EXPORTS_DIR = INSTANCES_DIR / "exports"

# Input: the validated, OSM-enriched combined YAML dataset.
INPUT_FILE = INSTANCES_DIR / "latin_american_institutions_osm_enriched.yaml"
# Outputs: one file per export format (see module docstring).
JSONLD_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.jsonld"
CSV_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.csv"
GEOJSON_FILE = EXPORTS_DIR / "latin_american_institutions_osm_enriched.geojson"
STATS_FILE = EXPORTS_DIR / "latin_american_osm_enriched_statistics.json"

# JSON-LD context
# Maps the dataset's YAML field names onto published vocabularies
# (schema.org, Dublin Core terms, FOAF, PROV-O, W3C WGS84 geo) so the
# JSON-LD export is consumable as Linked Data.
CONTEXT = {
    "@context": {
        "@vocab": "https://w3id.org/heritage/custodian/",
        "id": "@id",
        "type": "@type",
        # Vocabulary prefixes.
        "schema": "http://schema.org/",
        "dct": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "prov": "http://www.w3.org/ns/prov#",
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",

        # Top-level institution fields.
        "HeritageCustodian": "HeritageCustodian",
        "name": "schema:name",
        "description": "schema:description",
        "institution_type": "institutionType",
        "locations": "location",
        "identifiers": "identifier",
        "digital_platforms": "digitalPlatform",
        "collections": "collection",
        "provenance": "prov:hadDerivation",

        # Location sub-fields.
        "city": "schema:addressLocality",
        "region": "schema:addressRegion",
        "country": "schema:addressCountry",
        "latitude": "geo:lat",
        "longitude": "geo:long",
        "street_address": "schema:streetAddress",
        "postal_code": "schema:postalCode"
    }
}
|
|
|
|
|
|
def load_institutions() -> List[Dict[str, Any]]:
    """Load institutions from the combined YAML file.

    Returns:
        List of institution records parsed from INPUT_FILE. An empty
        YAML document yields an empty list rather than None.

    Raises:
        FileNotFoundError: if INPUT_FILE does not exist.
        yaml.YAMLError: if the file is not valid YAML.
    """
    print(f"Loading institutions from: {INPUT_FILE}")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty document; normalize to []
    # so len() here and iteration in the exporters cannot crash.
    data = data or []
    print(f" Loaded {len(data)} institutions\n")
    return data
|
|
|
|
|
|
def export_jsonld(institutions: List[Dict[str, Any]]) -> None:
    """Export to JSON-LD format with @context.

    Wraps the records in a single @graph document sharing one @context,
    tagging each record with the HeritageCustodian type.
    """
    banner = "=" * 70
    print(banner)
    print("Exporting to JSON-LD")
    print(banner)
    print()

    # Tag every record with its JSON-LD type without mutating the input.
    graph = [{**record, "@type": "HeritageCustodian"} for record in institutions]

    document = {
        "@context": CONTEXT["@context"],
        "@graph": graph,
    }

    with open(JSONLD_FILE, 'w', encoding='utf-8') as handle:
        # default=str covers dates and other non-JSON-native YAML values.
        json.dump(document, handle, indent=2, ensure_ascii=False, default=str)

    size_kb = JSONLD_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(institutions)} institutions to JSON-LD")
    print(f" File: {JSONLD_FILE}")
    print(f" Size: {size_kb:.1f} KB\n")
|
|
|
|
|
|
def export_csv(institutions: List[Dict[str, Any]]) -> None:
    """Export to CSV format (flattened structure).

    Nested structures are collapsed: only the first location is kept,
    identifiers are joined with '; ', and platforms/collections are
    reduced to counts.
    """
    banner = "=" * 70
    print(banner)
    print("Exporting to CSV")
    print(banner)
    print()

    # Column order for the output file.
    fieldnames = [
        'id', 'name', 'institution_type', 'description',
        'country', 'region', 'city', 'street_address', 'postal_code',
        'latitude', 'longitude', 'geonames_id',
        'identifier_schemes', 'identifier_values',
        'platform_count', 'collection_count',
        'data_source', 'data_tier', 'extraction_date',
        'confidence_score', 'conversation_id'
    ]

    def flatten(record: Dict[str, Any]) -> Dict[str, Any]:
        """Collapse one nested institution record into a flat CSV row."""
        locations = record.get('locations')
        primary = locations[0] if locations else {}
        identifiers = record.get('identifiers', [])
        prov = record.get('provenance', {})
        return {
            'id': record.get('id', ''),
            'name': record.get('name', ''),
            'institution_type': record.get('institution_type', ''),
            'description': record.get('description', ''),
            'country': primary.get('country', ''),
            'region': primary.get('region', ''),
            'city': primary.get('city', ''),
            'street_address': primary.get('street_address', ''),
            'postal_code': primary.get('postal_code', ''),
            'latitude': primary.get('latitude', ''),
            'longitude': primary.get('longitude', ''),
            'geonames_id': primary.get('geonames_id', ''),
            'identifier_schemes': '; '.join(i.get('identifier_scheme', '') for i in identifiers),
            'identifier_values': '; '.join(str(i.get('identifier_value', '')) for i in identifiers),
            'platform_count': len(record.get('digital_platforms', [])),
            'collection_count': len(record.get('collections', [])),
            'data_source': prov.get('data_source', ''),
            'data_tier': prov.get('data_tier', ''),
            'extraction_date': prov.get('extraction_date', ''),
            'confidence_score': prov.get('confidence_score', ''),
            'conversation_id': prov.get('conversation_id', ''),
        }

    rows = [flatten(record) for record in institutions]

    with open(CSV_FILE, 'w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    size_kb = CSV_FILE.stat().st_size / 1024
    print(f"✓ Exported {len(rows)} institutions to CSV")
    print(f" File: {CSV_FILE}")
    print(f" Size: {size_kb:.1f} KB")
    print(f" Columns: {len(fieldnames)}\n")
|
|
|
|
|
|
def export_geojson(institutions: List[Dict[str, Any]]) -> None:
    """Export to GeoJSON format for mapping.

    Emits one Point feature per institution that has at least one
    location carrying both latitude and longitude; institutions without
    coordinates are skipped and reported in the console summary.
    """
    print("=" * 70)
    print("Exporting to GeoJSON")
    print("=" * 70)
    print()

    features = []
    geocoded_count = 0

    for inst in institutions:
        # First location that carries a complete coordinate pair.
        primary_loc = next(
            (loc for loc in inst.get('locations', [])
             if loc.get('latitude') is not None and loc.get('longitude') is not None),
            None,
        )

        if not primary_loc:
            continue  # Skip institutions without coordinates

        geocoded_count += 1

        # Create GeoJSON feature
        feature = {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                # GeoJSON coordinate order is [longitude, latitude] (RFC 7946).
                "coordinates": [
                    float(primary_loc['longitude']),
                    float(primary_loc['latitude'])
                ]
            },
            "properties": {
                "id": inst.get('id', ''),
                "name": inst.get('name', ''),
                "institution_type": inst.get('institution_type', ''),
                "description": inst.get('description', ''),
                "city": primary_loc.get('city', ''),
                "region": primary_loc.get('region', ''),
                "country": primary_loc.get('country', ''),
                "street_address": primary_loc.get('street_address', ''),
                "data_tier": inst.get('provenance', {}).get('data_tier', ''),
                "confidence_score": inst.get('provenance', {}).get('confidence_score', '')
            }
        }
        features.append(feature)

    # Guard against an empty dataset: the original expression divided by
    # len(institutions) unconditionally and raised ZeroDivisionError.
    total = len(institutions)
    rate = (len(features) / total * 100) if total else 0.0

    geojson = {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "title": "Latin American GLAM Institutions",
            "description": "Geocoded heritage institutions from Brazil, Chile, and Mexico",
            "total_features": len(features),
            "total_institutions": total,
            "geocoding_rate": f"{rate:.1f}%",
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        }
    }

    with open(GEOJSON_FILE, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, indent=2, ensure_ascii=False)

    file_size = GEOJSON_FILE.stat().st_size / 1024
    print(f"✓ Exported {geocoded_count} geocoded institutions to GeoJSON")
    print(f" File: {GEOJSON_FILE}")
    print(f" Size: {file_size:.1f} KB")
    print(f" Skipped: {len(institutions) - geocoded_count} without coordinates\n")
|
|
|
|
|
|
def export_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Export dataset statistics as JSON.

    Computes per-country and per-type counts, geocoding coverage,
    identifier/platform/collection coverage, and the top 10 cities,
    then writes them to STATS_FILE and prints a console summary.
    """
    print("=" * 70)
    print("Generating Statistics")
    print("=" * 70)
    print()

    # Count by country (first located country per institution only,
    # so each institution contributes at most once).
    country_counts = Counter()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('country'):
                country_counts[loc['country']] += 1
                break  # Only count first location

    # Count by institution type
    type_counts = Counter(inst.get('institution_type', 'UNKNOWN') for inst in institutions)

    # Count institutions with at least one complete coordinate pair.
    geocoded = sum(1 for inst in institutions
                   if any(loc.get('latitude') is not None and loc.get('longitude') is not None
                          for loc in inst.get('locations', [])))

    # Count with identifiers/platforms
    with_identifiers = sum(1 for inst in institutions if inst.get('identifiers'))
    with_platforms = sum(1 for inst in institutions if inst.get('digital_platforms'))
    with_collections = sum(1 for inst in institutions if inst.get('collections'))

    # Count unique cities and regions
    cities = set()
    regions = set()
    for inst in institutions:
        for loc in inst.get('locations', []):
            if loc.get('city'):
                cities.add(loc['city'])
            if loc.get('region'):
                regions.add(loc['region'])

    # Guard against an empty dataset: the original divided by
    # len(institutions) unconditionally and raised ZeroDivisionError.
    total = len(institutions)
    geocoding_rate = (geocoded / total * 100) if total else 0.0

    stats = {
        "metadata": {
            "title": "Latin American GLAM Institutions Statistics",
            "generated": datetime.now(timezone.utc).isoformat(),
            "schema_version": "LinkML v0.2.0"
        },
        "totals": {
            "total_institutions": total,
            "geocoded_institutions": geocoded,
            "geocoding_rate": f"{geocoding_rate:.1f}%",
            "with_identifiers": with_identifiers,
            "with_digital_platforms": with_platforms,
            "with_collections": with_collections,
            "unique_cities": len(cities),
            "unique_regions": len(regions)
        },
        "by_country": dict(country_counts),
        "by_institution_type": dict(type_counts),
        "top_10_cities": dict(Counter(
            loc.get('city') for inst in institutions
            for loc in inst.get('locations', [])
            if loc.get('city')
        ).most_common(10))
    }

    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    file_size = STATS_FILE.stat().st_size / 1024
    print("✓ Exported statistics to JSON")
    print(f" File: {STATS_FILE}")
    print(f" Size: {file_size:.1f} KB\n")

    # Print summary
    print("Summary Statistics:")
    print(f" Total institutions: {total}")
    print(f" Geocoded: {geocoded} ({geocoding_rate:.1f}%)")
    print(f" Countries: {len(country_counts)}")
    print(f" Unique cities: {len(cities)}")
    print(f" Unique regions: {len(regions)}")
    print()
|
|
|
|
|
|
def main():
    """Run all four exports (JSON-LD, CSV, GeoJSON, statistics) in order."""
    print("=" * 70)
    print("Latin American GLAM Dataset - Multi-Format Export")
    print("=" * 70)
    print()

    # Create exports directory. parents=True so a fresh checkout where
    # data/instances does not exist yet doesn't raise FileNotFoundError.
    EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Exports directory: {EXPORTS_DIR}\n")

    # Load data
    institutions = load_institutions()

    # Export to all formats
    export_jsonld(institutions)
    export_csv(institutions)
    export_geojson(institutions)
    export_statistics(institutions)

    # Summary
    print("=" * 70)
    print("Export Complete!")
    print("=" * 70)
    print()
    print("Generated files:")
    print(f" 1. {JSONLD_FILE.name} - Linked Data (JSON-LD)")
    print(f" 2. {CSV_FILE.name} - Spreadsheet format")
    print(f" 3. {GEOJSON_FILE.name} - Geographic visualization")
    print(f" 4. {STATS_FILE.name} - Summary statistics")
    print()
    print("Next steps:")
    print(" - Import CSV into spreadsheet software (Excel, Google Sheets)")
    print(" - Visualize GeoJSON on map platforms (QGIS, Mapbox, Leaflet)")
    print(" - Use JSON-LD for Linked Data integration")
    print()


if __name__ == '__main__':
    main()
|