glam/scripts/export_nde_enriched.py

428 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Export enriched NDE entries to various formats.
Creates:
- CSV export with flattened data
- JSON-LD export for linked data
- Statistics report
Usage:
python scripts/export_nde_enriched.py
Output:
data/nde/exports/nde_enriched.csv
data/nde/exports/nde_enriched.jsonld
data/nde/exports/statistics.yaml
"""
import os
import csv
import json
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
from collections import Counter
import logging
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Paths — resolved relative to the repository root (parent of scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
EXPORTS_DIR = PROJECT_ROOT / "data" / "nde" / "exports"
ENTITY_CACHE = PROJECT_ROOT / "data" / "nde" / "enriched" / "entity_cache.json"

# JSON-LD context for heritage institutions.
JSONLD_CONTEXT = {
    "@context": {
        "@vocab": "https://schema.org/",
        # FIX: the term definitions below use the "schema:" compact-IRI
        # prefix ("schema:name", ...), but the prefix itself was never
        # declared, so JSON-LD processors would treat e.g. "schema:name"
        # as an opaque IRI instead of expanding it to
        # https://schema.org/name. Declare the prefix explicitly.
        "schema": "https://schema.org/",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "glam": "https://w3id.org/heritage/custodian/",
        "isil": "http://id.loc.gov/vocabulary/identifiers/isil/",
        "viaf": "http://viaf.org/viaf/",
        "gnd": "https://d-nb.info/gnd/",
        "name": "schema:name",
        "alternateName": "schema:alternateName",
        "description": "schema:description",
        "url": "schema:url",
        "address": "schema:address",
        "geo": "schema:geo",
        "identifier": "schema:identifier",
        "foundingDate": "schema:foundingDate",
        "dissolutionDate": "schema:dissolutionDate",
        "image": "schema:image",
        "sameAs": "schema:sameAs",
        "wikidataId": {"@id": "schema:identifier", "@type": "@id"},
        "isilCode": "isil:",
        "viafId": "viaf:",
        "gndId": "gnd:",
        "institutionType": "glam:institutionType",
        "collectionSystem": "glam:collectionSystem",
    }
}
def load_entity_cache() -> Dict[str, Dict]:
    """Return the cached entity labels, or an empty dict if no cache file exists."""
    if not ENTITY_CACHE.exists():
        return {}
    with open(ENTITY_CACHE, "r", encoding="utf-8") as handle:
        return json.load(handle)
def load_all_entries() -> List[Dict]:
    """Load every enriched entry YAML file, tagging each with its source filename.

    Files that fail to parse are logged and skipped; empty documents are
    ignored. Results are ordered by filename.
    """
    loaded: List[Dict] = []
    for yaml_file in sorted(ENTRIES_DIR.glob("*.yaml")):
        try:
            with open(yaml_file, "r", encoding="utf-8") as handle:
                document = yaml.safe_load(handle)
            if document:
                document["_source_file"] = yaml_file.name
                loaded.append(document)
        except Exception as e:
            logger.warning(f"Error loading {yaml_file}: {e}")
    return loaded
def flatten_entry_for_csv(entry: Dict) -> Dict:
    """Flatten a nested enriched entry into a single-level dict for CSV export.

    FIX: every returned dict now contains the same key set regardless of
    whether the entry has Wikidata enrichment. Previously the Wikidata
    columns were only added when enrichment was present, so a mix of
    enriched and unenriched entries produced rows with different key sets —
    csv.DictWriter (whose fieldnames come from the first row) raises
    ValueError on rows with extra keys.

    Nested sub-dicts are guarded with "or {}" because explicit nulls in the
    source YAML make .get() return None rather than the default.
    """
    original = entry.get("original_entry") or {}
    wikidata = entry.get("wikidata_enrichment") or {}
    # Basic fields from the original register entry (Dutch field names),
    # followed by empty defaults for every Wikidata-derived column so the
    # CSV schema is stable across all rows.
    flat = {
        "entry_index": entry.get("entry_index", ""),
        "name": original.get("organisatie", ""),
        "city": original.get("plaatsnaam_bezoekadres", ""),
        "address": original.get("straat_en_huisnummer_bezoekadres", ""),
        "website": original.get("webadres_organisatie", ""),
        "type_code": ", ".join(original.get("type", [])) if original.get("type") else "",
        "type_organisatie": original.get("type_organisatie", ""),
        "isil_code": original.get("isil-code_na", ""),
        "system": original.get("systeem", ""),
        "wikidata_id": original.get("wikidata_id", ""),
        # Enrichment status
        "enrichment_status": entry.get("enrichment_status", ""),
        # Wikidata columns (filled in below when enrichment exists)
        "wikidata_label_nl": "",
        "wikidata_label_en": "",
        "wikidata_description_nl": "",
        "wikidata_description_en": "",
        "latitude": "",
        "longitude": "",
        "viaf_id": "",
        "gnd_id": "",
        "rkd_artists_id": "",
        "ror_id": "",
        "instance_types": "",
        "wikidata_location": "",
        "wikidata_country": "",
        "inception": "",
        "dissolution": "",
        "official_website": "",
        "image_url": "",
        "logo_url": "",
        "wikipedia_nl": "",
        "wikipedia_en": "",
    }
    if wikidata:
        flat["wikidata_label_nl"] = wikidata.get("wikidata_label_nl", "")
        flat["wikidata_label_en"] = wikidata.get("wikidata_label_en", "")
        flat["wikidata_description_nl"] = wikidata.get("wikidata_description_nl", "")
        flat["wikidata_description_en"] = wikidata.get("wikidata_description_en", "")
        # Coordinates
        coords = wikidata.get("wikidata_coordinates") or {}
        flat["latitude"] = coords.get("latitude", "")
        flat["longitude"] = coords.get("longitude", "")
        # External authority identifiers
        identifiers = wikidata.get("wikidata_identifiers") or {}
        flat["viaf_id"] = identifiers.get("VIAF", "")
        flat["gnd_id"] = identifiers.get("GND", "")
        flat["rkd_artists_id"] = identifiers.get("RKD artists", "")
        flat["ror_id"] = identifiers.get("ROR", "")
        # Instance types, e.g. "Q33506 (museum)"; capped at 3 to keep the
        # cell readable.
        instance_of = wikidata.get("wikidata_instance_of") or []
        flat["instance_types"] = "; ".join(
            f"{i.get('id', '')} ({i.get('label_en', '')})"
            for i in instance_of[:3]
        )
        # Location and country as "QID (label)"
        location = wikidata.get("wikidata_located_in") or {}
        flat["wikidata_location"] = f"{location.get('id', '')} ({location.get('label_en', '')})" if location else ""
        country = wikidata.get("wikidata_country") or {}
        flat["wikidata_country"] = f"{country.get('id', '')} ({country.get('label_en', '')})" if country else ""
        # Dates
        flat["inception"] = wikidata.get("wikidata_inception", "")
        flat["dissolution"] = wikidata.get("wikidata_dissolution", "")
        # Media / links
        flat["official_website"] = wikidata.get("wikidata_official_website", "")
        flat["image_url"] = wikidata.get("wikidata_image", "")
        flat["logo_url"] = wikidata.get("wikidata_logo", "")
        # Wikipedia sitelinks
        sitelinks = wikidata.get("wikidata_sitelinks") or {}
        flat["wikipedia_nl"] = sitelinks.get("nlwiki", "")
        flat["wikipedia_en"] = sitelinks.get("enwiki", "")
    return flat
def entry_to_jsonld(entry: Dict) -> Dict:
    """Convert an enriched entry to a schema.org JSON-LD Organization node.

    FIXES:
    - alternateName was deduplicated via list(set(...)), which made the
      exported file nondeterministic between runs; now deduplicated with
      dict.fromkeys, preserving first-seen order.
    - Nested sub-dicts are guarded with "or {}" because explicit nulls in
      the source YAML make .get() return None rather than the default.
    """
    original = entry.get("original_entry") or {}
    wikidata = entry.get("wikidata_enrichment") or {}
    wikidata_id = original.get("wikidata_id", "")
    jsonld = {
        "@type": "Organization",
        "@id": f"glam:nde/{entry.get('entry_index', 'unknown')}",
        "name": original.get("organisatie", ""),
    }
    # Link to the Wikidata entity when the id looks like a QID.
    if wikidata_id and str(wikidata_id).startswith("Q"):
        jsonld["sameAs"] = [f"http://www.wikidata.org/entity/{wikidata_id}"]
    # Postal address (country hard-coded: the NDE register covers NL only).
    city = original.get("plaatsnaam_bezoekadres", "")
    street = original.get("straat_en_huisnummer_bezoekadres", "")
    if city or street:
        jsonld["address"] = {
            "@type": "PostalAddress",
            "addressLocality": city,
            "streetAddress": street,
            "addressCountry": "NL",
        }
    # Website URL
    if original.get("webadres_organisatie"):
        jsonld["url"] = original["webadres_organisatie"]
    # ISIL code as a structured identifier
    if original.get("isil-code_na"):
        jsonld["identifier"] = {
            "@type": "PropertyValue",
            "propertyID": "ISIL",
            "value": original["isil-code_na"],
        }
    # Wikidata enrichment
    if wikidata:
        # Alternative names across all languages, deduplicated in
        # first-seen order (deterministic output).
        aliases = wikidata.get("wikidata_aliases") or {}
        all_aliases = []
        for lang_aliases in aliases.values():
            if isinstance(lang_aliases, list):
                all_aliases.extend(lang_aliases)
        if all_aliases:
            jsonld["alternateName"] = list(dict.fromkeys(all_aliases))
        # Prefer the English description, fall back to Dutch.
        desc = wikidata.get("wikidata_description_en") or wikidata.get("wikidata_description_nl")
        if desc:
            jsonld["description"] = desc
        # Geo coordinates (both must be present and non-zero/non-empty)
        coords = wikidata.get("wikidata_coordinates") or {}
        if coords.get("latitude") and coords.get("longitude"):
            jsonld["geo"] = {
                "@type": "GeoCoordinates",
                "latitude": coords["latitude"],
                "longitude": coords["longitude"],
            }
        # Founding date
        if wikidata.get("wikidata_inception"):
            jsonld["foundingDate"] = wikidata["wikidata_inception"]
        # Image
        if wikidata.get("wikidata_image"):
            jsonld["image"] = wikidata["wikidata_image"]
        # Authority-file links (VIAF/GND/ROR) join the sameAs list.
        identifiers = wikidata.get("wikidata_identifiers") or {}
        if identifiers:
            same_as = jsonld.get("sameAs", [])
            if identifiers.get("VIAF"):
                same_as.append(f"http://viaf.org/viaf/{identifiers['VIAF']}")
            if identifiers.get("GND"):
                same_as.append(f"https://d-nb.info/gnd/{identifiers['GND']}")
            if identifiers.get("ROR"):
                same_as.append(f"https://ror.org/{identifiers['ROR']}")
            if same_as:
                jsonld["sameAs"] = same_as
    return jsonld
def generate_statistics(entries: List[Dict]) -> Dict:
    """Generate an aggregate statistics report from enriched entries.

    FIX: the percentage section previously divided by len(entries) without
    a guard, raising ZeroDivisionError when there were no entries; it now
    reports "0.0%" in that case. Null sub-structures from the source YAML
    are also guarded with "or {}" / "or []".

    Returns a dict ready for YAML serialization: counts by status, type,
    city and system, plus identifier-coverage counts and percentages.
    """
    stats: Dict[str, Any] = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total_entries": len(entries),
    }
    # Tallies
    status_counts = Counter()
    type_counts = Counter()
    city_counts = Counter()
    system_counts = Counter()
    has_coords = 0
    has_viaf = 0
    has_gnd = 0
    has_isil = 0
    has_wikipedia_nl = 0
    has_image = 0
    instance_type_counts = Counter()
    for entry in entries:
        original = entry.get("original_entry") or {}
        wikidata = entry.get("wikidata_enrichment") or {}
        # Enrichment status
        status_counts[entry.get("enrichment_status", "unknown")] += 1
        # Institution type codes
        for t in original.get("type") or []:
            type_counts[t] += 1
        # City
        city = original.get("plaatsnaam_bezoekadres", "")
        if city:
            city_counts[city] += 1
        # Collection system
        system = original.get("systeem", "")
        if system:
            system_counts[system] += 1
        # ISIL code
        if original.get("isil-code_na"):
            has_isil += 1
        # Wikidata enrichment coverage
        if wikidata:
            if (wikidata.get("wikidata_coordinates") or {}).get("latitude"):
                has_coords += 1
            identifiers = wikidata.get("wikidata_identifiers") or {}
            if identifiers.get("VIAF"):
                has_viaf += 1
            if identifiers.get("GND"):
                has_gnd += 1
            if (wikidata.get("wikidata_sitelinks") or {}).get("nlwiki"):
                has_wikipedia_nl += 1
            if wikidata.get("wikidata_image"):
                has_image += 1
            for inst in wikidata.get("wikidata_instance_of") or []:
                label = inst.get("label_en", inst.get("id", "unknown"))
                instance_type_counts[label] += 1
    stats["by_enrichment_status"] = dict(status_counts.most_common())
    stats["by_institution_type"] = dict(type_counts.most_common())
    stats["top_cities"] = dict(city_counts.most_common(20))
    stats["by_collection_system"] = dict(system_counts.most_common())
    stats["top_wikidata_instance_types"] = dict(instance_type_counts.most_common(20))
    stats["identifier_coverage"] = {
        "has_isil_code": has_isil,
        "has_coordinates": has_coords,
        "has_viaf": has_viaf,
        "has_gnd": has_gnd,
        "has_wikipedia_nl": has_wikipedia_nl,
        "has_image": has_image,
    }
    total = len(entries)

    def pct(count: int) -> str:
        # Guard against division by zero for an empty entry list.
        return f"{count / total * 100:.1f}%" if total else "0.0%"

    stats["coverage_percentages"] = {
        "isil_code": pct(has_isil),
        "coordinates": pct(has_coords),
        "viaf": pct(has_viaf),
        "gnd": pct(has_gnd),
        "wikipedia_nl": pct(has_wikipedia_nl),
        "image": pct(has_image),
    }
    return stats
def main():
    """Run the full export pipeline: CSV, JSON-LD and statistics report."""
    logger.info("Loading enriched entries...")
    records = load_all_entries()
    logger.info(f"Loaded {len(records)} entries")

    # Make sure the output directory exists before writing anything.
    EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

    # CSV export: flattened rows, header taken from the first row's keys.
    logger.info("Exporting to CSV...")
    csv_path = EXPORTS_DIR / "nde_enriched.csv"
    rows = [flatten_entry_for_csv(record) for record in records]
    if rows:
        with open(csv_path, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
        logger.info(f"CSV exported to: {csv_path}")

    # JSON-LD export: the shared context plus one node per entry in @graph.
    logger.info("Exporting to JSON-LD...")
    jsonld_path = EXPORTS_DIR / "nde_enriched.jsonld"
    document = dict(JSONLD_CONTEXT)
    document["@graph"] = [entry_to_jsonld(record) for record in records]
    with open(jsonld_path, "w", encoding="utf-8") as handle:
        json.dump(document, handle, indent=2, ensure_ascii=False)
    logger.info(f"JSON-LD exported to: {jsonld_path}")

    # Statistics report as YAML, preserving insertion order of keys.
    logger.info("Generating statistics...")
    stats = generate_statistics(records)
    stats_path = EXPORTS_DIR / "statistics.yaml"
    with open(stats_path, "w", encoding="utf-8") as handle:
        yaml.dump(stats, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
    logger.info(f"Statistics saved to: {stats_path}")

    # Console summary.
    banner = "=" * 60
    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Total entries: {stats['total_entries']}")
    print("\nBy enrichment status:")
    for status, count in stats["by_enrichment_status"].items():
        print(f" {status}: {count}")
    print("\nBy institution type:")
    for t, count in stats["by_institution_type"].items():
        print(f" {t}: {count}")
    print("\nIdentifier coverage:")
    for key, pct in stats["coverage_percentages"].items():
        print(f" {key}: {pct}")
    print("\nTop 10 cities:")
    for city, count in list(stats["top_cities"].items())[:10]:
        print(f" {city}: {count}")
    print("\nTop 10 collection systems:")
    for system, count in list(stats["by_collection_system"].items())[:10]:
        print(f" {system}: {count}")
    print("\n" + banner)
    print(f"Files saved to: {EXPORTS_DIR}")
    print(banner)


if __name__ == "__main__":
    main()