#!/usr/bin/env python3
|
|
"""
|
|
Export enriched NDE entries to various formats.
|
|
|
|
Creates:
|
|
- CSV export with flattened data
|
|
- JSON-LD export for linked data
|
|
- Statistics report
|
|
|
|
Usage:
|
|
python scripts/export_nde_enriched.py
|
|
|
|
Output:
|
|
data/nde/exports/nde_enriched.csv
|
|
data/nde/exports/nde_enriched.jsonld
|
|
data/nde/exports/statistics.yaml
|
|
"""
|
|
|
|
import os
|
|
import csv
|
|
import json
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Any, Optional
|
|
from collections import Counter
|
|
import logging
|
|
|
|
# Module-wide logging: timestamped INFO-level messages (format: time - level - message).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
|
|
|
# Paths (resolved relative to the repository root, one level above scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
# Input: one enriched YAML file per institution
ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
# Output: destination for the CSV / JSON-LD / statistics exports
EXPORTS_DIR = PROJECT_ROOT / "data" / "nde" / "exports"
# Cached Wikidata entity labels (optional; may not exist on disk)
ENTITY_CACHE = PROJECT_ROOT / "data" / "nde" / "enriched" / "entity_cache.json"
|
|
|
|
# JSON-LD context for heritage institutions.
# "@vocab" makes bare terms resolve against schema.org. The explicit
# "schema" prefix is required because the term definitions below use
# "schema:..." compact IRIs; without the prefix declaration those values
# would be kept as opaque IRIs instead of expanding to schema.org terms.
JSONLD_CONTEXT = {
    "@context": {
        "@vocab": "https://schema.org/",
        "schema": "https://schema.org/",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "glam": "https://w3id.org/heritage/custodian/",
        "isil": "http://id.loc.gov/vocabulary/identifiers/isil/",
        "viaf": "http://viaf.org/viaf/",
        "gnd": "https://d-nb.info/gnd/",

        "name": "schema:name",
        "alternateName": "schema:alternateName",
        "description": "schema:description",
        "url": "schema:url",
        "address": "schema:address",
        "geo": "schema:geo",
        "identifier": "schema:identifier",
        "foundingDate": "schema:foundingDate",
        "dissolutionDate": "schema:dissolutionDate",
        "image": "schema:image",
        "sameAs": "schema:sameAs",

        "wikidataId": {"@id": "schema:identifier", "@type": "@id"},
        "isilCode": "isil:",
        "viafId": "viaf:",
        "gndId": "gnd:",

        "institutionType": "glam:institutionType",
        "collectionSystem": "glam:collectionSystem",
    }
}
|
|
|
|
|
|
def load_entity_cache() -> Dict[str, Dict]:
    """Return the cached entity labels, or an empty dict when no cache file exists."""
    if not ENTITY_CACHE.exists():
        return {}
    with open(ENTITY_CACHE, "r", encoding="utf-8") as fh:
        return json.load(fh)
|
|
|
|
|
|
def load_all_entries() -> List[Dict]:
    """Load every enriched entry YAML file, tagging each with its source filename.

    Files that fail to load (or whose content is not usable) are logged and
    skipped; empty documents are ignored.
    """
    loaded: List[Dict] = []
    for path in sorted(ENTRIES_DIR.glob("*.yaml")):
        try:
            with open(path, "r", encoding="utf-8") as fh:
                data = yaml.safe_load(fh)
            if data:
                # Keep track of where this record came from for debugging
                data["_source_file"] = path.name
                loaded.append(data)
        except Exception as exc:
            logger.warning(f"Error loading {path}: {exc}")
    return loaded
|
|
|
|
|
|
def flatten_entry_for_csv(entry: Dict) -> Dict:
    """Flatten a nested enriched entry into a single-level dict for CSV export.

    Every row gets the SAME set of columns: enrichment columns are emitted
    with empty-string values when the entry carries no Wikidata enrichment.
    (The original code omitted them entirely, so csv.DictWriter -- whose
    header is derived from one row's keys -- raised ValueError whenever rows
    had inconsistent schemas.)  Sub-dicts that are an explicit YAML null are
    treated as empty.
    """
    original = entry.get("original_entry", {}) or {}
    wikidata = entry.get("wikidata_enrichment", {}) or {}

    # Basic fields from the original NDE record
    flat = {
        "entry_index": entry.get("entry_index", ""),
        "name": original.get("organisatie", ""),
        "city": original.get("plaatsnaam_bezoekadres", ""),
        "address": original.get("straat_en_huisnummer_bezoekadres", ""),
        "website": original.get("webadres_organisatie", ""),
        "type_code": ", ".join(original.get("type", [])) if original.get("type") else "",
        "type_organisatie": original.get("type_organisatie", ""),
        "isil_code": original.get("isil-code_na", ""),
        "system": original.get("systeem", ""),
        "wikidata_id": original.get("wikidata_id", ""),

        # Enrichment status
        "enrichment_status": entry.get("enrichment_status", ""),
    }

    # Wikidata labels and descriptions
    flat["wikidata_label_nl"] = wikidata.get("wikidata_label_nl", "")
    flat["wikidata_label_en"] = wikidata.get("wikidata_label_en", "")
    flat["wikidata_description_nl"] = wikidata.get("wikidata_description_nl", "")
    flat["wikidata_description_en"] = wikidata.get("wikidata_description_en", "")

    # Coordinates ("or {}" guards against an explicit null in the YAML)
    coords = wikidata.get("wikidata_coordinates") or {}
    flat["latitude"] = coords.get("latitude", "")
    flat["longitude"] = coords.get("longitude", "")

    # External authority identifiers
    identifiers = wikidata.get("wikidata_identifiers") or {}
    flat["viaf_id"] = identifiers.get("VIAF", "")
    flat["gnd_id"] = identifiers.get("GND", "")
    flat["rkd_artists_id"] = identifiers.get("RKD artists", "")
    flat["ror_id"] = identifiers.get("ROR", "")

    # Instance types, limited to the first three to keep the cell readable
    instance_of = wikidata.get("wikidata_instance_of") or []
    flat["instance_types"] = "; ".join(
        f"{i.get('id', '')} ({i.get('label_en', '')})"
        for i in instance_of[:3]
    )

    # Location and country rendered as "Qid (label)" strings
    location = wikidata.get("wikidata_located_in") or {}
    flat["wikidata_location"] = f"{location.get('id', '')} ({location.get('label_en', '')})" if location else ""
    country = wikidata.get("wikidata_country") or {}
    flat["wikidata_country"] = f"{country.get('id', '')} ({country.get('label_en', '')})" if country else ""

    # Lifecycle dates
    flat["inception"] = wikidata.get("wikidata_inception", "")
    flat["dissolution"] = wikidata.get("wikidata_dissolution", "")

    # Media / web presence
    flat["official_website"] = wikidata.get("wikidata_official_website", "")
    flat["image_url"] = wikidata.get("wikidata_image", "")
    flat["logo_url"] = wikidata.get("wikidata_logo", "")

    # Wikipedia links
    sitelinks = wikidata.get("wikidata_sitelinks") or {}
    flat["wikipedia_nl"] = sitelinks.get("nlwiki", "")
    flat["wikipedia_en"] = sitelinks.get("enwiki", "")

    return flat
|
|
|
|
|
|
def entry_to_jsonld(entry: Dict) -> Dict:
    """Convert an enriched entry into a schema.org JSON-LD Organization node.

    Builds a node with name, address, URL, ISIL identifier and -- when
    Wikidata enrichment is present -- aliases, description, coordinates,
    founding date, image and sameAs authority links (Wikidata, VIAF, GND,
    ROR).  Output is deterministic: aliases are de-duplicated while keeping
    first-seen order (the original used list(set(...)), which reordered
    between runs and churned export diffs).
    """
    original = entry.get("original_entry", {}) or {}
    wikidata = entry.get("wikidata_enrichment", {}) or {}

    wikidata_id = original.get("wikidata_id", "")

    jsonld = {
        "@type": "Organization",
        "@id": f"glam:nde/{entry.get('entry_index', 'unknown')}",
        "name": original.get("organisatie", ""),
    }

    # Link to the Wikidata item (sanity-check the Q-id shape first)
    if wikidata_id and str(wikidata_id).startswith("Q"):
        jsonld["sameAs"] = [f"http://www.wikidata.org/entity/{wikidata_id}"]

    # Postal address; country is fixed to NL (Dutch heritage register)
    city = original.get("plaatsnaam_bezoekadres", "")
    street = original.get("straat_en_huisnummer_bezoekadres", "")
    if city or street:
        jsonld["address"] = {
            "@type": "PostalAddress",
            "addressLocality": city,
            "streetAddress": street,
            "addressCountry": "NL",
        }

    # Organization website from the original record
    if original.get("webadres_organisatie"):
        jsonld["url"] = original["webadres_organisatie"]

    # ISIL code as a typed identifier
    if original.get("isil-code_na"):
        jsonld["identifier"] = {
            "@type": "PropertyValue",
            "propertyID": "ISIL",
            "value": original["isil-code_na"],
        }

    # Wikidata enrichment
    if wikidata:
        # Alternative names: merge aliases from all languages, de-duplicated
        # in first-seen order for deterministic output.
        aliases = wikidata.get("wikidata_aliases") or {}
        all_aliases = []
        for lang_aliases in aliases.values():
            if isinstance(lang_aliases, list):
                all_aliases.extend(lang_aliases)
        if all_aliases:
            jsonld["alternateName"] = list(dict.fromkeys(all_aliases))

        # Prefer the English description, fall back to Dutch
        desc = wikidata.get("wikidata_description_en") or wikidata.get("wikidata_description_nl")
        if desc:
            jsonld["description"] = desc

        # Geo coordinates ("or {}" guards against an explicit null)
        coords = wikidata.get("wikidata_coordinates") or {}
        if coords.get("latitude") and coords.get("longitude"):
            jsonld["geo"] = {
                "@type": "GeoCoordinates",
                "latitude": coords["latitude"],
                "longitude": coords["longitude"],
            }

        # Founding date
        if wikidata.get("wikidata_inception"):
            jsonld["foundingDate"] = wikidata["wikidata_inception"]

        # Image
        if wikidata.get("wikidata_image"):
            jsonld["image"] = wikidata["wikidata_image"]

        # Additional authority links, appended after any Wikidata sameAs
        identifiers = wikidata.get("wikidata_identifiers") or {}
        if identifiers:
            same_as = jsonld.get("sameAs", [])
            if identifiers.get("VIAF"):
                same_as.append(f"http://viaf.org/viaf/{identifiers['VIAF']}")
            if identifiers.get("GND"):
                same_as.append(f"https://d-nb.info/gnd/{identifiers['GND']}")
            if identifiers.get("ROR"):
                same_as.append(f"https://ror.org/{identifiers['ROR']}")
            if same_as:
                jsonld["sameAs"] = same_as

    return jsonld
|
|
|
|
|
|
def generate_statistics(entries: List[Dict]) -> Dict:
    """Aggregate counts and coverage statistics over the enriched entries.

    Returns a YAML-ready dict: generation timestamp, totals, frequency
    tables (enrichment status, institution type, city, collection system,
    Wikidata instance type) and identifier coverage as both absolute counts
    and percentage strings.  Safe for an empty entry list (the original
    divided by len(entries) unconditionally and raised ZeroDivisionError).
    """
    stats = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total_entries": len(entries),
    }

    # Frequency tables and coverage counters
    status_counts = Counter()
    type_counts = Counter()
    city_counts = Counter()
    system_counts = Counter()
    instance_type_counts = Counter()
    has_coords = 0
    has_viaf = 0
    has_gnd = 0
    has_isil = 0
    has_wikipedia_nl = 0
    has_image = 0

    for entry in entries:
        original = entry.get("original_entry", {})
        wikidata = entry.get("wikidata_enrichment", {})

        # Status
        status_counts[entry.get("enrichment_status", "unknown")] += 1

        # Type codes (an entry may carry several)
        for t in original.get("type", []):
            type_counts[t] += 1

        # City
        city = original.get("plaatsnaam_bezoekadres", "")
        if city:
            city_counts[city] += 1

        # Collection system
        system = original.get("systeem", "")
        if system:
            system_counts[system] += 1

        # ISIL
        if original.get("isil-code_na"):
            has_isil += 1

        # Wikidata enrichment coverage
        if wikidata:
            if wikidata.get("wikidata_coordinates", {}).get("latitude"):
                has_coords += 1

            identifiers = wikidata.get("wikidata_identifiers", {})
            if identifiers.get("VIAF"):
                has_viaf += 1
            if identifiers.get("GND"):
                has_gnd += 1

            if wikidata.get("wikidata_sitelinks", {}).get("nlwiki"):
                has_wikipedia_nl += 1

            if wikidata.get("wikidata_image"):
                has_image += 1

            for inst in wikidata.get("wikidata_instance_of", []):
                label = inst.get("label_en", inst.get("id", "unknown"))
                instance_type_counts[label] += 1

    stats["by_enrichment_status"] = dict(status_counts.most_common())
    stats["by_institution_type"] = dict(type_counts.most_common())
    stats["top_cities"] = dict(city_counts.most_common(20))
    stats["by_collection_system"] = dict(system_counts.most_common())
    stats["top_wikidata_instance_types"] = dict(instance_type_counts.most_common(20))

    stats["identifier_coverage"] = {
        "has_isil_code": has_isil,
        "has_coordinates": has_coords,
        "has_viaf": has_viaf,
        "has_gnd": has_gnd,
        "has_wikipedia_nl": has_wikipedia_nl,
        "has_image": has_image,
    }

    # Percentages, guarding the empty-input case
    total = len(entries)

    def _pct(count: int) -> str:
        # One decimal place; "0.0%" when there are no entries at all
        return f"{count / total * 100:.1f}%" if total else "0.0%"

    stats["coverage_percentages"] = {
        "isil_code": _pct(has_isil),
        "coordinates": _pct(has_coords),
        "viaf": _pct(has_viaf),
        "gnd": _pct(has_gnd),
        "wikipedia_nl": _pct(has_wikipedia_nl),
        "image": _pct(has_image),
    }

    return stats
|
|
|
|
|
|
def main():
    """Run the full export: load entries, write CSV, JSON-LD and statistics."""
    logger.info("Loading enriched entries...")
    entries = load_all_entries()
    logger.info(f"Loaded {len(entries)} entries")

    # Create exports directory
    EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

    # Export to CSV
    logger.info("Exporting to CSV...")
    csv_path = EXPORTS_DIR / "nde_enriched.csv"
    flat_entries = [flatten_entry_for_csv(e) for e in entries]

    if flat_entries:
        # Header = ordered union of keys across ALL rows.  Using only the
        # first row's keys makes csv.DictWriter raise ValueError as soon as
        # a later row carries enrichment columns the first row lacks.
        field_order = {}
        for row in flat_entries:
            field_order.update(dict.fromkeys(row))
        fieldnames = list(field_order)
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(flat_entries)
        logger.info(f"CSV exported to: {csv_path}")

    # Export to JSON-LD
    logger.info("Exporting to JSON-LD...")
    jsonld_path = EXPORTS_DIR / "nde_enriched.jsonld"
    jsonld_entries = [entry_to_jsonld(e) for e in entries]

    # One document: the shared @context plus all nodes in @graph
    jsonld_doc = {
        **JSONLD_CONTEXT,
        "@graph": jsonld_entries,
    }

    with open(jsonld_path, "w", encoding="utf-8") as f:
        json.dump(jsonld_doc, f, indent=2, ensure_ascii=False)
    logger.info(f"JSON-LD exported to: {jsonld_path}")

    # Generate statistics
    logger.info("Generating statistics...")
    stats = generate_statistics(entries)
    stats_path = EXPORTS_DIR / "statistics.yaml"

    with open(stats_path, "w", encoding="utf-8") as f:
        yaml.dump(stats, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    logger.info(f"Statistics saved to: {stats_path}")

    # Print summary to stdout
    print("\n" + "=" * 60)
    print("EXPORT SUMMARY")
    print("=" * 60)
    print(f"Total entries: {stats['total_entries']}")
    print(f"\nBy enrichment status:")
    for status, count in stats["by_enrichment_status"].items():
        print(f"  {status}: {count}")
    print(f"\nBy institution type:")
    for t, count in stats["by_institution_type"].items():
        print(f"  {t}: {count}")
    print(f"\nIdentifier coverage:")
    for key, pct in stats["coverage_percentages"].items():
        print(f"  {key}: {pct}")
    print(f"\nTop 10 cities:")
    for city, count in list(stats["top_cities"].items())[:10]:
        print(f"  {city}: {count}")
    print(f"\nTop 10 collection systems:")
    for system, count in list(stats["by_collection_system"].items())[:10]:
        print(f"  {system}: {count}")
    print("\n" + "=" * 60)
    print(f"Files saved to: {EXPORTS_DIR}")
    print("=" * 60)
|
|
|
|
|
|
# Script entry point: run the export when executed directly
if __name__ == "__main__":
    main()
|