#!/usr/bin/env python3 """ Export enriched NDE entries to various formats. Creates: - CSV export with flattened data - JSON-LD export for linked data - Statistics report Usage: python scripts/export_nde_enriched.py Output: data/nde/exports/nde_enriched.csv data/nde/exports/nde_enriched.jsonld data/nde/exports/statistics.yaml """ import os import csv import json import yaml from pathlib import Path from datetime import datetime, timezone from typing import Dict, List, Any, Optional from collections import Counter import logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Paths PROJECT_ROOT = Path(__file__).parent.parent ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries" EXPORTS_DIR = PROJECT_ROOT / "data" / "nde" / "exports" ENTITY_CACHE = PROJECT_ROOT / "data" / "nde" / "enriched" / "entity_cache.json" # JSON-LD context for heritage institutions JSONLD_CONTEXT = { "@context": { "@vocab": "https://schema.org/", "wd": "http://www.wikidata.org/entity/", "wdt": "http://www.wikidata.org/prop/direct/", "glam": "https://w3id.org/heritage/custodian/", "isil": "http://id.loc.gov/vocabulary/identifiers/isil/", "viaf": "http://viaf.org/viaf/", "gnd": "https://d-nb.info/gnd/", "name": "schema:name", "alternateName": "schema:alternateName", "description": "schema:description", "url": "schema:url", "address": "schema:address", "geo": "schema:geo", "identifier": "schema:identifier", "foundingDate": "schema:foundingDate", "dissolutionDate": "schema:dissolutionDate", "image": "schema:image", "sameAs": "schema:sameAs", "wikidataId": {"@id": "schema:identifier", "@type": "@id"}, "isilCode": "isil:", "viafId": "viaf:", "gndId": "gnd:", "institutionType": "glam:institutionType", "collectionSystem": "glam:collectionSystem", } } def load_entity_cache() -> Dict[str, Dict]: """Load the entity label cache.""" if ENTITY_CACHE.exists(): with open(ENTITY_CACHE, "r", encoding="utf-8") as f: return json.load(f) return {} def load_all_entries() -> List[Dict]: """Load all enriched entry YAML files.""" entries = [] for yaml_file in sorted(ENTRIES_DIR.glob("*.yaml")): try: with open(yaml_file, "r", encoding="utf-8") as f: entry = yaml.safe_load(f) if entry: entry["_source_file"] = yaml_file.name entries.append(entry) except Exception as e: logger.warning(f"Error loading {yaml_file}: {e}") return entries def flatten_entry_for_csv(entry: Dict) -> Dict: """Flatten a nested entry for CSV export.""" original = entry.get("original_entry", {}) wikidata = entry.get("wikidata_enrichment", {}) # Basic fields from original flat = { "entry_index": entry.get("entry_index", ""), "name": original.get("organisatie", ""), "city": original.get("plaatsnaam_bezoekadres", ""), "address": original.get("straat_en_huisnummer_bezoekadres", ""), "website": original.get("webadres_organisatie", ""), "type_code": ", ".join(original.get("type", [])) if original.get("type") else "", "type_organisatie": original.get("type_organisatie", ""), "isil_code": original.get("isil-code_na", ""), "system": original.get("systeem", ""), "wikidata_id": original.get("wikidata_id", ""), # Enrichment status "enrichment_status": entry.get("enrichment_status", ""), } # Wikidata enrichment fields if wikidata: flat["wikidata_label_nl"] = wikidata.get("wikidata_label_nl", "") flat["wikidata_label_en"] = wikidata.get("wikidata_label_en", "") flat["wikidata_description_nl"] = wikidata.get("wikidata_description_nl", "") flat["wikidata_description_en"] = wikidata.get("wikidata_description_en", "") # Coordinates coords = wikidata.get("wikidata_coordinates", {}) flat["latitude"] = coords.get("latitude", "") flat["longitude"] = coords.get("longitude", "") # Identifiers identifiers = wikidata.get("wikidata_identifiers", {}) flat["viaf_id"] = identifiers.get("VIAF", "") flat["gnd_id"] = identifiers.get("GND", "") flat["rkd_artists_id"] = identifiers.get("RKD artists", "") flat["ror_id"] = identifiers.get("ROR", "") # Instance types instance_of = wikidata.get("wikidata_instance_of", []) flat["instance_types"] = "; ".join([ f"{i.get('id', '')} ({i.get('label_en', '')})" for i in instance_of[:3] # Limit to 3 ]) # Location location = wikidata.get("wikidata_located_in", {}) flat["wikidata_location"] = f"{location.get('id', '')} ({location.get('label_en', '')})" if location else "" # Country country = wikidata.get("wikidata_country", {}) flat["wikidata_country"] = f"{country.get('id', '')} ({country.get('label_en', '')})" if country else "" # Dates flat["inception"] = wikidata.get("wikidata_inception", "") flat["dissolution"] = wikidata.get("wikidata_dissolution", "") # Media flat["official_website"] = wikidata.get("wikidata_official_website", "") flat["image_url"] = wikidata.get("wikidata_image", "") flat["logo_url"] = wikidata.get("wikidata_logo", "") # Wikipedia links sitelinks = wikidata.get("wikidata_sitelinks", {}) flat["wikipedia_nl"] = sitelinks.get("nlwiki", "") flat["wikipedia_en"] = sitelinks.get("enwiki", "") return flat def entry_to_jsonld(entry: Dict) -> Dict: """Convert an entry to JSON-LD format.""" original = entry.get("original_entry", {}) wikidata = entry.get("wikidata_enrichment", {}) wikidata_id = original.get("wikidata_id", "") jsonld = { "@type": "Organization", "@id": f"glam:nde/{entry.get('entry_index', 'unknown')}", "name": original.get("organisatie", ""), } # Add Wikidata sameAs if wikidata_id and str(wikidata_id).startswith("Q"): jsonld["sameAs"] = [f"http://www.wikidata.org/entity/{wikidata_id}"] # Address city = original.get("plaatsnaam_bezoekadres", "") street = original.get("straat_en_huisnummer_bezoekadres", "") if city or street: jsonld["address"] = { "@type": "PostalAddress", "addressLocality": city, "streetAddress": street, "addressCountry": "NL", } # URL if original.get("webadres_organisatie"): jsonld["url"] = original["webadres_organisatie"] # ISIL code if original.get("isil-code_na"): jsonld["identifier"] = { "@type": "PropertyValue", "propertyID": "ISIL", "value": original["isil-code_na"], } # Wikidata enrichment if wikidata: # Alternative names aliases = wikidata.get("wikidata_aliases", {}) all_aliases = [] for lang_aliases in aliases.values(): if isinstance(lang_aliases, list): all_aliases.extend(lang_aliases) if all_aliases: jsonld["alternateName"] = list(set(all_aliases)) # Description desc = wikidata.get("wikidata_description_en") or wikidata.get("wikidata_description_nl") if desc: jsonld["description"] = desc # Geo coordinates coords = wikidata.get("wikidata_coordinates", {}) if coords.get("latitude") and coords.get("longitude"): jsonld["geo"] = { "@type": "GeoCoordinates", "latitude": coords["latitude"], "longitude": coords["longitude"], } # Founding date if wikidata.get("wikidata_inception"): jsonld["foundingDate"] = wikidata["wikidata_inception"] # Image if wikidata.get("wikidata_image"): jsonld["image"] = wikidata["wikidata_image"] # Additional identifiers identifiers = wikidata.get("wikidata_identifiers", {}) if identifiers: same_as = jsonld.get("sameAs", []) if identifiers.get("VIAF"): same_as.append(f"http://viaf.org/viaf/{identifiers['VIAF']}") if identifiers.get("GND"): same_as.append(f"https://d-nb.info/gnd/{identifiers['GND']}") if identifiers.get("ROR"): same_as.append(f"https://ror.org/{identifiers['ROR']}") if same_as: jsonld["sameAs"] = same_as return jsonld def generate_statistics(entries: List[Dict]) -> Dict: """Generate statistics from entries.""" stats = { "generated_at": datetime.now(timezone.utc).isoformat(), "total_entries": len(entries), } # Count by enrichment status status_counts = Counter() type_counts = Counter() city_counts = Counter() system_counts = Counter() has_coords = 0 has_viaf = 0 has_gnd = 0 has_isil = 0 has_wikipedia_nl = 0 has_image = 0 instance_type_counts = Counter() for entry in entries: original = entry.get("original_entry", {}) wikidata = entry.get("wikidata_enrichment", {}) # Status status_counts[entry.get("enrichment_status", "unknown")] += 1 # Type codes for t in original.get("type", []): type_counts[t] += 1 # City city = original.get("plaatsnaam_bezoekadres", "") if city: city_counts[city] += 1 # System system = original.get("systeem", "") if system: system_counts[system] += 1 # ISIL if original.get("isil-code_na"): has_isil += 1 # Wikidata enrichment stats if wikidata: # Coordinates if wikidata.get("wikidata_coordinates", {}).get("latitude"): has_coords += 1 # Identifiers identifiers = wikidata.get("wikidata_identifiers", {}) if identifiers.get("VIAF"): has_viaf += 1 if identifiers.get("GND"): has_gnd += 1 # Wikipedia if wikidata.get("wikidata_sitelinks", {}).get("nlwiki"): has_wikipedia_nl += 1 # Image if wikidata.get("wikidata_image"): has_image += 1 # Instance types for inst in wikidata.get("wikidata_instance_of", []): label = inst.get("label_en", inst.get("id", "unknown")) instance_type_counts[label] += 1 stats["by_enrichment_status"] = dict(status_counts.most_common()) stats["by_institution_type"] = dict(type_counts.most_common()) stats["top_cities"] = dict(city_counts.most_common(20)) stats["by_collection_system"] = dict(system_counts.most_common()) stats["top_wikidata_instance_types"] = dict(instance_type_counts.most_common(20)) stats["identifier_coverage"] = { "has_isil_code": has_isil, "has_coordinates": has_coords, "has_viaf": has_viaf, "has_gnd": has_gnd, "has_wikipedia_nl": has_wikipedia_nl, "has_image": has_image, } # Calculate percentages total = len(entries) stats["coverage_percentages"] = { "isil_code": f"{has_isil / total * 100:.1f}%", "coordinates": f"{has_coords / total * 100:.1f}%", "viaf": f"{has_viaf / total * 100:.1f}%", "gnd": f"{has_gnd / total * 100:.1f}%", "wikipedia_nl": f"{has_wikipedia_nl / total * 100:.1f}%", "image": f"{has_image / total * 100:.1f}%", } return stats def main(): """Main export function.""" logger.info("Loading enriched entries...") entries = load_all_entries() logger.info(f"Loaded {len(entries)} entries") # Create exports directory EXPORTS_DIR.mkdir(parents=True, exist_ok=True) # Export to CSV logger.info("Exporting to CSV...") csv_path = EXPORTS_DIR / "nde_enriched.csv" flat_entries = [flatten_entry_for_csv(e) for e in entries] if flat_entries: fieldnames = list(flat_entries[0].keys()) with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(flat_entries) logger.info(f"CSV exported to: {csv_path}") # Export to JSON-LD logger.info("Exporting to JSON-LD...") jsonld_path = EXPORTS_DIR / "nde_enriched.jsonld" jsonld_entries = [entry_to_jsonld(e) for e in entries] jsonld_doc = { **JSONLD_CONTEXT, "@graph": jsonld_entries, } with open(jsonld_path, "w", encoding="utf-8") as f: json.dump(jsonld_doc, f, indent=2, ensure_ascii=False) logger.info(f"JSON-LD exported to: {jsonld_path}") # Generate statistics logger.info("Generating statistics...") stats = generate_statistics(entries) stats_path = EXPORTS_DIR / "statistics.yaml" with open(stats_path, "w", encoding="utf-8") as f: yaml.dump(stats, f, default_flow_style=False, allow_unicode=True, sort_keys=False) logger.info(f"Statistics saved to: {stats_path}") # Print summary print("\n" + "=" * 60) print("EXPORT SUMMARY") print("=" * 60) print(f"Total entries: {stats['total_entries']}") print(f"\nBy enrichment status:") for status, count in stats["by_enrichment_status"].items(): print(f" {status}: {count}") print(f"\nBy institution type:") for t, count in stats["by_institution_type"].items(): print(f" {t}: {count}") print(f"\nIdentifier coverage:") for key, pct in stats["coverage_percentages"].items(): print(f" {key}: {pct}") print(f"\nTop 10 cities:") for city, count in list(stats["top_cities"].items())[:10]: print(f" {city}: {count}") print(f"\nTop 10 collection systems:") for system, count in list(stats["by_collection_system"].items())[:10]: print(f" {system}: {count}") print("\n" + "=" * 60) print(f"Files saved to: {EXPORTS_DIR}") print("=" * 60) if __name__ == "__main__": main()