glam/scripts/generate_kien_custodian_entries.py
2025-12-05 15:30:23 +01:00

336 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Generate Custodian entry YAML files from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) data.
This script reads the extracted custodian profiles from KIEN and generates
individual YAML entry files in the /data/nde/enriched/entries/ format.
Starting entry_index: 1674 (continuing from existing entries)
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Configuration
# NOTE: the three paths below are absolute paths into one specific local
# checkout; they must be adjusted when the script runs on another machine.
# Input: JSON document whose "custodians" list holds the extracted KIEN profiles.
CUSTODIAN_PROFILES_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/custodian_profiles.json")
# Optional input: JSON document whose "custodians" mapping links a custodian
# name to a record carrying a "wikidata_id". Skipped when the file is absent.
WIKIDATA_CROSSREF_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/wikidata_crossref.json")
# Output: directory that receives one "<entry_index>_<slug>.yaml" file per custodian.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
# entry_index given to the first generated entry (continues after existing entries).
STARTING_INDEX = 1674
# Patterns to identify non-custodian entries (news, educational materials, etc.)
# Each pattern is applied with re.match (anchored at the start of the name)
# and re.IGNORECASE by is_valid_custodian().
NON_CUSTODIAN_PATTERNS = [
r"^Lesmateriaal\s+", # Educational materials
r"^Nieuws\s+", # News items
r"^Publicaties?\s*$", # Publications pages
]
# KIEN default phone number (used when org doesn't have own phone)
# A profile whose phone/email equals these defaults gets no such contact field.
KIEN_DEFAULT_PHONE = "+31263576113"
# KIEN default e-mail address, treated the same way as the default phone number.
KIEN_DEFAULT_EMAIL = "info@immaterieelerfgoed.nl"
def is_valid_custodian(name: str) -> bool:
    """Report whether *name* belongs to a real custodian.

    A name is rejected as soon as it matches one of the regular expressions in
    ``NON_CUSTODIAN_PATTERNS`` (news items, educational material, publication
    pages). Matching is anchored at the start of the name and ignores case.

    Args:
        name: Organisation name as listed in the KIEN profile data.

    Returns:
        ``True`` when no exclusion pattern matches, ``False`` otherwise.
    """
    return not any(
        re.match(exclusion, name, re.IGNORECASE)
        for exclusion in NON_CUSTODIAN_PATTERNS
    )
def extract_legal_form(name: str) -> tuple[str, Optional[str]]:
    """Split an organisation name into a cleaned name and its legal form.

    Per AGENTS.md Rule 8: Legal form terms are filtered from CustodianName.

    Only the Dutch legal form "Stichting" is filtered. Terms such as
    Vereniging, Federatie, Genootschap, Bond, Broederschap, Comité, Commissie,
    Platform, Kring, Gilde, Jonkheid and Buurt are deliberately kept because
    they describe the organisational purpose rather than a legal form.

    Args:
        name: Organisation name as listed by KIEN.

    Returns:
        A ``(cleaned_name, legal_form)`` tuple. ``legal_form`` is
        ``"Stichting"`` when that word is part of the name and ``None``
        otherwise. The word is removed from ``cleaned_name`` only when it is
        the leading or the trailing word; a "Stichting" in the middle of a
        name is reported but left in place.
    """
    prefix = "stichting "

    # Leading "Stichting " (any casing), e.g. "Stichting Foo" -> "Foo".
    if name.lower().startswith(prefix):
        return name[len(prefix):].strip(), "Stichting"

    # "Stichting" as a whole word later in the name. The word boundary keeps
    # compounds such as "Stichtingsdag" from being mistaken for a legal form.
    if re.search(r"\sstichting\b", name, re.IGNORECASE):
        # Trailing form, e.g. "Hidde Nijland Stichting". Casing and trailing
        # whitespace are tolerated so detection and stripping always agree.
        cleaned_name = re.sub(
            r"\s+stichting\s*$", "", name, flags=re.IGNORECASE
        ).strip()
        return cleaned_name, "Stichting"

    return name, None
def normalize_website(url: Optional[str]) -> Optional[str]:
    """Normalize a website URL taken from a KIEN profile.

    Args:
        url: Raw website value; may be ``None``, empty, or malformed.

    Returns:
        The trimmed URL with an ``http://`` or ``https://`` scheme, or
        ``None`` when the value is missing, blank, or looks like an e-mail
        address rather than a website. Values without a scheme get
        ``https://`` prepended.
    """
    if not url:
        return None

    url = url.strip()
    # A whitespace-only value carries no URL; do not turn it into "https://".
    if not url:
        return None

    # Schemes are case-insensitive, so "HTTP://..." already has one.
    has_scheme = url.lower().startswith(("http://", "https://"))

    # Handle malformed URLs (e.g., email addresses used as URLs). An "@" in a
    # proper http(s) URL (userinfo, query string) is left alone.
    if "@" in url and not has_scheme:
        return None

    if has_scheme:
        return url

    # Protocol-relative form ("//example.nl") only lacks the scheme name.
    if url.startswith("//"):
        return "https:" + url

    return "https://" + url
def determine_institution_type(name: str, heritage_forms: list) -> str:
    """Classify a KIEN custodian by keywords found in its name.

    KIEN custodians are mostly Intangible Heritage Groups, so that is the
    fallback when no keyword applies.

    Args:
        name: Organisation name; compared case-insensitively.
        heritage_forms: Heritage forms linked to the custodian. Accepted for
            interface compatibility; the current rules do not consult it.

    Returns:
        ``"M"`` (museum) when the name contains "museum", ``"T"``
        (taste/smell heritage) when it contains one of the craft keywords,
        and ``"I"`` (intangible heritage group) otherwise.
    """
    lowered = name.lower()
    taste_smell_keywords = ("bakkerij", "visserij", "imkerij", "pottenbakkerij")

    # The museum check comes first, so it wins over a craft keyword.
    if "museum" in lowered:
        type_code = "M"
    else:
        type_code = "I"
        for keyword in taste_smell_keywords:
            if keyword in lowered:
                type_code = "T"
                break
    return type_code
def generate_custodian_name_slug(name: str) -> str:
    """Build a filename-friendly slug from a custodian name.

    The name is lower-cased, every character that is not a word character,
    whitespace or a hyphen is dropped, and each run of hyphens/whitespace is
    collapsed into a single underscore. The result is cut to 50 characters.

    Args:
        name: Custodian name to convert.

    Returns:
        The slug, at most 50 characters long. Because the cut happens last,
        the slug may end with an underscore.
    """
    max_length = 50
    lowered = name.lower()
    # Step 1: drop punctuation and other special characters.
    without_specials = re.sub(r'[^\w\s-]', '', lowered)
    # Step 2: turn each separator run into one underscore.
    underscored = re.sub(r'[-\s]+', '_', without_specials)
    return underscored[:max_length]
def create_entry_yaml(custodian: dict, entry_index: int, wikidata_map: dict) -> dict:
    """Create a YAML entry structure for a KIEN custodian.

    Args:
        custodian: One profile from the KIEN extraction. The keys read are
            ``name``, ``website``, ``email``, ``phone``, ``description``,
            ``locations``, ``heritage_forms``, ``heritage_forms_linked``,
            ``kien_url``, ``extracted_at`` and ``fetched_at``. Missing keys
            and explicit ``None`` values are both tolerated.
        entry_index: Sequential index stored in the entry.
        wikidata_map: Cross-reference data; a Wikidata ID is looked up under
            ``wikidata_map["custodians"][name]["wikidata_id"]``.

    Returns:
        A plain ``dict`` ready to be dumped as YAML. Key insertion order is
        the intended order of the output file.
    """
    # `or` (instead of a .get default) also covers keys present with a null.
    name = custodian.get("name") or ""
    cleaned_name, legal_form = extract_legal_form(name)
    kien_url = custodian.get("kien_url")

    # One timestamp for the whole entry keeps all timestamp fields consistent.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Combine plain and linked heritage forms, skipping generic KIEN pages.
    generic_pages = ("Inventaris Immaterieel Erfgoed Nederland", "Publicaties")
    all_heritage_forms = list(custodian.get("heritage_forms") or [])
    for hf in custodian.get("heritage_forms_linked") or []:
        if isinstance(hf, dict) and "name" in hf:
            if hf["name"] not in generic_pages:
                all_heritage_forms.append(hf["name"])
        elif isinstance(hf, str):
            all_heritage_forms.append(hf)
    # Deduplicate while keeping first-seen order. A set would reorder the
    # list differently on every run and make the generated files unstable.
    all_heritage_forms = list(dict.fromkeys(all_heritage_forms))

    # Determine institution type
    inst_type = determine_institution_type(name, all_heritage_forms)

    # Normalize website
    website = normalize_website(custodian.get("website"))

    # Filter out "Arnhem" if it's KIEN's location, not the org's. A lone
    # "Arnhem" is kept: it might be KIEN's default, but that cannot be told
    # apart from an organisation that really is located there.
    locations = custodian.get("locations") or []
    if len(locations) > 1 and "Arnhem" in locations:
        locations = [loc for loc in locations if loc != "Arnhem"]

    # Check for Wikidata mapping
    wikidata_id = wikidata_map.get("custodians", {}).get(name, {}).get("wikidata_id")

    # Build entry structure
    entry = {
        "original_entry": {
            "organisatie": name,
            "webadres_organisatie": website,
            "type_organisatie": "intangible_heritage_custodian",
            "systeem": "KIEN",
            "type": [inst_type],
        },
        "entry_index": entry_index,
        "processing_timestamp": timestamp,
        "enrichment_status": "kien_extracted",
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": timestamp,
            "sources": {
                "kien": [{
                    "source_type": "kien_intangible_heritage_registry",
                    "source_url": kien_url,
                    "fetch_timestamp": custodian.get("extracted_at") or custodian.get("fetched_at"),
                    "data_tier": "TIER_2_VERIFIED",
                    "claims_extracted": ["name", "website", "email", "phone", "description", "locations", "heritage_forms"],
                }]
            },
            "data_tier_summary": {
                "TIER_1_AUTHORITATIVE": [],
                "TIER_2_VERIFIED": ["kien_intangible_heritage_registry"],
                "TIER_3_CROWD_SOURCED": [],
                "TIER_4_INFERRED": [],
            },
            "notes": [
                "Entry created from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) registry",
                "Intangible heritage custodian organization",
            ],
        },
        "kien_enrichment": {
            "kien_name": name,
            "kien_url": kien_url,
            "heritage_forms": all_heritage_forms,
            "enrichment_timestamp": timestamp,
            "source": "https://www.immaterieelerfgoed.nl",
        },
    }

    # Add description if available (stored under the top-level "notes" key).
    description = (custodian.get("description") or "").strip()
    if description:
        entry["notes"] = description

    # Add contact info (only if not KIEN default)
    contact = {}
    email = custodian.get("email")
    phone = custodian.get("phone")
    if email and email != KIEN_DEFAULT_EMAIL:
        contact["email"] = email
    if phone and phone != KIEN_DEFAULT_PHONE:
        contact["phone"] = phone
    if website:
        contact["website"] = website
    if contact:
        entry["contact"] = contact

    # Add locations
    if locations:
        entry["locations"] = [{"city": loc, "country": "NL"} for loc in locations]

    # Add legal form if detected
    if legal_form:
        entry["legal_status"] = {
            "legal_form": legal_form,
            "original_name_with_legal_form": name,
        }

    # Add custodian_name
    entry["custodian_name"] = {
        "claim_type": "custodian_name",
        "claim_value": cleaned_name.lower(),
        "source": "kien_registry",
        "confidence": 0.9,
        "extraction_timestamp": timestamp,
    }

    # Add identifiers
    entry["identifiers"] = [
        {
            "identifier_scheme": "KIEN_URL",
            "identifier_value": kien_url,
            "identifier_url": kien_url,
        }
    ]
    if wikidata_id:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_id}",
        })
        entry["provenance"]["notes"].append(f"Wikidata ID: {wikidata_id}")

    return entry
def main():
    """Generate one YAML entry file per valid KIEN custodian.

    Reads the custodian profiles (and, when present, the Wikidata
    cross-reference), filters out non-custodian records, and writes
    ``<entry_index>_<slug>.yaml`` files into ``OUTPUT_DIR``. Entry indices
    are assigned by position in the filtered list, starting at
    ``STARTING_INDEX``. A custodian whose entry cannot be created is reported
    and skipped, which leaves a gap in the written index sequence.
    """
    print("Loading KIEN custodian profiles...")
    with open(CUSTODIAN_PROFILES_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)
    custodians = data.get("custodians", [])
    print(f"Loaded {len(custodians)} custodian profiles")

    # Load Wikidata crossref
    wikidata_map = {}
    if WIKIDATA_CROSSREF_PATH.exists():
        with open(WIKIDATA_CROSSREF_PATH, 'r', encoding='utf-8') as f:
            wikidata_map = json.load(f)
        print(f"Loaded Wikidata crossref with {len(wikidata_map.get('custodians', {}))} custodian mappings")

    # Filter valid custodians. `or ""` guards against a null name, which
    # would otherwise crash the regex match outside any error handling.
    valid_custodians = [c for c in custodians if is_valid_custodian(c.get("name") or "")]
    skipped = len(custodians) - len(valid_custodians)
    print(f"Filtered to {len(valid_custodians)} valid custodians (skipped {skipped} non-custodian entries)")

    # Create output directory if needed
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Generate entries
    created_count = 0
    failed_count = 0
    first_written = None
    last_written = None
    for entry_index, custodian in enumerate(valid_custodians, start=STARTING_INDEX):
        name = custodian.get("name") or "Unknown"
        try:
            entry = create_entry_yaml(custodian, entry_index, wikidata_map)

            # Generate filename
            slug = generate_custodian_name_slug(name)
            filepath = OUTPUT_DIR / f"{entry_index}_{slug}.yaml"

            # Serialize before touching the file so that a dump error cannot
            # leave a truncated YAML file behind.
            document = yaml.dump(entry, default_flow_style=False, allow_unicode=True, sort_keys=False)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(document)
        except Exception as e:
            # Best effort at the top-level boundary: report and continue.
            failed_count += 1
            print(f"Error creating entry for '{name}': {e}")
            continue

        created_count += 1
        if first_written is None:
            first_written = entry_index
        last_written = entry_index
        if created_count % 20 == 0:
            print(f"Created {created_count} entries...")

    print(f"\nDone! Created {created_count} custodian entry files.")
    if created_count:
        # Report the indices that were really written; deriving the upper
        # bound from the success count is wrong as soon as one entry fails.
        print(f"Entry indices: {first_written} - {last_written}")
    else:
        print("Entry indices: none")
    if failed_count:
        print(f"Failed entries: {failed_count} (their indices are missing from the range above)")
    print(f"Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()