# glam/scripts/generate_kien_custodian_entries.py

#!/usr/bin/env python3
"""
Generate Custodian entry YAML files from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) data.

This script reads the extracted custodian profiles from KIEN and generates
individual YAML entry files in the /data/nde/enriched/entries/ format.

Starting entry_index: 1674 (continuing from existing entries)
"""

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml

# Configuration
# NOTE(review): the three paths below are absolute paths inside one user's
# home directory, so the script only runs unmodified on that machine.
# JSON file read by main(); its "custodians" key holds the list of profiles.
CUSTODIAN_PROFILES_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/custodian_profiles.json")
# Optional JSON file; main() loads it only when it exists on disk.
WIKIDATA_CROSSREF_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/wikidata_crossref.json")
# Directory the YAML entry files are written to (created by main() if missing).
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
# entry_index assigned to the first valid custodian; later ones count up from it.
STARTING_INDEX = 1674

# Patterns to identify non-custodian entries (news, educational materials, etc.)
# Each pattern is applied with re.match (anchored at the start of the name)
# and re.IGNORECASE by is_valid_custodian().
NON_CUSTODIAN_PATTERNS = [
    r"^Lesmateriaal\s+",  # Educational materials
    r"^Nieuws\s+",  # News items
    r"^Publicaties?\s*$",  # Publications pages
]

# KIEN default phone number (used when org doesn't have own phone)
# create_entry_yaml() leaves these default values out of an entry's contact
# block so that KIEN's own contact details are not attributed to the custodian.
KIEN_DEFAULT_PHONE = "+31263576113"
KIEN_DEFAULT_EMAIL = "info@immaterieelerfgoed.nl"


def is_valid_custodian(name: str) -> bool:
    """Return True unless *name* looks like a non-custodian page title.

    A name is rejected as soon as any regex in ``NON_CUSTODIAN_PATTERNS``
    matches at its start; matching ignores case.
    """
    return not any(
        re.match(blocked, name, re.IGNORECASE)
        for blocked in NON_CUSTODIAN_PATTERNS
    )


def extract_legal_form(name: str) -> tuple[str, Optional[str]]:
    """
    Split the Dutch legal form "Stichting" off an organization name.

    Per AGENTS.md Rule 8: Legal form terms are filtered from CustodianName.
    Only "Stichting" is treated as a legal form here. Terms such as
    Vereniging, Federatie, Genootschap, Bond, Broederschap, Comité,
    Commissie, Platform, Kring, Gilde, Jonkheid and Buurt describe the
    organization's purpose and are deliberately left in the name.

    Matching is case-insensitive and ignores surrounding whitespace, so
    "Hidde Nijland STICHTING" is cleaned the same way as
    "Hidde Nijland Stichting".

    Args:
        name: Organization name as found in the source data.

    Returns:
        A ``(cleaned_name, legal_form)`` tuple. ``legal_form`` is
        ``"Stichting"`` when the standalone word was found and ``None``
        otherwise. ``cleaned_name`` has a leading or trailing "Stichting"
        removed; when the word only occurs in the middle of the name, or is
        absent, the name is returned unchanged.
    """
    candidate = name.strip()

    # "Stichting <rest>": drop the leading legal form.
    leading = re.match(r"stichting\s+(\S.*)$", candidate, re.IGNORECASE | re.DOTALL)
    if leading:
        return leading.group(1).strip(), "Stichting"

    # "<rest> Stichting", e.g. "Hidde Nijland Stichting": drop the trailing one.
    trailing = re.match(r"(.*\S)\s+stichting$", candidate, re.IGNORECASE | re.DOTALL)
    if trailing:
        return trailing.group(1).strip(), "Stichting"

    # The word appears mid-name ("De Stichting Foo"): record the legal form but
    # leave the name alone. The trailing \b keeps longer words such as
    # "Stichtingen" from being mistaken for the legal form.
    if re.search(r"\sstichting\b", candidate, re.IGNORECASE):
        return name, "Stichting"

    return name, None


def normalize_website(url: Optional[str]) -> Optional[str]:
    """Normalize a website URL.

    Args:
        url: Raw website value from the source data; may be ``None``, empty,
            padded with whitespace, or missing its scheme.

    Returns:
        The trimmed URL, prefixed with ``https://`` when it had no
        ``http://``/``https://`` scheme, or ``None`` when the value is
        empty, whitespace-only, or looks like an e-mail address rather
        than a URL.
    """
    if not url:
        return None

    url = url.strip()
    # A whitespace-only value must not turn into a bare "https://".
    if not url:
        return None

    # URL schemes are case-insensitive, so "HTTPS://..." already has one.
    has_http_scheme = url.lower().startswith(("http://", "https://"))

    # Handle malformed URLs (e.g., email addresses used as URLs)
    if "@" in url and not has_http_scheme:
        return None

    # Ensure http/https prefix
    if not has_http_scheme:
        url = "https://" + url

    return url


def determine_institution_type(name: str, heritage_forms: list) -> str:
    """
    Pick the single-letter institution type code for a custodian.

    The decision is made from the lower-cased name only; ``heritage_forms``
    is accepted but not consulted. The checks run in this order:

    * "M" (Museum) when the name contains "museum";
    * "T" (Taste/Smell heritage) when it contains one of the craft/food
      keywords;
    * "I" (Intangible Heritage Group) otherwise, the default for KIEN
      custodians.
    """
    lowered = name.lower()
    taste_keywords = ("bakkerij", "visserij", "imkerij", "pottenbakkerij")

    # Museum wins over every other indicator.
    if "museum" in lowered:
        return "M"

    # Bakery/fishery/beekeeping/pottery names count as Taste/Smell heritage.
    for keyword in taste_keywords:
        if keyword in lowered:
            return "T"

    return "I"


def generate_custodian_name_slug(name: str) -> str:
    """Turn a custodian name into a lower-case, underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped, each run of hyphens/whitespace becomes a single underscore, and
    the result is cut off at 50 characters.
    """
    max_length = 50
    lowered = name.lower()
    without_punctuation = re.sub(r'[^\w\s-]', '', lowered)
    underscored = re.sub(r'[-\s]+', '_', without_punctuation)
    return underscored[:max_length]


def create_entry_yaml(custodian: dict, entry_index: int, wikidata_map: dict) -> dict:
    """Create a YAML entry structure for a KIEN custodian.

    Args:
        custodian: One profile dict. The keys read here are ``name``,
            ``heritage_forms``, ``heritage_forms_linked``, ``website``,
            ``locations``, ``kien_url``, ``extracted_at``/``fetched_at``,
            ``description``, ``email`` and ``phone``. Missing keys and
            explicit ``None`` values are both tolerated for the name, the
            heritage form lists, the locations and the description.
        entry_index: Index stored in the entry's ``entry_index`` field.
        wikidata_map: Cross-reference dict; a Wikidata ID is looked up at
            ``wikidata_map["custodians"][name]["wikidata_id"]``.

    Returns:
        A plain dict ready to be dumped as YAML. Key insertion order is
        meaningful when the caller dumps with ``sort_keys=False``.
    """
    # `or` fallbacks also cover explicit JSON nulls, which the default
    # argument of dict.get() does not.
    name = custodian.get("name") or ""
    cleaned_name, legal_form = extract_legal_form(name)
    kien_url = custodian.get("kien_url")

    # Take one timestamp so every time field of the entry agrees.
    timestamp = datetime.now(timezone.utc).isoformat()

    # Combine plain and linked heritage forms, skipping generic KIEN pages
    # among the linked (dict) items.
    generic_pages = ("Inventaris Immaterieel Erfgoed Nederland", "Publicaties")
    all_heritage_forms = list(custodian.get("heritage_forms") or [])
    for hf in custodian.get("heritage_forms_linked") or []:
        if isinstance(hf, dict) and "name" in hf:
            if hf["name"] not in generic_pages:
                all_heritage_forms.append(hf["name"])
        elif isinstance(hf, str):
            all_heritage_forms.append(hf)

    # Deduplicate while keeping first-seen order. A set would reorder the
    # names differently from run to run and make regenerated files differ.
    all_heritage_forms = list(dict.fromkeys(all_heritage_forms))

    # Determine institution type
    inst_type = determine_institution_type(name, all_heritage_forms)

    # Normalize website
    website = normalize_website(custodian.get("website"))

    # Get locations. When other places are listed too, "Arnhem" is assumed to
    # be KIEN's own location and dropped; a lone "Arnhem" is kept because it
    # may be the organization's real location.
    locations = custodian.get("locations") or []
    if len(locations) > 1 and "Arnhem" in locations:
        locations = [loc for loc in locations if loc != "Arnhem"]

    # Check for Wikidata mapping
    wikidata_id = wikidata_map.get("custodians", {}).get(name, {}).get("wikidata_id")

    # Build entry structure
    entry = {
        "original_entry": {
            "organisatie": name,
            "webadres_organisatie": website,
            "type_organisatie": "intangible_heritage_custodian",
            "systeem": "KIEN",
            "type": [inst_type],
        },
        "entry_index": entry_index,
        "processing_timestamp": timestamp,
        "enrichment_status": "kien_extracted",
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": timestamp,
            "sources": {
                "kien": [{
                    "source_type": "kien_intangible_heritage_registry",
                    "source_url": kien_url,
                    "fetch_timestamp": custodian.get("extracted_at") or custodian.get("fetched_at"),
                    "data_tier": "TIER_2_VERIFIED",
                    "claims_extracted": ["name", "website", "email", "phone", "description", "locations", "heritage_forms"],
                }]
            },
            "data_tier_summary": {
                "TIER_1_AUTHORITATIVE": [],
                "TIER_2_VERIFIED": ["kien_intangible_heritage_registry"],
                "TIER_3_CROWD_SOURCED": [],
                "TIER_4_INFERRED": [],
            },
            "notes": [
                "Entry created from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) registry",
                "Intangible heritage custodian organization",
            ],
        },
        "kien_enrichment": {
            "kien_name": name,
            "kien_url": kien_url,
            "heritage_forms": all_heritage_forms,
            "enrichment_timestamp": timestamp,
            "source": "https://www.immaterieelerfgoed.nl",
        },
    }

    # Add description if available
    description = (custodian.get("description") or "").strip()
    if description:
        entry["notes"] = description

    # Add contact info (only if not KIEN default)
    contact = {}
    email = custodian.get("email")
    phone = custodian.get("phone")

    if email and email != KIEN_DEFAULT_EMAIL:
        contact["email"] = email
    if phone and phone != KIEN_DEFAULT_PHONE:
        contact["phone"] = phone
    if website:
        contact["website"] = website

    if contact:
        entry["contact"] = contact

    # Add locations
    if locations:
        entry["locations"] = [{"city": loc, "country": "NL"} for loc in locations]

    # Add legal form if detected
    if legal_form:
        entry["legal_status"] = {
            "legal_form": legal_form,
            "original_name_with_legal_form": name,
        }

    # Add custodian_name (stored lower-cased, without the legal form)
    entry["custodian_name"] = {
        "claim_type": "custodian_name",
        "claim_value": cleaned_name.lower(),
        "source": "kien_registry",
        "confidence": 0.9,
        "extraction_timestamp": timestamp,
    }

    # Add identifiers
    entry["identifiers"] = [
        {
            "identifier_scheme": "KIEN_URL",
            "identifier_value": kien_url,
            "identifier_url": kien_url,
        }
    ]

    if wikidata_id:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_id}",
        })
        entry["provenance"]["notes"].append(f"Wikidata ID: {wikidata_id}")

    return entry


def main():
    """Main function to generate KIEN custodian entry files.

    Loads the custodian profiles (and the Wikidata cross-reference when that
    file exists), drops non-custodian entries, and writes one YAML file per
    remaining custodian into ``OUTPUT_DIR``. Each custodian's entry index is
    ``STARTING_INDEX`` plus its position in the filtered list, so an index
    is still consumed when writing that custodian fails.

    A failure for one custodian is printed and the run continues; the final
    summary reports the indices actually written and how many entries failed.
    """

    print("Loading KIEN custodian profiles...")
    with open(CUSTODIAN_PROFILES_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    custodians = data.get("custodians", [])
    print(f"Loaded {len(custodians)} custodian profiles")

    # Load Wikidata crossref
    wikidata_map = {}
    if WIKIDATA_CROSSREF_PATH.exists():
        with open(WIKIDATA_CROSSREF_PATH, 'r', encoding='utf-8') as f:
            wikidata_map = json.load(f)
        print(f"Loaded Wikidata crossref with {len(wikidata_map.get('custodians', {}))} custodian mappings")

    # Filter valid custodians
    valid_custodians = [c for c in custodians if is_valid_custodian(c.get("name", ""))]
    skipped = len(custodians) - len(valid_custodians)
    print(f"Filtered to {len(valid_custodians)} valid custodians (skipped {skipped} non-custodian entries)")

    # Create output directory if needed
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Generate entries, remembering which indices were really written so the
    # summary stays correct when some entries fail.
    written_indices = []
    failed_indices = []
    for entry_index, custodian in enumerate(valid_custodians, start=STARTING_INDEX):
        name = custodian.get("name", "Unknown")

        try:
            entry = create_entry_yaml(custodian, entry_index, wikidata_map)

            # Generate filename
            slug = generate_custodian_name_slug(name)
            filename = f"{entry_index}_{slug}.yaml"
            filepath = OUTPUT_DIR / filename

            # Write YAML file
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            written_indices.append(entry_index)
            if len(written_indices) % 20 == 0:
                print(f"Created {len(written_indices)} entries...")

        except Exception as e:
            # Best-effort batch run: report the failure and carry on with the
            # remaining custodians.
            failed_indices.append(entry_index)
            print(f"Error creating entry for '{name}': {e}")

    created_count = len(written_indices)
    print(f"\nDone! Created {created_count} custodian entry files.")
    if written_indices:
        print(f"Entry indices: {written_indices[0]} - {written_indices[-1]}")
    else:
        print("Entry indices: none")
    if failed_indices:
        failed_list = ", ".join(str(index) for index in failed_indices)
        print(f"Failed entries: {len(failed_indices)} (indices not written: {failed_list})")
    print(f"Output directory: {OUTPUT_DIR}")


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()