#!/usr/bin/env python3
"""
Generate Custodian entry YAML files from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) data.

This script reads the extracted custodian profiles from KIEN and generates
individual YAML entry files in the /data/nde/enriched/entries/ format.

Starting entry_index: 1674 (continuing from existing entries)
"""

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

try:
    import yaml
except ImportError:
    # Keep the pure helper functions importable without PyYAML; main() reports
    # the missing dependency before any file is written.
    yaml = None

# Configuration
CUSTODIAN_PROFILES_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/custodian_profiles.json")
WIKIDATA_CROSSREF_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/wikidata_crossref.json")
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
STARTING_INDEX = 1674

# Patterns to identify non-custodian entries (news, educational materials, etc.)
NON_CUSTODIAN_PATTERNS = [
    r"^Lesmateriaal\s+",  # Educational materials
    r"^Nieuws\s+",  # News items
    r"^Publicaties?\s*$",  # Publications pages
]

# KIEN default phone number (used when org doesn't have own phone)
KIEN_DEFAULT_PHONE = "+31263576113"
KIEN_DEFAULT_EMAIL = "info@immaterieelerfgoed.nl"

# "Stichting" is the only legal-form term that is filtered from names. Other
# organizational terms (Vereniging, Federatie, Genootschap, Bond, Broederschap,
# Comité, Commissie, Platform, Kring, Gilde, Jonkheid, Buurt) are kept because
# they describe the organizational purpose.
_STICHTING_PREFIX = re.compile(r"^stichting\s+", re.IGNORECASE)
_STICHTING_SUFFIX = re.compile(r"\s+stichting$", re.IGNORECASE)
_STICHTING_WORD = re.compile(r"\s+stichting\b", re.IGNORECASE)

# Linked heritage-form names that are generic KIEN pages, not heritage forms.
_GENERIC_HERITAGE_PAGES = ("Inventaris Immaterieel Erfgoed Nederland", "Publicaties")


def is_valid_custodian(name: str) -> bool:
    """Check if the entry is a valid custodian (not news/educational material).

    Returns False when *name* matches any of NON_CUSTODIAN_PATTERNS
    (case-insensitively, anchored at the start of the name), True otherwise.
    """
    return not any(
        re.match(pattern, name, re.IGNORECASE) for pattern in NON_CUSTODIAN_PATTERNS
    )


def extract_legal_form(name: str) -> tuple[str, Optional[str]]:
    """
    Extract legal form from organization name and return cleaned name + legal form.

    Per AGENTS.md Rule 8: Legal form terms are filtered from CustodianName.

    Only "Stichting" is treated as a legal form. It is stripped when it is the
    leading or trailing word of the name (case-insensitively); when it appears
    as a whole word elsewhere in the name, the legal form is reported but the
    name is only whitespace-trimmed.

    Returns:
        (cleaned_name, legal_form), where legal_form is "Stichting" or None.
        When no legal form is found, the name is returned unchanged.
    """
    stripped = name.strip()

    # Leading "Stichting X" -> "X"
    without_prefix = _STICHTING_PREFIX.sub("", stripped, count=1)
    if without_prefix != stripped:
        return without_prefix.strip(), "Stichting"

    # Trailing "X Stichting" -> "X" (e.g. "Hidde Nijland Stichting")
    without_suffix = _STICHTING_SUFFIX.sub("", stripped, count=1)
    if without_suffix != stripped:
        return without_suffix.strip(), "Stichting"

    # Whole word in the middle of the name: nothing safe to strip, but the
    # legal form is still known. The word boundary keeps compounds such as
    # "Stichtingshuis" from being mistaken for the legal form.
    if _STICHTING_WORD.search(stripped):
        return stripped, "Stichting"

    return name, None


def normalize_website(url: Optional[str]) -> Optional[str]:
    """Normalize website URL.

    Returns None for missing, blank, or e-mail-like values; otherwise returns
    the trimmed URL, prefixed with "https://" when it has no http(s) scheme.
    """
    if not url:
        return None

    url = url.strip()
    if not url:
        # Whitespace-only input must not turn into a bare "https://".
        return None

    lowered = url.lower()

    # Handle malformed URLs (e.g., email addresses used as URLs)
    if "@" in url and not lowered.startswith("http"):
        return None

    # Ensure http/https prefix (scheme comparison is case-insensitive)
    if not lowered.startswith(("http://", "https://")):
        url = "https://" + url

    return url


def determine_institution_type(name: str, heritage_forms: list) -> str:
    """
    Determine the institution type based on name and heritage forms.

    KIEN custodians are primarily Intangible Heritage Groups (I),
    but some may be other types.

    Args:
        name: Organization name; matched case-insensitively by substring.
        heritage_forms: Accepted for interface compatibility; not consulted.

    Returns:
        "M" (museum), "T" (taste/smell heritage) or "I" (intangible heritage group).
    """
    name_lower = name.lower()

    # Check for museum indicators
    if "museum" in name_lower:
        return "M"  # Museum

    # Check for bakery/food heritage (Taste/Smell)
    # NOTE(review): "pottenbakkerij" (pottery) also matches via the "bakkerij"
    # substring and is therefore classified as "T" - confirm this is intended.
    if any(x in name_lower for x in ["bakkerij", "visserij", "imkerij", "pottenbakkerij"]):
        return "T"  # Taste/Smell heritage

    # Default to Intangible Heritage Group
    return "I"


def generate_custodian_name_slug(name: str) -> str:
    """Generate a URL-safe slug from the custodian name.

    Lowercases the name, drops every character that is not a word character,
    whitespace or hyphen, collapses runs of hyphens/whitespace into a single
    underscore, and truncates the result to 50 characters.
    """
    slug = re.sub(r'[^\w\s-]', '', name.lower())
    slug = re.sub(r'[-\s]+', '_', slug)
    return slug[:50]  # Limit length


def create_entry_yaml(custodian: dict, entry_index: int, wikidata_map: dict) -> dict:
    """Create a YAML entry structure for a KIEN custodian.

    Args:
        custodian: One custodian profile as loaded from the profiles JSON.
            Missing or null fields are treated as empty.
        entry_index: Index stored in the entry's "entry_index" field.
        wikidata_map: Cross-reference data; a Wikidata ID is looked up under
            wikidata_map["custodians"][<name>]["wikidata_id"].

    Returns:
        A dict ready to be serialized as one YAML entry file.
    """
    # "or" guards against JSON null values as well as missing keys.
    name = custodian.get("name") or ""
    cleaned_name, legal_form = extract_legal_form(name)

    # One timestamp for the whole entry keeps all of its time fields consistent.
    now = datetime.now(timezone.utc).isoformat()

    # Combine plain and linked heritage forms
    all_heritage_forms = list(custodian.get("heritage_forms") or [])
    for hf in custodian.get("heritage_forms_linked") or []:
        if isinstance(hf, dict) and "name" in hf:
            # Filter out generic pages
            if hf["name"] not in _GENERIC_HERITAGE_PAGES:
                all_heritage_forms.append(hf["name"])
        elif isinstance(hf, str):
            all_heritage_forms.append(hf)

    # Deduplicate while preserving first-seen order, so the generated YAML is
    # reproducible across runs (a set would reorder the forms arbitrarily).
    all_heritage_forms = list(dict.fromkeys(all_heritage_forms))

    inst_type = determine_institution_type(name, all_heritage_forms)
    website = normalize_website(custodian.get("website"))

    # Filter out "Arnhem" if it's KIEN's location, not the org's. A lone
    # "Arnhem" is kept: it may be KIEN's default, but it may also be genuine.
    locations = custodian.get("locations") or []
    if len(locations) > 1 and "Arnhem" in locations:
        locations = [loc for loc in locations if loc != "Arnhem"]

    # Check for Wikidata mapping
    wikidata_id = wikidata_map.get("custodians", {}).get(name, {}).get("wikidata_id")

    kien_url = custodian.get("kien_url")

    # Build entry structure (key order is preserved in the YAML output)
    entry = {
        "original_entry": {
            "organisatie": name,
            "webadres_organisatie": website,
            "type_organisatie": "intangible_heritage_custodian",
            "systeem": "KIEN",
            "type": [inst_type],
        },
        "entry_index": entry_index,
        "processing_timestamp": now,
        "enrichment_status": "kien_extracted",
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": now,
            "sources": {
                "kien": [{
                    "source_type": "kien_intangible_heritage_registry",
                    "source_url": kien_url,
                    "fetch_timestamp": custodian.get("extracted_at") or custodian.get("fetched_at"),
                    "data_tier": "TIER_2_VERIFIED",
                    "claims_extracted": ["name", "website", "email", "phone", "description", "locations", "heritage_forms"],
                }]
            },
            "data_tier_summary": {
                "TIER_1_AUTHORITATIVE": [],
                "TIER_2_VERIFIED": ["kien_intangible_heritage_registry"],
                "TIER_3_CROWD_SOURCED": [],
                "TIER_4_INFERRED": [],
            },
            "notes": [
                "Entry created from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) registry",
                "Intangible heritage custodian organization",
            ],
        },
        "kien_enrichment": {
            "kien_name": name,
            "kien_url": kien_url,
            "heritage_forms": all_heritage_forms,
            "enrichment_timestamp": now,
            "source": "https://www.immaterieelerfgoed.nl",
        },
    }

    # Add description if available (a null description is treated as empty)
    description = (custodian.get("description") or "").strip()
    if description:
        entry["notes"] = description

    # Add contact info (only if not KIEN default)
    contact = {}
    email = custodian.get("email")
    phone = custodian.get("phone")

    if email and email != KIEN_DEFAULT_EMAIL:
        contact["email"] = email
    if phone and phone != KIEN_DEFAULT_PHONE:
        contact["phone"] = phone
    if website:
        contact["website"] = website

    if contact:
        entry["contact"] = contact

    # Add locations
    if locations:
        entry["locations"] = [{"city": loc, "country": "NL"} for loc in locations]

    # Add legal form if detected
    if legal_form:
        entry["legal_status"] = {
            "legal_form": legal_form,
            "original_name_with_legal_form": name,
        }

    # Add custodian_name
    entry["custodian_name"] = {
        "claim_type": "custodian_name",
        "claim_value": cleaned_name.lower(),
        "source": "kien_registry",
        "confidence": 0.9,
        "extraction_timestamp": now,
    }

    # Add identifiers
    entry["identifiers"] = [
        {
            "identifier_scheme": "KIEN_URL",
            "identifier_value": kien_url,
            "identifier_url": kien_url,
        }
    ]

    if wikidata_id:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_id}",
        })
        entry["provenance"]["notes"].append(f"Wikidata ID: {wikidata_id}")

    return entry


def main():
    """Main function to generate KIEN custodian entry files.

    Loads the custodian profiles (and the Wikidata cross-reference, if the
    file exists), filters out non-custodian entries, and writes one YAML file
    per custodian into OUTPUT_DIR. Entry indices are assigned by position in
    the filtered list, so an entry that fails leaves a gap rather than
    shifting the indices of later entries.
    """
    if yaml is None:
        raise SystemExit("PyYAML is required to write entry files (pip install pyyaml).")

    print("Loading KIEN custodian profiles...")

    with open(CUSTODIAN_PROFILES_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    custodians = data.get("custodians", [])
    print(f"Loaded {len(custodians)} custodian profiles")

    # Load Wikidata crossref
    wikidata_map = {}
    if WIKIDATA_CROSSREF_PATH.exists():
        with open(WIKIDATA_CROSSREF_PATH, 'r', encoding='utf-8') as f:
            wikidata_map = json.load(f)
        print(f"Loaded Wikidata crossref with {len(wikidata_map.get('custodians', {}))} custodian mappings")

    # Filter valid custodians ("or" keeps a null name from crashing re.match)
    valid_custodians = [c for c in custodians if is_valid_custodian(c.get("name") or "")]
    skipped = len(custodians) - len(valid_custodians)
    print(f"Filtered to {len(valid_custodians)} valid custodians (skipped {skipped} non-custodian entries)")

    # Create output directory if needed
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Generate entries
    written_indices = []
    failed_count = 0

    for entry_index, custodian in enumerate(valid_custodians, start=STARTING_INDEX):
        name = custodian.get("name") or "Unknown"

        # Batch boundary: one bad profile must not abort the whole run, so any
        # error is reported and counted, and processing continues.
        try:
            entry = create_entry_yaml(custodian, entry_index, wikidata_map)

            # Generate filename
            slug = generate_custodian_name_slug(name)
            filepath = OUTPUT_DIR / f"{entry_index}_{slug}.yaml"

            # Write YAML file
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        except Exception as e:
            failed_count += 1
            print(f"Error creating entry for '{name}': {e}")
            continue

        written_indices.append(entry_index)
        if len(written_indices) % 20 == 0:
            print(f"Created {len(written_indices)} entries...")

    created_count = len(written_indices)
    print(f"\nDone! Created {created_count} custodian entry files.")
    if written_indices:
        # Report the indices actually written; failures leave gaps in between.
        print(f"Entry indices: {written_indices[0]} - {written_indices[-1]}")
    else:
        print("Entry indices: none")
    if failed_count:
        print(f"Failed entries: {failed_count} (their indices were skipped)")
    print(f"Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()