#!/usr/bin/env python3
"""
Generate Custodian entry YAML files from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) data.

This script reads the extracted custodian profiles from KIEN and generates
individual YAML entry files in the /data/nde/enriched/entries/ format.

Starting entry_index: 1674 (continuing from existing entries)
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Input files: the custodian profiles extracted from KIEN and the Wikidata
# cross-reference (the latter is only read when the file exists).
CUSTODIAN_PROFILES_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/custodian_profiles.json")
WIKIDATA_CROSSREF_PATH = Path("/Users/kempersc/apps/glam/data/intangible_heritage/wikidata_crossref.json")

# Directory that receives one YAML file per custodian.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")

# First entry_index assigned by this script (continues from existing entries).
STARTING_INDEX = 1674

# KIEN's own contact details; profiles carry these when the organization has
# none of its own, so they are not copied into the generated entries.
KIEN_DEFAULT_PHONE = "+31263576113"
KIEN_DEFAULT_EMAIL = "info@immaterieelerfgoed.nl"

# Name patterns that mark a profile as something other than a custodian
# (news, educational materials, etc.).
NON_CUSTODIAN_PATTERNS = [
    r"^Lesmateriaal\s+",    # Educational materials
    r"^Nieuws\s+",          # News items
    r"^Publicaties?\s*$",   # Publications pages
]
|
|
|
|
|
|
def is_valid_custodian(name: str) -> bool:
    """Return True unless *name* matches one of NON_CUSTODIAN_PATTERNS.

    Each pattern is applied with ``re.match`` (anchored at the start of the
    name) and ``re.IGNORECASE``; a single hit marks the entry as news or
    educational material rather than a custodian.
    """
    return not any(
        re.match(non_custodian, name, re.IGNORECASE)
        for non_custodian in NON_CUSTODIAN_PATTERNS
    )
|
|
|
|
|
|
def extract_legal_form(name: str) -> tuple[str, Optional[str]]:
    """
    Extract legal form from organization name and return cleaned name + legal form.

    Per AGENTS.md Rule 8: Legal form terms are filtered from CustodianName.

    Only "Stichting" is treated as a legal form. Terms such as Vereniging,
    Federatie, Genootschap, Bond, Broederschap, Comité, Commissie, Platform,
    Kring, Gilde, Jonkheid and Buurt describe the organizational purpose and
    are deliberately left in the name.

    Args:
        name: Organization name as found in the source data.

    Returns:
        ``(cleaned_name, legal_form)``. ``legal_form`` is ``"Stichting"`` when
        the whole word occurs as a prefix or later in the name, otherwise
        ``None``. The word is removed from the name only when it is the
        leading or trailing term; a mid-name occurrence is reported but the
        name is returned unchanged.
    """
    prefix = "stichting "

    # Leading "Stichting <name>" (any casing).
    if name.lower().startswith(prefix):
        return name[len(prefix):].strip(), "Stichting"

    # "<name> Stichting", e.g. "Hidde Nijland Stichting". The word boundary
    # keeps longer words such as "stichtingen" from being misread as the legal
    # form, and detection and stripping are both case-insensitive so the two
    # can no longer disagree.
    if re.search(r"\s+stichting\b", name, re.IGNORECASE):
        cleaned_name = re.sub(r"\s+stichting\s*$", "", name, flags=re.IGNORECASE).strip()
        return cleaned_name, "Stichting"

    return name, None
|
|
|
|
|
|
def normalize_website(url: Optional[str]) -> Optional[str]:
    """Normalize website URL.

    Args:
        url: Raw website value from the source data; may be ``None``, empty,
            padded with whitespace, missing a scheme, or not a URL at all.

    Returns:
        The stripped URL with an ``https://`` prefix added when no
        ``http://``/``https://`` scheme is present, or ``None`` when the value
        is empty, whitespace-only, or looks like an email address.
    """
    if not url:
        return None

    url = url.strip()
    # A whitespace-only value must not turn into the bare string "https://".
    if not url:
        return None

    # Compare schemes case-insensitively so "HTTP://..." is recognised as
    # already having one instead of being prefixed a second time.
    lowered = url.lower()

    # Handle malformed URLs (e.g., email addresses used as URLs)
    if "@" in url and not lowered.startswith("http"):
        return None

    # Ensure http/https prefix
    if not lowered.startswith(("http://", "https://")):
        url = "https://" + url

    return url
|
|
|
|
|
|
def determine_institution_type(name: str, heritage_forms: list) -> str:
    """
    Classify a KIEN custodian by keywords in its name.

    KIEN custodians are primarily Intangible Heritage Groups (I), but the
    name can point to another type: "museum" yields "M" (Museum), and a
    bakery/fishery/beekeeping/pottery keyword yields "T" (Taste/Smell
    heritage). The museum check takes precedence.

    ``heritage_forms`` is accepted as part of the signature but is not
    consulted by the current rules.
    """
    lowered = name.lower()
    taste_keywords = ["bakkerij", "visserij", "imkerij", "pottenbakkerij"]

    if "museum" in lowered:
        type_code = "M"  # Museum
    elif any(keyword in lowered for keyword in taste_keywords):
        type_code = "T"  # Taste/Smell heritage
    else:
        type_code = "I"  # Intangible Heritage Group (default)

    return type_code
|
|
|
|
|
|
def generate_custodian_name_slug(name: str) -> str:
    """Build a lowercase, underscore-separated slug from a custodian name.

    Characters other than word characters, whitespace and hyphens are
    dropped, every run of hyphens/whitespace becomes a single underscore,
    and the result is cut to at most 50 characters.
    """
    max_length = 50
    kept_chars = re.sub(r'[^\w\s-]', '', name.lower())
    # Splitting on separator runs and re-joining with "_" is the same as
    # substituting each run with "_".
    pieces = re.split(r'[-\s]+', kept_chars)
    return "_".join(pieces)[:max_length]
|
|
|
|
|
|
def create_entry_yaml(custodian: dict, entry_index: int, wikidata_map: dict) -> dict:
    """Create a YAML entry structure for a KIEN custodian.

    Args:
        custodian: One custodian profile dict. Keys read here: ``name``,
            ``heritage_forms``, ``heritage_forms_linked``, ``website``,
            ``locations``, ``kien_url``, ``extracted_at``/``fetched_at``,
            ``description``, ``email`` and ``phone``. Missing keys and
            explicit ``None`` values are both tolerated.
        entry_index: Index stored in the entry as ``entry_index``.
        wikidata_map: Cross-reference dict, looked up as
            ``wikidata_map["custodians"][name]["wikidata_id"]``; any missing
            level simply yields no Wikidata identifier.

    Returns:
        A plain dict (nested dicts/lists/strings) ready to be serialized.
    """
    # ``or`` rather than a .get() default: a key that is present but null in
    # the JSON would otherwise reach .lower()/.strip()/iteration as None.
    name = custodian.get("name") or ""
    cleaned_name, legal_form = extract_legal_form(name)
    kien_url = custodian.get("kien_url")

    # One timestamp for the whole entry so all of its time fields agree.
    now_iso = datetime.now(timezone.utc).isoformat()

    # Combine plain and linked heritage forms
    all_heritage_forms = list(custodian.get("heritage_forms") or [])
    generic_pages = ("Inventaris Immaterieel Erfgoed Nederland", "Publicaties")
    for hf in custodian.get("heritage_forms_linked") or []:
        if isinstance(hf, dict) and "name" in hf:
            # Filter out generic pages
            if hf["name"] not in generic_pages:
                all_heritage_forms.append(hf["name"])
        elif isinstance(hf, str):
            all_heritage_forms.append(hf)

    # Deduplicate while keeping first-seen order. A set would reorder the
    # strings differently on every run and make the generated YAML unstable.
    all_heritage_forms = list(dict.fromkeys(all_heritage_forms))

    # Determine institution type
    inst_type = determine_institution_type(name, all_heritage_forms)

    # Normalize website
    website = normalize_website(custodian.get("website"))

    # Get locations. "Arnhem" is dropped when other places are listed, since
    # it may be KIEN's location rather than the organization's. A lone
    # "Arnhem" is kept: it might be KIEN's default, but it is all we have.
    locations = custodian.get("locations") or []
    if len(locations) > 1 and "Arnhem" in locations:
        locations = [loc for loc in locations if loc != "Arnhem"]

    # Check for Wikidata mapping
    wikidata_id = wikidata_map.get("custodians", {}).get(name, {}).get("wikidata_id")

    # Build entry structure
    entry = {
        "original_entry": {
            "organisatie": name,
            "webadres_organisatie": website,
            "type_organisatie": "intangible_heritage_custodian",
            "systeem": "KIEN",
            "type": [inst_type],
        },
        "entry_index": entry_index,
        "processing_timestamp": now_iso,
        "enrichment_status": "kien_extracted",
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": now_iso,
            "sources": {
                "kien": [{
                    "source_type": "kien_intangible_heritage_registry",
                    "source_url": kien_url,
                    "fetch_timestamp": custodian.get("extracted_at") or custodian.get("fetched_at"),
                    "data_tier": "TIER_2_VERIFIED",
                    "claims_extracted": ["name", "website", "email", "phone", "description", "locations", "heritage_forms"],
                }]
            },
            "data_tier_summary": {
                "TIER_1_AUTHORITATIVE": [],
                "TIER_2_VERIFIED": ["kien_intangible_heritage_registry"],
                "TIER_3_CROWD_SOURCED": [],
                "TIER_4_INFERRED": [],
            },
            "notes": [
                "Entry created from KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) registry",
                "Intangible heritage custodian organization",
            ],
        },
        "kien_enrichment": {
            "kien_name": name,
            "kien_url": kien_url,
            "heritage_forms": all_heritage_forms,
            "enrichment_timestamp": now_iso,
            "source": "https://www.immaterieelerfgoed.nl",
        },
    }

    # Add description if available
    # NOTE(review): the description is stored under the top-level "notes" key
    # (separate from provenance["notes"]) — confirm this is the intended key.
    description = (custodian.get("description") or "").strip()
    if description:
        entry["notes"] = description

    # Add contact info (only if not KIEN default)
    contact = {}
    email = custodian.get("email")
    phone = custodian.get("phone")

    if email and email != KIEN_DEFAULT_EMAIL:
        contact["email"] = email
    if phone and phone != KIEN_DEFAULT_PHONE:
        contact["phone"] = phone
    if website:
        contact["website"] = website

    if contact:
        entry["contact"] = contact

    # Add locations
    if locations:
        entry["locations"] = [{"city": loc, "country": "NL"} for loc in locations]

    # Add legal form if detected
    if legal_form:
        entry["legal_status"] = {
            "legal_form": legal_form,
            "original_name_with_legal_form": name,
        }

    # Add custodian_name
    entry["custodian_name"] = {
        "claim_type": "custodian_name",
        "claim_value": cleaned_name.lower(),
        "source": "kien_registry",
        "confidence": 0.9,
        "extraction_timestamp": now_iso,
    }

    # Add identifiers
    entry["identifiers"] = [
        {
            "identifier_scheme": "KIEN_URL",
            "identifier_value": kien_url,
            "identifier_url": kien_url,
        }
    ]

    if wikidata_id:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wikidata_id,
            "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_id}",
        })
        entry["provenance"]["notes"].append(f"Wikidata ID: {wikidata_id}")

    return entry
|
|
|
|
|
|
def main():
    """Main function to generate KIEN custodian entry files.

    Loads the custodian profiles (and the Wikidata cross-reference when that
    file exists), drops non-custodian entries, and writes one YAML file per
    remaining custodian into OUTPUT_DIR. Each custodian's entry_index is
    STARTING_INDEX plus its position in the filtered list, so a custodian
    that fails leaves its index unused rather than shifting the others.
    Failures are reported and do not stop the run.
    """
    print("Loading KIEN custodian profiles...")
    with open(CUSTODIAN_PROFILES_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    custodians = data.get("custodians", [])
    print(f"Loaded {len(custodians)} custodian profiles")

    # Load Wikidata crossref
    wikidata_map = {}
    if WIKIDATA_CROSSREF_PATH.exists():
        with open(WIKIDATA_CROSSREF_PATH, 'r', encoding='utf-8') as f:
            wikidata_map = json.load(f)
        print(f"Loaded Wikidata crossref with {len(wikidata_map.get('custodians', {}))} custodian mappings")

    # Filter valid custodians
    valid_custodians = [c for c in custodians if is_valid_custodian(c.get("name", ""))]
    skipped = len(custodians) - len(valid_custodians)
    print(f"Filtered to {len(valid_custodians)} valid custodians (skipped {skipped} non-custodian entries)")

    # Create output directory if needed
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Generate entries
    written_indices = []
    failed_count = 0
    for i, custodian in enumerate(valid_custodians):
        entry_index = STARTING_INDEX + i
        name = custodian.get("name", "Unknown")

        try:
            entry = create_entry_yaml(custodian, entry_index, wikidata_map)

            # Generate filename
            slug = generate_custodian_name_slug(name)
            filename = f"{entry_index}_{slug}.yaml"
            filepath = OUTPUT_DIR / filename

            # Serialize before opening the file so a serialization error
            # cannot leave a truncated YAML file behind.
            yaml_text = yaml.dump(entry, default_flow_style=False, allow_unicode=True, sort_keys=False)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(yaml_text)

            written_indices.append(entry_index)
            if len(written_indices) % 20 == 0:
                print(f"Created {len(written_indices)} entries...")

        except Exception as e:
            # Best effort: report the failing custodian and carry on.
            failed_count += 1
            print(f"Error creating entry for '{name}': {e}")

    created_count = len(written_indices)
    print(f"\nDone! Created {created_count} custodian entry files.")
    if failed_count:
        print(f"Failed to create {failed_count} entries (their indices were left unused).")
    # Report the indices actually written; deriving the range from the count
    # is wrong as soon as one entry fails, or when nothing was written.
    if written_indices:
        print(f"Entry indices: {written_indices[0]} - {written_indices[-1]}")
    else:
        print("Entry indices: none")
    print(f"Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
|