#!/usr/bin/env python3
"""
Enrich NDE Dutch Heritage Organizations with Wikidata Q-numbers using MCP service.

This script must be run from OpenCode with access to the Wikidata MCP service
tools. It prepares a SPARQL query for each organization and logs every query
(and, once available, its results) as a JSON file.
"""

import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Number of records for which main() prepares queries.
PREVIEW_LIMIT = 10

# Organization type (lower-cased value of `type_organisatie`) -> Wikidata class.
_TYPE_CLASSES = {
    'museum': 'wd:Q33506',
    'archief': 'wd:Q166118',
    'bibliotheek': 'wd:Q7075',
    'historische vereniging': 'wd:Q1964266',
}

# Characters that must be escaped inside a double-quoted SPARQL string literal.
# Backslash is included so that a name ending in "\" cannot un-escape the
# closing quote of the literal.
_SPARQL_ESCAPES = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
})


def _escape_sparql_literal(text: str) -> str:
    """Escape *text* for embedding in a double-quoted SPARQL string literal."""
    return text.translate(_SPARQL_ESCAPES)


def create_sparql_query(org_name: str, org_type: Optional[str],
                        city: Optional[str], country_code: str = "Q55") -> str:
    """
    Create SPARQL query for finding heritage organizations.

    The query selects items located in the given country whose Dutch or
    English label contains ``org_name`` (case-insensitively), optionally
    restricted to a Wikidata class derived from ``org_type``.

    Args:
        org_name: Organization name; matched as a lower-cased substring.
        org_type: Type (museum, archief, etc.). Unknown, empty or non-string
            values simply omit the type constraint.
        city: City name. Accepted for interface compatibility; it is not
            currently used in the generated query.
        country_code: Wikidata Q-code for country (default: Q55 = Netherlands)

    Returns:
        SPARQL query string
    """
    lines = ["SELECT ?item ?itemLabel ?isil ?viaf ?website WHERE {"]

    # Type constraint (instance of, or subclass of, the mapped class).
    type_key = org_type.strip().lower() if isinstance(org_type, str) else None
    if type_key in _TYPE_CLASSES:
        lines.append(f"  ?item wdt:P31/wdt:P279* {_TYPE_CLASSES[type_key]} .")

    # Country constraint
    lines.append(f"  ?item wdt:P17 wd:{country_code} .")

    # Bind the label explicitly. The wikibase:label service fills in
    # ?itemLabel only at the very end of query execution, so a FILTER on a
    # service-provided label never sees a bound value and matches nothing.
    lines.append("  ?item rdfs:label ?itemLabel .")
    lines.append('  FILTER(LANG(?itemLabel) IN ("nl", "en"))')

    # Optional properties
    lines.extend([
        "  OPTIONAL { ?item wdt:P791 ?isil }",
        "  OPTIONAL { ?item wdt:P214 ?viaf }",
        "  OPTIONAL { ?item wdt:P856 ?website }",
    ])

    # Name filter
    needle = _escape_sparql_literal(org_name.lower())
    lines.append(f'  FILTER(CONTAINS(LCASE(STR(?itemLabel)), "{needle}"))')

    lines.append("}")
    lines.append("LIMIT 20")
    return "\n".join(lines)


def log_enrichment_attempt(org_record: Dict, q_number: Optional[str],
                           method: str, confidence: str,
                           sparql_query: Optional[str],
                           sparql_results: Optional[str],
                           sparql_dir: Path) -> Path:
    """
    Log an enrichment attempt with SPARQL query and results.

    Writes one JSON file named ``<timestamp>_<name>_<confidence>.json`` into
    ``sparql_dir``, creating the directory if it does not exist yet.

    Args:
        org_record: Organization record being enriched
        q_number: Wikidata Q-number found (or None)
        method: Method used ('search_api', 'sparql', 'manual')
        confidence: Confidence level ('high', 'medium', 'low', 'none')
        sparql_query: SPARQL query used (if any)
        sparql_results: SPARQL results (if any)
        sparql_dir: Directory to save logs

    Returns:
        Path to log file
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # ':' and '.' are replaced so the timestamp is safe in a file name.
    safe_timestamp = timestamp.replace(':', '-').replace('.', '-')

    # The key may be present with a null (or non-string) value in the YAML.
    org_name = str(org_record.get('organisatie') or 'unknown')
    safe_name = "".join(
        ch if ch.isalnum() or ch in '-_' else '_' for ch in org_name[:50]
    )

    log_dir = Path(sparql_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / f"{safe_timestamp}_{safe_name}_{confidence}.json"

    log_entry = {
        'timestamp': timestamp,
        'organization': {
            'name': org_name,
            'type': org_record.get('type_organisatie'),
            'city': org_record.get('plaatsnaam_bezoekadres'),
            'isil': org_record.get('isil-code_na'),
            'website': org_record.get('webadres_organisatie'),
        },
        'enrichment': {
            'wikidata_id': q_number,
            'method': method,
            'confidence': confidence,
        },
        'sparql': {
            'query': sparql_query,
            'results': sparql_results,
        },
    }

    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(log_entry, f, indent=2, ensure_ascii=False)

    return log_path


def main() -> None:
    """
    Main enrichment process.

    Loads the NDE organization list, prints instructions for the interactive
    workflow and prepares (and logs) a SPARQL query for the first
    ``PREVIEW_LIMIT`` records.

    NOTE: This script is designed to be run interactively from OpenCode
    with access to the Wikidata MCP service tools.
    """
    # Imported here rather than at module level so the helper functions above
    # stay importable in environments without PyYAML; only main() needs it.
    import yaml

    banner = "=" * 80

    print(banner)
    print("NDE WIKIDATA ENRICHMENT WITH MCP SERVICE")
    print(banner)
    print()

    # Paths
    data_path = Path("data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml")
    sparql_dir = Path("data/nde/sparql")
    sparql_dir.mkdir(parents=True, exist_ok=True)

    # Load data
    print(f"Loading data from {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; treat that as no records.
        records = yaml.safe_load(f) or []

    print(f"Total records: {len(records)}")
    print()

    print(banner)
    print("INSTRUCTIONS FOR INTERACTIVE ENRICHMENT")
    print(banner)
    print()
    print("This script prepares SPARQL queries for each organization.")
    print("The OpenCode assistant should:")
    print()
    print("1. For each organization, use wikidata-authenticated_search_entity")
    print("   to find the Q-number")
    print()
    print("2. Verify the match using wikidata-authenticated_get_metadata")
    print()
    print("3. Log the result using the provided log_enrichment_attempt function")
    print()
    print("4. Update the YAML file with wikidata_id field")
    print()
    print(banner)
    print()

    # Generate queries for the first PREVIEW_LIMIT records
    print(f"Generating SPARQL queries for first {PREVIEW_LIMIT} records...")
    print()

    batch = records[:PREVIEW_LIMIT]
    total = len(batch)  # may be smaller than PREVIEW_LIMIT for short lists
    for idx, record in enumerate(batch, 1):
        org_name = str(record.get('organisatie') or 'Unknown')
        print(f"[{idx}/{total}] {org_name}")

        # Create SPARQL query
        query = create_sparql_query(
            org_name=org_name,
            org_type=record.get('type_organisatie'),
            city=record.get('plaatsnaam_bezoekadres'),
        )

        # Log the query
        log_enrichment_attempt(
            org_record=record,
            q_number=None,
            method='prepared',
            confidence='pending',
            sparql_query=query,
            sparql_results=None,
            sparql_dir=sparql_dir,
        )

        print("  Query prepared and logged")

    print()
    print(banner)
    print("QUERIES PREPARED")
    print(banner)
    print()
    print(f"SPARQL queries saved to: {sparql_dir}")
    print()
    print("Next step: Run interactive enrichment with OpenCode assistant")
    print("           using the Wikidata MCP service tools")


if __name__ == "__main__":
    main()