glam/scripts/prepare_wikidata_enrichment.py
2025-11-19 23:25:22 +01:00

206 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
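Prepare Wikidata enrichment for NDE Dutch heritage organizations.

This script is meant to be run from OpenCode with access to the Wikidata MCP
service tools. It does not query Wikidata itself: it builds a SPARQL query for
each of the first 10 records in the NDE YAML list and logs every query as a
JSON file, so the Q-numbers can then be looked up and verified interactively.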
"""
import yaml
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
import time
def create_sparql_query(org_name: str, org_type: Optional[str],
                        city: Optional[str], country_code: str = "Q55") -> str:
    """
    Create SPARQL query for finding heritage organizations.

    The query selects items in the given country, optionally restricted to
    a Wikidata class derived from ``org_type``, whose label contains the
    lower-cased ``org_name``. At most 20 rows are requested.

    Args:
        org_name: Organization name. It is lower-cased and embedded as a
            SPARQL string literal with backslashes, double quotes and line
            breaks escaped.
        org_type: Type (museum, archief, etc.), matched case-insensitively.
            Empty or unmapped types add no class constraint.
        city: City name. Not used by the query at present; kept so existing
            callers keep working.
        country_code: Wikidata Q-code for country (default: Q55 = Netherlands)

    Returns:
        SPARQL query string
    """
    # Map types to Wikidata classes
    type_map = {
        'museum': 'wd:Q33506',
        'archief': 'wd:Q166118',
        'bibliotheek': 'wd:Q7075',
        'historische vereniging': 'wd:Q1964266'
    }

    # Build query
    parts = ["SELECT ?item ?itemLabel ?isil ?viaf ?website WHERE {"]

    # Type constraint (only for types we know how to map)
    wikidata_class = type_map.get(org_type.lower()) if org_type else None
    if wikidata_class is not None:
        parts.append(f" ?item wdt:P31/wdt:P279* {wikidata_class} .")

    # Country constraint
    parts.append(f" ?item wdt:P17 wd:{country_code} .")

    # Optional properties
    parts.extend([
        " OPTIONAL { ?item wdt:P791 ?isil }",
        " OPTIONAL { ?item wdt:P214 ?viaf }",
        " OPTIONAL { ?item wdt:P856 ?website }",
        ' SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" }'
    ])

    # Name filter. All characters that could terminate or corrupt the
    # double-quoted literal are escaped in a single pass, so an already
    # escaped backslash can never be escaped twice or left dangling.
    literal_escapes = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
    })
    safe_name = org_name.lower().translate(literal_escapes)
    # NOTE(review): ?itemLabel is produced by the label SERVICE above; confirm
    # that the query service allows filtering on it before relying on results.
    parts.append(f' FILTER(CONTAINS(LCASE(?itemLabel), "{safe_name}"))')

    parts.append("}")
    parts.append("LIMIT 20")
    return "\n".join(parts)
def _filename_component(text: str) -> str:
    """Replace every character that is not alphanumeric, '-' or '_' with '_'."""
    return "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in text)


def log_enrichment_attempt(org_record: Dict, q_number: Optional[str],
                           method: str, confidence: str,
                           sparql_query: Optional[str],
                           sparql_results: Optional[str],
                           sparql_dir: Path) -> Path:
    """
    Log an enrichment attempt with SPARQL query and results.

    One JSON file is written per call. Its name is built from the current
    UTC timestamp, the first 50 characters of the organization name and the
    confidence value, each reduced to filename-safe characters.

    Args:
        org_record: Organization record being enriched. A missing or null
            'organisatie' value is logged as 'unknown'.
        q_number: Wikidata Q-number found (or None)
        method: Method used ('search_api', 'sparql', 'manual')
        confidence: Confidence level ('high', 'medium', 'low', 'none')
        sparql_query: SPARQL query used (if any)
        sparql_results: SPARQL results (if any)
        sparql_dir: Directory to save logs; created if it does not exist

    Returns:
        Path to log file
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # ':' and '.' are replaced so the timestamp is usable in a filename.
    safe_timestamp = timestamp.replace(':', '-').replace('.', '-')

    # .get() with a default does not help when the key exists with a null
    # value, so the fallback is applied explicitly.
    org_name = org_record.get('organisatie')
    if org_name is None:
        org_name = 'unknown'

    # str() guards against non-string names or confidence values.
    safe_name = _filename_component(str(org_name)[:50])
    safe_confidence = _filename_component(str(confidence))

    log_dir = Path(sparql_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / f"{safe_timestamp}_{safe_name}_{safe_confidence}.json"

    log_entry = {
        'timestamp': timestamp,
        'organization': {
            'name': org_name,
            'type': org_record.get('type_organisatie'),
            'city': org_record.get('plaatsnaam_bezoekadres'),
            'isil': org_record.get('isil-code_na'),
            'website': org_record.get('webadres_organisatie')
        },
        'enrichment': {
            'wikidata_id': q_number,
            'method': method,
            # The unsanitized value is kept in the log body; only the
            # filename uses the sanitized form.
            'confidence': confidence
        },
        'sparql': {
            'query': sparql_query,
            'results': sparql_results
        }
    }

    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(log_entry, f, indent=2, ensure_ascii=False)

    return log_path
def main(limit: int = 10):
    """
    Main enrichment process.

    Loads the NDE organization list from YAML, prints the instructions for
    the interactive enrichment, then builds and logs one SPARQL query for
    each of the first ``limit`` records.

    NOTE: This script is designed to be run interactively from OpenCode
    with access to the Wikidata MCP service tools.

    Args:
        limit: Maximum number of records to prepare queries for (default 10).

    Raises:
        TypeError: If the YAML document is not a list of records.
    """
    banner = "=" * 80

    print(banner)
    print("NDE WIKIDATA ENRICHMENT WITH MCP SERVICE")
    print(banner)
    print()

    # Paths
    data_path = Path("data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml")
    sparql_dir = Path("data/nde/sparql")
    sparql_dir.mkdir(parents=True, exist_ok=True)

    # Load data
    print(f"Loading data from {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no records".
        records = yaml.safe_load(f) or []
    if not isinstance(records, list):
        raise TypeError(
            f"Expected a list of records in {data_path}, "
            f"got {type(records).__name__}"
        )

    print(f"Total records: {len(records)}")
    print()

    print(banner)
    print("INSTRUCTIONS FOR INTERACTIVE ENRICHMENT")
    print(banner)
    instructions = (
        "",
        "This script prepares SPARQL queries for each organization.",
        "The OpenCode assistant should:",
        "",
        "1. For each organization, use wikidata-authenticated_search_entity",
        " to find the Q-number",
        "",
        "2. Verify the match using wikidata-authenticated_get_metadata",
        "",
        "3. Log the result using the provided log_enrichment_attempt function",
        "",
        "4. Update the YAML file with wikidata_id field",
        "",
    )
    for line in instructions:
        print(line)
    print(banner)
    print()

    # Generate queries for the first `limit` records
    print(f"Generating SPARQL queries for first {limit} records...")
    print()

    batch = records[:limit]
    # Use the real batch size so the progress counter stays correct when the
    # file holds fewer records than requested.
    total = len(batch)
    for idx, record in enumerate(batch, 1):
        # A key that is present but null would otherwise yield None here.
        org_name = record.get('organisatie') or 'Unknown'
        print(f"[{idx}/{total}] {org_name}")

        # Create SPARQL query
        query = create_sparql_query(
            org_name=org_name,
            org_type=record.get('type_organisatie'),
            city=record.get('plaatsnaam_bezoekadres')
        )

        # Log the query; no lookup has happened yet, so there is no
        # Q-number and no result to record.
        log_enrichment_attempt(
            org_record=record,
            q_number=None,
            method='prepared',
            confidence='pending',
            sparql_query=query,
            sparql_results=None,
            sparql_dir=sparql_dir
        )
        print(" Query prepared and logged")
        print()

    print(banner)
    print("QUERIES PREPARED")
    print(banner)
    print()
    print(f"SPARQL queries saved to: {sparql_dir}")
    print()
    print("Next step: Run interactive enrichment with OpenCode assistant")
    print(" using the Wikidata MCP service tools")
# Run the query-preparation step only when executed as a script, not on import.
if __name__ == "__main__":
    main()