206 lines
6.3 KiB
Python
206 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich NDE Dutch heritage organizations with Wikidata Q-numbers using the MCP service.

This script must be run from OpenCode with access to the Wikidata MCP service tools.
It prepares Wikidata SPARQL queries for the organizations and logs all queries and
results; the Wikidata searches themselves are carried out through the MCP tools.
|
|
"""
|
|
|
|
import yaml
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Tuple
|
|
import time
|
|
|
|
|
|
def create_sparql_query(org_name: str, org_type: Optional[str],
                        city: Optional[str], country_code: str = "Q55") -> str:
    """
    Create SPARQL query for finding heritage organizations.

    The query selects items of the given country (P17), optionally restricted
    to a Wikidata class derived from ``org_type``, and keeps only rows whose
    label contains the lower-cased organization name. ISIL (P791), VIAF (P214)
    and website (P856) are requested as optional columns. At most 20 rows are
    requested.

    Args:
        org_name: Organization name; matched case-insensitively as a substring.
            It is escaped so it cannot break out of the SPARQL string literal.
        org_type: Type (museum, archief, etc.). Compared case-insensitively;
            a missing or unmapped type adds no class constraint.
        city: City name. Accepted for interface compatibility but currently
            not used when building the query.
        country_code: Wikidata Q-code for country (default: Q55 = Netherlands)

    Returns:
        SPARQL query string
    """
    # Map types to Wikidata classes
    type_map = {
        'museum': 'wd:Q33506',
        'archief': 'wd:Q166118',
        'bibliotheek': 'wd:Q7075',
        'historische vereniging': 'wd:Q1964266'
    }

    # Build query
    parts = ["SELECT ?item ?itemLabel ?isil ?viaf ?website WHERE {"]

    # Type constraint (instance of / subclass chain), only for known types
    type_key = org_type.lower() if org_type else None
    if type_key in type_map:
        parts.append(f" ?item wdt:P31/wdt:P279* {type_map[type_key]} .")

    # Country constraint
    parts.append(f" ?item wdt:P17 wd:{country_code} .")

    # Optional properties
    parts.extend([
        " OPTIONAL { ?item wdt:P791 ?isil }",
        " OPTIONAL { ?item wdt:P214 ?viaf }",
        " OPTIONAL { ?item wdt:P856 ?website }",
        ' SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en" }'
    ])

    # Name filter.
    # Escape for a double-quoted SPARQL string literal. The backslash has to
    # be handled first so the escapes added afterwards are not doubled; raw
    # line breaks are not allowed inside such a literal, so they are escaped
    # as well.
    safe_name = org_name.lower()
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'),
                         ('\n', '\\n'), ('\r', '\\r')):
        safe_name = safe_name.replace(raw, escaped)
    # NOTE(review): ?itemLabel is produced by the label SERVICE above; confirm
    # the endpoint binds it before this FILTER is evaluated, otherwise the
    # match should be made against rdfs:label instead.
    parts.append(f' FILTER(CONTAINS(LCASE(?itemLabel), "{safe_name}"))')

    parts.append("}")
    parts.append("LIMIT 20")

    return "\n".join(parts)
|
|
|
|
|
|
def log_enrichment_attempt(org_record: Dict, q_number: Optional[str],
                           method: str, confidence: str,
                           sparql_query: Optional[str],
                           sparql_results: Optional[str],
                           sparql_dir: Path) -> Path:
    """
    Log an enrichment attempt with SPARQL query and results.

    One JSON file is written per call. The file name is built from the UTC
    timestamp, a sanitized prefix of the organization name and the confidence
    value.

    Args:
        org_record: Organization record being enriched
        q_number: Wikidata Q-number found (or None)
        method: Method used ('search_api', 'sparql', 'manual')
        confidence: Confidence level ('high', 'medium', 'low', 'none')
        sparql_query: SPARQL query used (if any)
        sparql_results: SPARQL results (if any)
        sparql_dir: Directory to save logs; created if it does not exist

    Returns:
        Path to log file
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # ':' and '.' are replaced so the timestamp can be used in a file name.
    safe_timestamp = timestamp.replace(':', '-').replace('.', '-')

    # The key may be present with a null (or non-string) value in the loaded
    # data; normalise so slicing and character checks below cannot fail.
    raw_name = org_record.get('organisatie')
    org_name = 'unknown' if raw_name is None else str(raw_name)
    # Keep alphanumerics, '-' and '_' from the first 50 characters; anything
    # else becomes '_'.
    safe_name = "".join(
        c if c.isalnum() or c in ('-', '_') else '_'
        for c in org_name[:50]
    )

    # Make sure the target directory exists before writing into it.
    sparql_dir = Path(sparql_dir)
    sparql_dir.mkdir(parents=True, exist_ok=True)

    filename = f"{safe_timestamp}_{safe_name}_{confidence}.json"
    log_path = sparql_dir / filename

    log_entry = {
        'timestamp': timestamp,
        'organization': {
            'name': org_name,
            'type': org_record.get('type_organisatie'),
            'city': org_record.get('plaatsnaam_bezoekadres'),
            'isil': org_record.get('isil-code_na'),
            'website': org_record.get('webadres_organisatie')
        },
        'enrichment': {
            'wikidata_id': q_number,
            'method': method,
            'confidence': confidence
        },
        'sparql': {
            'query': sparql_query,
            'results': sparql_results
        }
    }

    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(log_entry, f, indent=2, ensure_ascii=False)

    return log_path
|
|
|
|
|
|
def main():
    """
    Main enrichment process.

    Loads the NDE organization list from YAML, prints the instructions for
    the interactive enrichment workflow, then builds a SPARQL query for each
    of the first 10 records and writes one log file per record to the SPARQL
    log directory.

    NOTE: This script is designed to be run interactively from OpenCode
    with access to the Wikidata MCP service tools.

    Raises:
        TypeError: If the YAML document is not a list of records.
    """
    banner = "=" * 80
    sample_size = 10

    print(banner)
    print("NDE WIKIDATA ENRICHMENT WITH MCP SERVICE")
    print(banner)
    print()

    # Paths
    data_path = Path("data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml")
    sparql_dir = Path("data/nde/sparql")
    sparql_dir.mkdir(parents=True, exist_ok=True)

    # Load data
    print(f"Loading data from {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)

    # An empty YAML document loads as None; treat it as "no records" instead
    # of failing on len() below.
    if records is None:
        records = []
    if not isinstance(records, list):
        raise TypeError(
            f"Expected a list of records in {data_path}, "
            f"got {type(records).__name__}"
        )

    print(f"Total records: {len(records)}")
    print()
    print(banner)
    print("INSTRUCTIONS FOR INTERACTIVE ENRICHMENT")
    print(banner)
    print()
    print("This script prepares SPARQL queries for each organization.")
    print("The OpenCode assistant should:")
    print()
    print("1. For each organization, use wikidata-authenticated_search_entity")
    print(" to find the Q-number")
    print()
    print("2. Verify the match using wikidata-authenticated_get_metadata")
    print()
    print("3. Log the result using the provided log_enrichment_attempt function")
    print()
    print("4. Update the YAML file with wikidata_id field")
    print()
    print(banner)
    print()

    # Generate queries for first 10 records
    print("Generating SPARQL queries for first 10 records...")
    print()

    sample = records[:sample_size]
    # Report progress against the number of records actually processed, which
    # is smaller than the sample size for short input files.
    total = len(sample)

    for idx, record in enumerate(sample, 1):
        org_name = record.get('organisatie', 'Unknown')
        print(f"[{idx}/{total}] {org_name}")

        # Create SPARQL query
        query = create_sparql_query(
            org_name=org_name,
            org_type=record.get('type_organisatie'),
            city=record.get('plaatsnaam_bezoekadres')
        )

        # Log the query; no lookup has happened yet, so there is no Q-number
        # and no result set.
        log_enrichment_attempt(
            org_record=record,
            q_number=None,
            method='prepared',
            confidence='pending',
            sparql_query=query,
            sparql_results=None,
            sparql_dir=sparql_dir
        )

        print(" Query prepared and logged")
        print()

    print(banner)
    print("QUERIES PREPARED")
    print(banner)
    print()
    print(f"SPARQL queries saved to: {sparql_dir}")
    print()
    print("Next step: Run interactive enrichment with OpenCode assistant")
    print(" using the Wikidata MCP service tools")
|
|
|
|
|
|
# Run the query preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|