#!/usr/bin/env python3
"""
Find Wikidata IDs for NDE entries that don't have them.

Uses Wikidata's search API to find matching entities based on:
- Organization name
- City (plaatsnaam)
- Type (museum, archive, library)

Usage:
    python scripts/find_wikidata_for_missing_nde.py

Output:
    data/nde/wikidata_candidates.yaml - Candidates for review
    data/nde/wikidata_matches.yaml - High-confidence matches
"""
import logging
import os
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx
import yaml
from rapidfuzz import fuzz
# Set up logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration
|
|
WIKIDATA_SEARCH_API = "https://www.wikidata.org/w/api.php"
|
|
BASE_DELAY = 1.5 # Seconds between requests
|
|
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-data@example.com")
|
|
USER_AGENT = f"GLAMDataExtractor/1.0 ({WIKIMEDIA_CONTACT_EMAIL}) Python/httpx"
|
|
|
|
# Paths
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
NDE_YAML = PROJECT_ROOT / "data" / "nde" / "nde_register_nl.yaml"
|
|
ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
|
|
OUTPUT_CANDIDATES = PROJECT_ROOT / "data" / "nde" / "wikidata_candidates.yaml"
|
|
OUTPUT_MATCHES = PROJECT_ROOT / "data" / "nde" / "wikidata_matches.yaml"
|
|
|
|
# Headers
|
|
HEADERS = {
|
|
"Accept": "application/json",
|
|
"User-Agent": USER_AGENT,
|
|
}
|
|
|
|
# Institution type mappings for Wikidata search
|
|
TYPE_KEYWORDS = {
|
|
"M": ["museum", "musea"],
|
|
"A": ["archief", "archive", "gemeentearchief", "regionaal archief"],
|
|
"L": ["bibliotheek", "library", "openbare bibliotheek"],
|
|
"S": ["historische vereniging", "heemkundekring", "stichting"],
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class WikidataCandidate:
|
|
"""A potential Wikidata match for an NDE entry."""
|
|
qid: str
|
|
label: str
|
|
description: str
|
|
match_score: float
|
|
search_query: str
|
|
entry_index: int
|
|
entry_name: str
|
|
entry_city: str
|
|
|
|
|
|
def search_wikidata(query: str, limit: int = 5) -> List[Dict]:
|
|
"""Search Wikidata for entities matching the query."""
|
|
params = {
|
|
"action": "wbsearchentities",
|
|
"format": "json",
|
|
"language": "nl",
|
|
"uselang": "nl",
|
|
"type": "item",
|
|
"limit": limit,
|
|
"search": query,
|
|
}
|
|
|
|
try:
|
|
with httpx.Client(headers=HEADERS, timeout=30.0) as client:
|
|
response = client.get(WIKIDATA_SEARCH_API, params=params)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
return data.get("search", [])
|
|
except Exception as e:
|
|
logger.error(f"Search error for '{query}': {e}")
|
|
return []
|
|
|
|
|
|
def find_best_match(entry: Dict, entry_index: int) -> Optional[WikidataCandidate]:
|
|
"""Find the best Wikidata match for an NDE entry."""
|
|
name = entry.get("organisatie", "")
|
|
city = entry.get("plaatsnaam_bezoekadres", "")
|
|
org_type = entry.get("type", [""])[0] if entry.get("type") else ""
|
|
|
|
if not name:
|
|
return None
|
|
|
|
# Try different search strategies
|
|
queries = [
|
|
name, # Full name
|
|
f"{name} {city}", # Name + city
|
|
]
|
|
|
|
# Add type-specific keywords
|
|
if org_type in TYPE_KEYWORDS:
|
|
for keyword in TYPE_KEYWORDS[org_type][:1]:
|
|
queries.append(f"{name} {keyword}")
|
|
|
|
best_candidate = None
|
|
best_score = 0
|
|
|
|
for query in queries:
|
|
results = search_wikidata(query)
|
|
time.sleep(BASE_DELAY) # Rate limiting
|
|
|
|
for result in results:
|
|
qid = result.get("id", "")
|
|
label = result.get("label", "")
|
|
description = result.get("description", "")
|
|
|
|
# Calculate match score using fuzzy matching
|
|
name_score = fuzz.ratio(name.lower(), label.lower())
|
|
|
|
# Boost score if city appears in description
|
|
city_boost = 10 if city.lower() in description.lower() else 0
|
|
|
|
# Boost for Netherlands mentions
|
|
nl_boost = 5 if any(x in description.lower() for x in ["nederland", "netherlands", "dutch", "nl-"]) else 0
|
|
|
|
# Boost for institution type mentions
|
|
type_boost = 0
|
|
if org_type in TYPE_KEYWORDS:
|
|
for keyword in TYPE_KEYWORDS[org_type]:
|
|
if keyword.lower() in description.lower():
|
|
type_boost = 10
|
|
break
|
|
|
|
total_score = name_score + city_boost + nl_boost + type_boost
|
|
|
|
if total_score > best_score:
|
|
best_score = total_score
|
|
best_candidate = WikidataCandidate(
|
|
qid=qid,
|
|
label=label,
|
|
description=description,
|
|
match_score=total_score,
|
|
search_query=query,
|
|
entry_index=entry_index,
|
|
entry_name=name,
|
|
entry_city=city,
|
|
)
|
|
|
|
return best_candidate if best_candidate and best_candidate.match_score >= 70 else None
|
|
|
|
|
|
def load_entries_without_wikidata() -> List[tuple]:
|
|
"""Load NDE entries that don't have Wikidata IDs."""
|
|
with open(NDE_YAML, "r", encoding="utf-8") as f:
|
|
entries = yaml.safe_load(f)
|
|
|
|
missing = []
|
|
for i, entry in enumerate(entries):
|
|
wikidata_id = entry.get("wikidata_id", "")
|
|
# Check if it's missing or invalid
|
|
if not wikidata_id or not str(wikidata_id).startswith("Q"):
|
|
missing.append((i, entry))
|
|
|
|
return missing
|
|
|
|
|
|
def main():
|
|
"""Main function."""
|
|
logger.info("Finding Wikidata IDs for NDE entries without them...")
|
|
|
|
# Load entries without Wikidata IDs
|
|
missing_entries = load_entries_without_wikidata()
|
|
logger.info(f"Found {len(missing_entries)} entries without Wikidata IDs")
|
|
|
|
candidates = []
|
|
high_confidence_matches = []
|
|
|
|
for idx, (entry_index, entry) in enumerate(missing_entries):
|
|
name = entry.get("organisatie", "Unknown")
|
|
logger.info(f"[{idx+1}/{len(missing_entries)}] Searching for: {name}")
|
|
|
|
candidate = find_best_match(entry, entry_index)
|
|
|
|
if candidate:
|
|
candidates.append(asdict(candidate))
|
|
logger.info(f" Found: {candidate.qid} - {candidate.label} (score: {candidate.match_score:.0f})")
|
|
|
|
# High confidence: score >= 90
|
|
if candidate.match_score >= 90:
|
|
high_confidence_matches.append({
|
|
"entry_index": entry_index,
|
|
"entry_name": name,
|
|
"wikidata_id": candidate.qid,
|
|
"wikidata_label": candidate.label,
|
|
"match_score": candidate.match_score,
|
|
})
|
|
else:
|
|
logger.info(f" No match found")
|
|
|
|
# Progress save every 50 entries
|
|
if (idx + 1) % 50 == 0:
|
|
save_results(candidates, high_confidence_matches)
|
|
|
|
# Final save
|
|
save_results(candidates, high_confidence_matches)
|
|
|
|
logger.info(f"\nDone! Found {len(candidates)} candidates, {len(high_confidence_matches)} high-confidence matches")
|
|
logger.info(f"Candidates saved to: {OUTPUT_CANDIDATES}")
|
|
logger.info(f"High-confidence matches saved to: {OUTPUT_MATCHES}")
|
|
|
|
|
|
def save_results(candidates: List[Dict], matches: List[Dict]):
|
|
"""Save results to YAML files."""
|
|
with open(OUTPUT_CANDIDATES, "w", encoding="utf-8") as f:
|
|
yaml.dump({
|
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
"total_candidates": len(candidates),
|
|
"candidates": candidates,
|
|
}, f, default_flow_style=False, allow_unicode=True)
|
|
|
|
with open(OUTPUT_MATCHES, "w", encoding="utf-8") as f:
|
|
yaml.dump({
|
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
"total_matches": len(matches),
|
|
"matches": matches,
|
|
}, f, default_flow_style=False, allow_unicode=True)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|