glam/scripts/enrich_institutions_wikidata_sparql.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

929 lines
35 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Global Wikidata SPARQL Enrichment for Heritage Institutions
This script enriches heritage institutions worldwide by querying Wikidata's SPARQL endpoint
and performing fuzzy name matching to find real Q-numbers.
🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨
This script NEVER generates synthetic Q-numbers. If no Wikidata match is found,
institutions remain without Q-numbers and are flagged for manual enrichment.
Strategy:
1. Process institutions by country (configurable priority order)
2. Query Wikidata for museums/archives/libraries in each country using SPARQL
3. Fuzzy match institution names (threshold > 0.85)
4. Cross-reference ISIL/VIAF identifiers for high-confidence matches
5. Update GHCIDs ONLY when collision resolution requires Q-number
6. Track provenance with match confidence scores
Priority Countries (configurable):
- Netherlands (NL, Q55): Highest data quality, 1,351 institutions
- Chile (CL, Q298): Good name quality, 28.9% current coverage
- Belgium (BE, Q31): ~500 institutions
- Italy (IT, Q38): ~400 institutions
- Denmark (DK, Q35): ~300 institutions
Usage:
python enrich_institutions_wikidata_sparql.py --countries NL CL BE --threshold 0.85 --dry-run
python enrich_institutions_wikidata_sparql.py --all-countries --skip-existing
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
import argparse
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
# Country configurations (Wikidata QIDs)
# Prioritized by data quality, institution count, and expected match rate.
# Each entry: name (English display name), qid (Wikidata item for the country,
# used in the SPARQL P17 filter), flag (emoji for console output), languages
# (comma-separated fallback chain for the Wikidata label service), and
# priority (1 = processed by default; see group comments below).
COUNTRY_CONFIGS = {
    # Priority 1: High data quality, large datasets
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1},
    # Priority 2: Medium datasets, good coverage potential
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 2},
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2},
    # Priority 3: Latin America (already partially enriched)
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3},
    # Priority 4: Asian countries (language barriers)
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4},
    # Priority 5: African/Middle Eastern countries (fewer Wikidata entries)
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5},
    # Add more countries as needed
}
def normalize_name(name: str) -> str:
    """
    Normalize an institution name for fuzzy comparison.

    Lowercases the name, strips common multilingual leading articles and
    GLAM-type words (and their trailing counterparts), replaces punctuation
    with spaces, and collapses whitespace runs.
    """
    result = name.lower()

    # Leading articles / institution-type words, one alternation per language.
    leading_patterns = (
        # Dutch
        r'^(het |de |museum |archief |bibliotheek |stichting |nationaal |provinciaal |gemeentelijk |regionaal )',
        # English
        r'^(the |museum |archive |library |foundation |national |provincial |municipal |regional )',
        # Spanish/Portuguese
        r'^(el |la |los |las |museo |archivo |biblioteca |fundación |fundação |nacional |provincial |municipal |regional )',
        # French
        r'^(le |la |les |musée |archives |bibliothèque |fondation |national |provincial |municipal |régional )',
        # German
        r'^(das |die |der |museum |archiv |bibliothek |stiftung |national |provinziell |kommunal |regional )',
        # Italian
        r'^(il |lo |la |museo |archivio |biblioteca |fondazione |nazionale |provinciale |comunale |regionale )',
    )
    # Trailing institution-type words (multilingual alternations).
    trailing_patterns = (
        r'\s+(museum|museu|museo|musée)$',
        r'\s+(archief|archive|archivo|arquivo|archives)$',
        r'\s+(bibliotheek|library|biblioteca|bibliothèque)$',
        r'\s+(stichting|foundation|fundación|fundação|fondation|fondazione)$',
        r'\s+(national|nacional|nationale|nationaal)$',
        r'\s+(regional|regional|régional)$',
        r'\s+(municipal|comunal|municipale)$',
    )
    # Each anchored pattern can strip at most one occurrence; apply prefixes
    # first, then suffixes (same order as one sequential pass).
    for pattern in leading_patterns + trailing_patterns:
        result = re.sub(pattern, '', result, flags=re.IGNORECASE)

    # Replace punctuation with spaces, then collapse whitespace runs.
    result = re.sub(r'[^\w\s]', ' ', result)
    return ' '.join(result.split()).strip()
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between the two normalized names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str],
    languages: str = "en"
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    🔧 OPTIMIZED: Each institution type is queried separately, and subclass
    traversal is limited to a single optional hop (wdt:P31/wdt:P279?) instead
    of the fully transitive wdt:P279*, which caused 504 timeouts.

    Args:
        sparql: Configured SPARQLWrapper instance
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        institution_types: List of Wikidata QIDs for institution types:
            Q33506 - museum
            Q7075 - library
            Q166118 - archive
            Q2668072 - art gallery
            Q5282129 - cultural center
        languages: Comma-separated language codes for labels (e.g., "nl,en")

    Returns:
        Dictionary mapping Wikidata QIDs to institution metadata, merged
        across all type queries. A type whose query fails is logged and
        skipped; each query is capped at 1000 rows by the LIMIT clause.
    """
    # Query each type separately to avoid timeout
    all_results = {}
    for inst_type_qid in institution_types:
        print(f" - Querying {inst_type_qid}...", end="", flush=True)
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?itemAltLabel ?isil ?viaf ?coords ?website ?inception ?instType ?instTypeLabel
        WHERE {{
        # Direct instance-of match (no expensive transitive subclass)
        ?item wdt:P31/wdt:P279? wd:{inst_type_qid} . # instance of (or subclass of) type
        ?item wdt:P17 wd:{country_qid} . # country
        # Capture the specific type
        ?item wdt:P31 ?instType .
        # Optional identifiers and metadata
        OPTIONAL {{ ?item wdt:P791 ?isil . }} # ISIL code
        OPTIONAL {{ ?item wdt:P214 ?viaf . }} # VIAF ID
        OPTIONAL {{ ?item wdt:P625 ?coords . }} # Coordinates
        OPTIONAL {{ ?item wdt:P856 ?website . }} # Official website
        OPTIONAL {{ ?item wdt:P571 ?inception . }} # Founding date
        SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "{languages}" .
        ?item rdfs:label ?itemLabel .
        ?item schema:description ?itemDescription .
        ?item skos:altLabel ?itemAltLabel .
        ?instType rdfs:label ?instTypeLabel .
        }}
        }}
        LIMIT 1000
        """
        sparql.setQuery(query)
        try:
            raw_results = sparql.query().convert()
            # convert() can return non-dict payloads on unexpected formats
            bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
            # Merge results
            type_results = _parse_sparql_bindings(bindings)
            all_results.update(type_results)
            print(f" {len(type_results)} found")
        except Exception as e:
            # A failure for one type (timeout, HTTP error) must not abort the rest
            print(f" ❌ Error: {e}")
            continue
    return all_results
def _parse_sparql_bindings(bindings: list[dict]) -> dict[str, dict[str, Any]]:
"""
Helper function to parse SPARQL query bindings into institution metadata.
Returns:
Dictionary mapping Wikidata QIDs to institution metadata
"""
results = {}
for binding in bindings:
item_uri = binding.get("item", {}).get("value", "")
qid = item_uri.split("/")[-1] if item_uri else None
if not qid or not qid.startswith("Q"):
continue
# Check if it's a REAL Wikidata Q-number (not synthetic)
try:
qid_num = int(qid[1:])
if qid_num >= 90000000:
# Synthetic Q-number range - SKIP
continue
except ValueError:
continue
# Initialize or update result
if qid not in results:
results[qid] = {
"qid": qid,
"name": binding.get("itemLabel", {}).get("value", ""),
"description": binding.get("itemDescription", {}).get("value", ""),
"type": binding.get("instTypeLabel", {}).get("value", ""),
"alternative_names": [],
"identifiers": {}
}
# Collect alternative names (multilingual labels)
alt_label = binding.get("itemAltLabel", {}).get("value", "")
if alt_label and alt_label not in results[qid]["alternative_names"]:
results[qid]["alternative_names"].append(alt_label)
# Add identifiers
if "isil" in binding:
results[qid]["identifiers"]["ISIL"] = binding["isil"]["value"]
if "viaf" in binding:
results[qid]["identifiers"]["VIAF"] = binding["viaf"]["value"]
if "website" in binding:
results[qid]["identifiers"]["Website"] = binding["website"]["value"]
if "inception" in binding:
results[qid]["founding_date"] = binding["inception"]["value"].split("T")[0]
if "coords" in binding:
coords_str = binding["coords"]["value"]
if coords_str.startswith("Point("):
lon, lat = coords_str[6:-1].split()
results[qid]["latitude"] = float(lat)
results[qid]["longitude"] = float(lon)
return results
def has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """
    Check if institution already has a REAL (non-synthetic) Wikidata ID.

    Synthetic Q-numbers (Q90000000 and above) and malformed values do not
    count. BUG FIX: tolerates records where "identifiers" is explicitly null
    (YAML `identifiers:`), which previously raised TypeError because
    dict.get's default does not apply to a present-but-None key.
    """
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict):
            continue
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        qid = id_obj.get("identifier_value", "")
        if qid.startswith("Q"):
            try:
                if int(qid[1:]) < 90000000:
                    return True  # Real Wikidata ID
            except ValueError:
                pass  # malformed Q-number - keep scanning
    return False
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """
    Check whether a local institution and a Wikidata candidate have compatible
    GLAM types, to avoid e.g. matching a museum against an archive entity.

    Compatibility is judged from keywords in the institution name, the formal
    institution_type field, and the Wikidata type label (all multilingual).
    An empty Wikidata type label never blocks a match.
    """
    keyword_sets = {
        'museum': ['museum', 'museo', 'museu', 'musée', 'muzeum', 'muzeu'],
        'archive': ['archief', 'archive', 'archivo', 'arquivo', 'archiv', 'arkiv'],
        'library': ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek', 'bibliotek'],
        'gallery': ['gallery', 'galerie', 'galería', 'galleria', 'kunsthal', 'kunsthalle'],
    }

    def mentions(text: str, kind: str) -> bool:
        return any(kw in text for kw in keyword_sets[kind])

    wd_lower = wd_type.lower()
    # An empty Wikidata type label cannot contradict anything - allow match.
    if not wd_type or not wd_lower.strip():
        return True

    name_lower = inst_name.lower()
    formal = inst_type.upper()

    # Museums (incl. galleries and botanical/zoo collections) must map onto a
    # museum- or gallery-typed Wikidata entity.
    looks_like_museum = mentions(name_lower, 'museum') or formal in ('MUSEUM', 'GALLERY', 'BOTANICAL_ZOO')
    if looks_like_museum and not (mentions(wd_lower, 'museum') or mentions(wd_lower, 'gallery')):
        return False
    # Archives must map onto an archive-typed entity.
    looks_like_archive = mentions(name_lower, 'archive') or formal == 'ARCHIVE'
    if looks_like_archive and not mentions(wd_lower, 'archive'):
        return False
    # Libraries must map onto a library-typed entity.
    looks_like_library = mentions(name_lower, 'library') or formal == 'LIBRARY'
    if looks_like_library and not mentions(wd_lower, 'library'):
        return False
    return True
def isil_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Compare ISIL codes between a local record and a Wikidata candidate.

    An exact ISIL match is the strongest possible signal.

    Returns:
        1.0 if the first local ISIL identifier equals Wikidata's ISIL,
        otherwise None.
    """
    local_isil = next(
        (
            entry.get("identifier_value", "").strip()
            for entry in inst.get("identifiers", [])
            if isinstance(entry, dict) and entry.get("identifier_scheme") == "ISIL"
        ),
        None,
    )
    remote_isil = wd_data.get("identifiers", {}).get("ISIL", "").strip()
    if local_isil and remote_isil and local_isil == remote_isil:
        return 1.0  # Perfect match via ISIL
    return None
def viaf_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Compare VIAF IDs between a local record and a Wikidata candidate.

    Returns:
        0.98 if the first local VIAF identifier equals Wikidata's VIAF ID,
        otherwise None.
    """
    local_viaf = next(
        (
            entry.get("identifier_value", "").strip()
            for entry in inst.get("identifiers", [])
            if isinstance(entry, dict) and entry.get("identifier_scheme") == "VIAF"
        ),
        None,
    )
    remote_viaf = wd_data.get("identifiers", {}).get("VIAF", "").strip()
    if local_viaf and remote_viaf and local_viaf == remote_viaf:
        return 0.98  # Very high confidence via VIAF
    return None
def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any], str]]:
    """
    Fuzzy match institutions with Wikidata results using multiple strategies.

    Matching strategies (in priority order):
    1. ISIL code cross-reference (confidence: 1.0)
    2. VIAF ID cross-reference (confidence: 0.98)
    3. Fuzzy name matching (confidence: similarity score)

    Institutions with no name or an existing real Wikidata ID are skipped.
    Type-incompatible candidates (museum vs archive vs library) are rejected
    before scoring.

    Args:
        institutions: Local institution records to match.
        wikidata_results: QID -> metadata mapping from the SPARQL queries.
        threshold: Minimum confidence for a match to be reported.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data, match_method),
        where institution_idx indexes into `institutions`.
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue
        # Skip if already has real Wikidata ID
        if has_real_wikidata_id(inst):
            continue
        # Track the best candidate seen across all Wikidata entries
        best_score = 0.0
        best_qid = None
        best_data = None
        best_method = "fuzzy_name_match"
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue
            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue
            # Strategy 1: ISIL cross-reference (highest confidence)
            isil_score = isil_cross_reference_match(inst, wd_data)
            if isil_score:
                best_score = isil_score
                best_qid = qid
                best_data = wd_data
                best_method = "isil_cross_reference"
                break  # Perfect match, no need to continue
            # Strategy 2: VIAF cross-reference (very high confidence)
            viaf_score = viaf_cross_reference_match(inst, wd_data)
            if viaf_score and viaf_score > best_score:
                best_score = viaf_score
                best_qid = qid
                best_data = wd_data
                best_method = "viaf_cross_reference"
            # Strategy 3: Fuzzy name matching (check primary name and alternatives)
            # NOTE(review): a fuzzy score > 0.98 can displace an earlier VIAF
            # match (and relabel the method) - confirm this ranking is intended.
            names_to_check = [wd_name] + wd_data.get("alternative_names", [])
            for wd_name_variant in names_to_check:
                score = similarity_score(inst_name, wd_name_variant)
                if score > best_score:
                    best_score = score
                    best_qid = qid
                    best_data = wd_data
                    best_method = "fuzzy_name_match"
        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data, best_method))
    return matches
def _is_real_qid(qid: str) -> bool:
    """Return True only for a well-formed, non-synthetic Wikidata Q-number."""
    if not qid.startswith("Q"):
        print(f"⚠️ WARNING: Invalid Q-number format {qid} - REJECTED")
        return False
    try:
        qid_num = int(qid[1:])
    except ValueError:
        print(f"⚠️ WARNING: Invalid Q-number format {qid} - REJECTED")
        return False
    if qid_num >= 90000000:
        # Q90000000+ is the project's synthetic range - never persist these
        print(f"⚠️ WARNING: Attempted to add synthetic Q-number {qid} - REJECTED")
        return False
    return True


def _upsert_wikidata_id(identifiers_list: list, qid: str) -> bool:
    """Add the Wikidata identifier or replace an existing (possibly synthetic) one.

    Returns True if the list was modified, False if the same QID was already present.
    """
    entry = {
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    }
    for i, id_obj in enumerate(identifiers_list):
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "Wikidata":
            if id_obj.get("identifier_value", "") == qid:
                return False  # already up to date
            identifiers_list[i] = entry  # replace (possibly synthetic) value
            return True
    identifiers_list.append(entry)
    return True


def _add_secondary_identifiers(identifiers_list: list, wd_identifiers: dict[str, str], existing_schemes: set) -> bool:
    """Append ISIL/VIAF/Website identifiers from Wikidata that the record lacks.

    Existing schemes are never overridden. Returns True if anything was added.
    """
    added = False
    for scheme, value in wd_identifiers.items():
        if scheme in existing_schemes:
            continue  # never override locally curated identifiers
        id_obj: dict[str, Any] = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value
        identifiers_list.append(id_obj)
        added = True
    return added


def _record_provenance(inst: dict[str, Any], match_method: str, confidence: float) -> None:
    """Record the enrichment method and timestamp in the record's provenance.

    BUG FIX: previously `inst.get("provenance", {})` mutated a throwaway dict
    when the key was absent, silently dropping the provenance update. The
    dict is now created and stored back; a non-dict provenance value is left
    untouched (as before).
    """
    if inst.get("provenance") is None:
        inst["provenance"] = {}
    prov = inst["provenance"]
    if not isinstance(prov, dict):
        return
    match_type_desc = {
        "isil_cross_reference": "Wikidata enrichment (ISIL cross-reference)",
        "viaf_cross_reference": "Wikidata enrichment (VIAF cross-reference)",
        "fuzzy_name_match": f"Wikidata enrichment (fuzzy name match, confidence: {confidence:.3f})"
    }.get(match_method, "Wikidata enrichment")
    existing_method = prov.get("extraction_method", "")
    prov["extraction_method"] = f"{existing_method} + {match_type_desc}" if existing_method else match_type_desc
    # Update extraction date
    prov["enrichment_date"] = datetime.now(timezone.utc).isoformat()


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any], match_method: str, confidence: float) -> bool:
    """
    Enrich an institution record in place with Wikidata data.

    🚨 CRITICAL: This function ONLY adds REAL Wikidata Q-numbers.
    It NEVER generates synthetic Q-numbers.

    Args:
        inst: Institution record (mutated in place).
        wd_data: Wikidata metadata as produced by _parse_sparql_bindings.
        match_method: "isil_cross_reference", "viaf_cross_reference",
            or "fuzzy_name_match" (used for the provenance note).
        confidence: Match confidence score (0-1).

    Returns:
        True if any field of the institution was changed.
    """
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    # Snapshot schemes before we add anything, so Wikidata-sourced identifiers
    # never override locally curated ones.
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    qid = wd_data["qid"]
    if not _is_real_qid(qid):
        return False

    enriched = _upsert_wikidata_id(identifiers_list, qid)
    if _add_secondary_identifiers(identifiers_list, wd_data.get("identifiers", {}), existing_schemes):
        enriched = True

    # Add founding date if missing
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Fill in coordinates on the first location when absent
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict):
                if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                    first_loc["latitude"] = wd_data["latitude"]
                    first_loc["longitude"] = wd_data["longitude"]
                    enriched = True

    # Update provenance only when something actually changed
    if enriched:
        _record_provenance(inst, match_method, confidence)
    return enriched
def _primary_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location dict, or {} when the
    locations list is missing, null, empty, or malformed."""
    locations = inst.get('locations')
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper,
    threshold: float = 0.85,
    dry_run: bool = False
) -> tuple[int, int, dict[str, int]]:
    """
    Process a single country's institutions: query Wikidata, fuzzy match,
    and (unless dry_run) enrich matched records in place.

    BUG FIXES vs. previous version:
    - institutions with an empty/null 'locations' list no longer raise
      IndexError (see _primary_location);
    - the dry-run hint no longer references a nonexistent --no-dry-run flag;
    - the dry-run listing separates name and QID instead of fusing them.

    Args:
        institutions: Full institution list (mutated in place on enrichment).
        country_code: Key into COUNTRY_CONFIGS (e.g. "NL").
        sparql: Configured SPARQLWrapper for the Wikidata endpoint.
        threshold: Minimum fuzzy-match confidence.
        dry_run: When True, report matches without applying them.

    Returns:
        (institutions_without_wikidata, enriched_count, match_methods_stats)
    """
    country_info = COUNTRY_CONFIGS.get(country_code)
    if not country_info:
        print(f"\n⚠️ Unknown country code: {country_code}")
        return 0, 0, {}
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")
    # Filter institutions for this country
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if _primary_location(inst).get('country') == country_code
    ]
    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")
    # Count those without real Wikidata
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not has_real_wikidata_id(institutions[idx])
    ]
    current_coverage = (len(country_institutions_idx) - len(without_wikidata)) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {len(country_institutions_idx) - len(without_wikidata):,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")
    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0, {}
    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print(" (This may take 30-90 seconds)\n")
    # Query for museums, libraries, archives, galleries
    institution_types = ["Q33506", "Q7075", "Q166118", "Q2668072"]
    languages = country_info.get('languages', 'en')
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types, languages)
    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")
    if not wikidata_results:
        print("⚠️ No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0, {}
    # Fuzzy match only the records that still need a Q-number
    print(f"🔗 Matching institutions (threshold: {threshold:.2f})...\n")
    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=threshold)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")
    # Track match methods
    match_methods_stats = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }
    if matches:
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 10)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data, method) in enumerate(matches[:10]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Method: {method.upper()}, Confidence: {score:.3f}")
            print(f" Local: {inst.get('name')} ({_primary_location(inst).get('city', 'Unknown')})")
            print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f" Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f" ISIL: {wd_data['identifiers']['ISIL']}")
            if "VIAF" in wd_data.get("identifiers", {}):
                print(f" VIAF: {wd_data['identifiers']['VIAF']}")
        print(f"\n{'='*80}\n")
        if dry_run:
            print("🔍 DRY RUN: Would enrich the following institutions:\n")
            for local_idx, qid, score, wd_data, method in matches:
                inst = country_insts[local_idx]
                print(f" - {inst.get('name')} → {qid} (method: {method}, confidence: {score:.3f})")
            print("\n✅ Dry run complete. Re-run without --dry-run to apply changes.\n")
            return len(without_wikidata), 0, {}
        # Apply all matches
        print("✅ Applying all matches...\n")
        enriched_count = 0
        for local_idx, qid, score, wd_data, method in matches:
            # Map the index within country_insts back to the global list
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data, method, score):
                enriched_count += 1
                match_methods_stats[method] += 1
        new_coverage = (len(country_institutions_idx) - len(without_wikidata) + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
        print(f"\n📊 Match methods:")
        for method, count in match_methods_stats.items():
            if count > 0:
                print(f" {method}: {count:,}")
        print()
        return len(without_wikidata), enriched_count, match_methods_stats
    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0, {}
def main():
    """
    CLI entry point: parse arguments, load the institution dataset, run
    per-country Wikidata enrichment, and write the enriched YAML file
    (unless --dry-run).

    BUG FIXES vs. previous version:
    - a missing input file now produces a friendly error and exit code 1
      instead of an unhandled FileNotFoundError traceback;
    - the dry-run hint no longer references a nonexistent --no-dry-run flag.

    Returns:
        Process exit code (0 on success, 1 on bad arguments / missing input).
    """
    parser = argparse.ArgumentParser(
        description="Enrich heritage institutions with real Wikidata Q-numbers using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dutch and Chilean institutions (priority 1)
  %(prog)s --countries NL CL --threshold 0.85

  # All priority 1 and 2 countries
  %(prog)s --priority 1 2 --threshold 0.85

  # Dry run (preview matches without applying)
  %(prog)s --countries NL --dry-run

  # All countries (not recommended - use priority groups)
  %(prog)s --all-countries --threshold 0.85
"""
    )
    parser.add_argument(
        '--countries',
        nargs='+',
        metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )
    parser.add_argument(
        '--priority',
        nargs='+',
        type=int,
        metavar='N',
        help='Process countries by priority level (1-5)'
    )
    parser.add_argument(
        '--all-countries',
        action='store_true',
        help='Process all configured countries (use with caution)'
    )
    parser.add_argument(
        '--threshold',
        type=float,
        default=0.85,
        help='Fuzzy match threshold (0.0-1.0, default: 0.85)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview matches without applying changes'
    )
    parser.add_argument(
        '--input',
        type=Path,
        help='Input YAML file (default: data/instances/global/global_heritage_institutions_wikidata_enriched.yaml)'
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output YAML file (default: overwrites input or creates new file with _sparql_enriched suffix)'
    )
    args = parser.parse_args()

    # Determine countries to process (explicit list > priority groups >
    # all countries > default priority-1 set)
    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f" Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # File paths
    base_dir = Path(__file__).parent.parent
    if args.input:
        input_file = args.input
    else:
        input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    if args.output:
        output_file = args.output
    elif args.dry_run:
        output_file = None  # No output for dry run
    else:
        output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"

    # Fail fast with a readable message instead of a traceback
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        return 1

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA SPARQL ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading dataset: {input_file.name}\n")
    start_time = time.time()
    # Load dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL endpoint (POST avoids URL-length limits on large queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Wikidata Enrichment)")

    # Process countries
    print(f"\n🌍 Processing {len(countries_to_process)} countries:")
    country_names = [COUNTRY_CONFIGS[c]['name'] for c in countries_to_process]
    print(f" {', '.join(country_names)}\n")
    if args.dry_run:
        print("🔍 DRY RUN MODE: No changes will be saved\n")

    total_without_wikidata = 0
    total_enriched = 0
    total_match_methods = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }
    for i, country_code in enumerate(countries_to_process):
        without, enriched, methods = process_country(
            institutions,
            country_code,
            sparql,
            threshold=args.threshold,
            dry_run=args.dry_run
        )
        total_without_wikidata += without
        total_enriched += enriched
        for method, count in methods.items():
            total_match_methods[method] += count
        # Rate limiting - be nice to Wikidata
        if i < len(countries_to_process) - 1:
            print("⏸️ Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output (unless dry run)
    if not args.dry_run and total_enriched > 0 and output_file:
        print("="*80)
        print("💾 WRITING ENRICHED DATASET")
        print("="*80 + "\n")
        header = f"""---
# Global Heritage Institutions - SPARQL Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(countries_to_process)}
# New Wikidata matches: {total_enriched:,}
# Match threshold: {args.threshold:.2f}
"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
        print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions enriched: {total_enriched:,}")
    print(f" Institutions still without Wikidata: {total_without_wikidata - total_enriched:,}")
    if total_enriched > 0:
        print(f"\n📊 Enrichment methods:")
        for method, count in total_match_methods.items():
            if count > 0:
                percentage = (count / total_enriched * 100) if total_enriched > 0 else 0
                print(f" {method}: {count:,} ({percentage:.1f}%)")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    if args.dry_run:
        print("\n🔍 This was a dry run. Re-run without --dry-run to apply changes.")
    print("="*80 + "\n")
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())