- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
929 lines
35 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Global Wikidata SPARQL Enrichment for Heritage Institutions
|
|
|
|
This script enriches heritage institutions worldwide by querying Wikidata's SPARQL endpoint
|
|
and performing fuzzy name matching to find real Q-numbers.
|
|
|
|
🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨
|
|
This script NEVER generates synthetic Q-numbers. If no Wikidata match is found,
|
|
institutions remain without Q-numbers and are flagged for manual enrichment.
|
|
|
|
Strategy:
|
|
1. Process institutions by country (configurable priority order)
|
|
2. Query Wikidata for museums/archives/libraries in each country using SPARQL
|
|
3. Fuzzy match institution names (threshold > 0.85)
|
|
4. Cross-reference ISIL/VIAF identifiers for high-confidence matches
|
|
5. Update GHCIDs ONLY when collision resolution requires Q-number
|
|
6. Track provenance with match confidence scores
|
|
|
|
Priority Countries (configurable):
|
|
- Netherlands (NL, Q55): Highest data quality, 1,351 institutions
|
|
- Chile (CL, Q298): Good name quality, 28.9% current coverage
|
|
- Belgium (BE, Q31): ~500 institutions
|
|
- Italy (IT, Q38): ~400 institutions
|
|
- Denmark (DK, Q35): ~300 institutions
|
|
|
|
Usage:
|
|
python enrich_institutions_wikidata_sparql.py --countries NL CL BE --threshold 0.85 --dry-run
|
|
python enrich_institutions_wikidata_sparql.py --all-countries --skip-existing
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
import argparse
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
# Country configurations (Wikidata QIDs)
# Prioritized by data quality, institution count, and expected match rate
#
# Each entry maps an ISO 3166-1 alpha-2 country code to:
#   name:      English country name (console output only)
#   qid:       Wikidata QID of the country (used as the object of wdt:P17)
#   flag:      emoji flag (console output only)
#   languages: comma-separated language codes passed to the SPARQL label service
#   priority:  processing group, 1 (best data quality) .. 5 (sparsest coverage)
COUNTRY_CONFIGS = {
    # Priority 1: High data quality, large datasets
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1},

    # Priority 2: Medium datasets, good coverage potential
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 2},
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2},

    # Priority 3: Latin America (already partially enriched)
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3},

    # Priority 4: Asian countries (language barriers)
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4},

    # Priority 5: African/Middle Eastern countries (fewer Wikidata entries)
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5},

    # Add more countries as needed
}
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """
    Normalize an institution name for fuzzy matching.

    Lowercases the name, strips common GLAM-sector prefixes and suffixes in
    Dutch, English, Spanish/Portuguese, French, German and Italian (each
    pattern applied once, in order), replaces punctuation with spaces, and
    collapses whitespace.
    """
    cleaned = name.lower()

    # Leading articles / sector words (multilingual), stripped once each.
    prefix_patterns = (
        # Dutch
        r'^(het |de |museum |archief |bibliotheek |stichting |nationaal |provinciaal |gemeentelijk |regionaal )',
        # English
        r'^(the |museum |archive |library |foundation |national |provincial |municipal |regional )',
        # Spanish/Portuguese
        r'^(el |la |los |las |museo |archivo |biblioteca |fundación |fundação |nacional |provincial |municipal |regional )',
        # French
        r'^(le |la |les |musée |archives |bibliothèque |fondation |national |provincial |municipal |régional )',
        # German
        r'^(das |die |der |museum |archiv |bibliothek |stiftung |national |provinziell |kommunal |regional )',
        # Italian
        r'^(il |lo |la |museo |archivio |biblioteca |fondazione |nazionale |provinciale |comunale |regionale )',
    )

    # Trailing sector words (multilingual), stripped once each.
    suffix_patterns = (
        r'\s+(museum|museu|museo|musée)$',
        r'\s+(archief|archive|archivo|arquivo|archives)$',
        r'\s+(bibliotheek|library|biblioteca|bibliothèque)$',
        r'\s+(stichting|foundation|fundación|fundação|fondation|fondazione)$',
        r'\s+(national|nacional|nationale|nationaal)$',
        r'\s+(regional|regional|régional)$',
        r'\s+(municipal|comunal|municipale)$',
    )

    for pattern in prefix_patterns + suffix_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Punctuation becomes whitespace, then runs of whitespace are collapsed.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split()).strip()
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return the 0-1 similarity ratio between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
|
|
|
|
|
|
def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str],
    languages: str = "en"
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    🔧 OPTIMIZED VERSION: Queries each institution type separately to avoid
    expensive transitive subclass queries (wdt:P279*) that cause 504 timeouts.
    (Each per-type query still allows a single optional subclass hop via
    wdt:P31/wdt:P279?, and is capped at 1000 rows.)

    Args:
        sparql: Configured SPARQLWrapper instance
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        institution_types: List of Wikidata QIDs for institution types:
            Q33506 - museum
            Q7075 - library
            Q166118 - archive
            Q2668072 - art gallery
            Q5282129 - cultural center
        languages: Comma-separated language codes for labels (e.g., "nl,en")

    Returns:
        Dictionary mapping Wikidata QIDs to institution metadata, merged
        across all queried types (a QID found under two types keeps the
        last-parsed record).
    """

    # Query each type separately to avoid timeout
    all_results = {}

    for inst_type_qid in institution_types:
        print(f" - Querying {inst_type_qid}...", end="", flush=True)

        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?itemAltLabel ?isil ?viaf ?coords ?website ?inception ?instType ?instTypeLabel
        WHERE {{
          # Direct instance-of match (no expensive transitive subclass)
          ?item wdt:P31/wdt:P279? wd:{inst_type_qid} .  # instance of (or subclass of) type
          ?item wdt:P17 wd:{country_qid} .  # country

          # Capture the specific type
          ?item wdt:P31 ?instType .

          # Optional identifiers and metadata
          OPTIONAL {{ ?item wdt:P791 ?isil . }}  # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}  # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}  # Coordinates
          OPTIONAL {{ ?item wdt:P856 ?website . }}  # Official website
          OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "{languages}" .
            ?item rdfs:label ?itemLabel .
            ?item schema:description ?itemDescription .
            ?item skos:altLabel ?itemAltLabel .
            ?instType rdfs:label ?instTypeLabel .
          }}
        }}
        LIMIT 1000
        """

        sparql.setQuery(query)

        try:
            raw_results = sparql.query().convert()
            # convert() may return a non-dict payload on odd responses;
            # treat anything that is not a dict as an empty result set.
            bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

            # Merge results (a failure or empty result for one type does not
            # discard results already gathered for earlier types)
            type_results = _parse_sparql_bindings(bindings)
            all_results.update(type_results)
            print(f" {len(type_results)} found")

        except Exception as e:
            # Keep going: a timeout on one type shouldn't abort the others.
            print(f" ❌ Error: {e}")
            continue

    return all_results
|
|
|
|
|
|
def _parse_sparql_bindings(bindings: list[dict]) -> dict[str, dict[str, Any]]:
|
|
"""
|
|
Helper function to parse SPARQL query bindings into institution metadata.
|
|
|
|
Returns:
|
|
Dictionary mapping Wikidata QIDs to institution metadata
|
|
"""
|
|
results = {}
|
|
|
|
for binding in bindings:
|
|
item_uri = binding.get("item", {}).get("value", "")
|
|
qid = item_uri.split("/")[-1] if item_uri else None
|
|
|
|
if not qid or not qid.startswith("Q"):
|
|
continue
|
|
|
|
# Check if it's a REAL Wikidata Q-number (not synthetic)
|
|
try:
|
|
qid_num = int(qid[1:])
|
|
if qid_num >= 90000000:
|
|
# Synthetic Q-number range - SKIP
|
|
continue
|
|
except ValueError:
|
|
continue
|
|
|
|
# Initialize or update result
|
|
if qid not in results:
|
|
results[qid] = {
|
|
"qid": qid,
|
|
"name": binding.get("itemLabel", {}).get("value", ""),
|
|
"description": binding.get("itemDescription", {}).get("value", ""),
|
|
"type": binding.get("instTypeLabel", {}).get("value", ""),
|
|
"alternative_names": [],
|
|
"identifiers": {}
|
|
}
|
|
|
|
# Collect alternative names (multilingual labels)
|
|
alt_label = binding.get("itemAltLabel", {}).get("value", "")
|
|
if alt_label and alt_label not in results[qid]["alternative_names"]:
|
|
results[qid]["alternative_names"].append(alt_label)
|
|
|
|
# Add identifiers
|
|
if "isil" in binding:
|
|
results[qid]["identifiers"]["ISIL"] = binding["isil"]["value"]
|
|
|
|
if "viaf" in binding:
|
|
results[qid]["identifiers"]["VIAF"] = binding["viaf"]["value"]
|
|
|
|
if "website" in binding:
|
|
results[qid]["identifiers"]["Website"] = binding["website"]["value"]
|
|
|
|
if "inception" in binding:
|
|
results[qid]["founding_date"] = binding["inception"]["value"].split("T")[0]
|
|
|
|
if "coords" in binding:
|
|
coords_str = binding["coords"]["value"]
|
|
if coords_str.startswith("Point("):
|
|
lon, lat = coords_str[6:-1].split()
|
|
results[qid]["latitude"] = float(lat)
|
|
results[qid]["longitude"] = float(lon)
|
|
|
|
return results
|
|
|
|
|
|
def has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """
    Return True if the institution carries a REAL (non-synthetic) Wikidata ID.

    A real ID is a "Wikidata"-scheme identifier whose value is "Q<number>"
    with a number below 90,000,000 (the synthetic range starts there).
    """
    for identifier in inst.get("identifiers", []):
        if not isinstance(identifier, dict):
            continue
        if identifier.get("identifier_scheme") != "Wikidata":
            continue

        value = identifier.get("identifier_value", "")
        if not value.startswith("Q"):
            continue
        try:
            numeric = int(value[1:])
        except ValueError:
            continue
        if numeric < 90000000:
            return True  # Real Wikidata ID

    return False
|
|
|
|
|
|
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """
    Check if institution types are compatible (avoid museum/archive/library mismatches).

    Uses both the institution's formal type and name keywords to validate
    compatibility against the Wikidata type label.

    Args:
        inst_name: Local institution name (keyword-scanned, multilingual).
        inst_type: Local formal type, e.g. 'MUSEUM', 'ARCHIVE', 'LIBRARY'.
        wd_type: Wikidata type label of the candidate (may be empty).

    Returns:
        True if the candidate's type does not contradict the local type.
        An empty Wikidata type is always considered compatible.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()
    formal_type = inst_type.upper()

    # Define type keywords (multilingual)
    museum_keywords = ['museum', 'museo', 'museu', 'musée', 'muzeum', 'muzeu']
    archive_keywords = ['archief', 'archive', 'archivo', 'arquivo', 'archiv', 'arkiv']
    library_keywords = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek', 'bibliotek']
    gallery_keywords = ['gallery', 'galerie', 'galería', 'galleria', 'kunsthal', 'kunsthalle']

    # Check if institution name contains type keyword
    inst_is_museum = any(kw in inst_lower for kw in museum_keywords)
    inst_is_archive = any(kw in inst_lower for kw in archive_keywords)
    inst_is_library = any(kw in inst_lower for kw in library_keywords)
    inst_is_gallery = any(kw in inst_lower for kw in gallery_keywords)

    # Check if Wikidata type contains type keyword
    wd_is_museum = any(kw in wd_lower for kw in museum_keywords)
    wd_is_archive = any(kw in wd_lower for kw in archive_keywords)
    wd_is_library = any(kw in wd_lower for kw in library_keywords)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_keywords)

    # Check formal institution type
    formal_is_museum = formal_type in ('MUSEUM', 'GALLERY', 'BOTANICAL_ZOO')
    formal_is_archive = formal_type == 'ARCHIVE'
    formal_is_library = formal_type == 'LIBRARY'

    # If Wikidata type is empty, allow match (type will be determined by name/formal type)
    if not wd_type or not wd_lower.strip():
        return True

    # If both have explicit types, they must match.
    # Bug fix: inst_is_gallery was previously computed but never used, so a
    # gallery-named institution could match an archive/library candidate;
    # galleries and museums are treated as one compatible family here.
    if (inst_is_museum or inst_is_gallery or formal_is_museum) and not (wd_is_museum or wd_is_gallery):
        return False
    if (inst_is_archive or formal_is_archive) and not wd_is_archive:
        return False
    if (inst_is_library or formal_is_library) and not wd_is_library:
        return False

    return True
|
|
|
|
|
|
def isil_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for ISIL code cross-reference (highest confidence match).

    Returns:
        1.0 if ISIL codes match exactly
        None if no ISIL match
    """
    local_isil = next(
        (obj.get("identifier_value", "").strip()
         for obj in inst.get("identifiers", [])
         if isinstance(obj, dict) and obj.get("identifier_scheme") == "ISIL"),
        None,
    )
    remote_isil = wd_data.get("identifiers", {}).get("ISIL", "").strip()

    # Both sides must carry a non-empty ISIL and agree exactly.
    if local_isil and remote_isil and local_isil == remote_isil:
        return 1.0  # Perfect match via ISIL
    return None
|
|
|
|
|
|
def viaf_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for VIAF ID cross-reference (high confidence match).

    Returns:
        0.98 if VIAF IDs match exactly
        None if no VIAF match
    """
    local_viaf = next(
        (obj.get("identifier_value", "").strip()
         for obj in inst.get("identifiers", [])
         if isinstance(obj, dict) and obj.get("identifier_scheme") == "VIAF"),
        None,
    )
    remote_viaf = wd_data.get("identifiers", {}).get("VIAF", "").strip()

    # Both sides must carry a non-empty VIAF ID and agree exactly.
    if local_viaf and remote_viaf and local_viaf == remote_viaf:
        return 0.98  # Very high confidence via VIAF
    return None
|
|
|
|
|
|
def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any], str]]:
    """
    Fuzzy match institutions with Wikidata results using multiple strategies.

    Matching strategies (in priority order):
    1. ISIL code cross-reference (confidence: 1.0)
    2. VIAF ID cross-reference (confidence: 0.98)
    3. Fuzzy name matching (confidence: similarity score)

    Institutions without a name, or that already carry a real Wikidata ID,
    are skipped. Candidates failing institution_type_compatible() are never
    considered.

    Args:
        institutions: Local institution records to match.
        wikidata_results: Candidate records keyed by QID, as produced by
            query_wikidata_institutions().
        threshold: Minimum confidence for a match to be reported.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data, match_method),
        where institution_idx indexes into the `institutions` argument.
    """
    matches = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if has_real_wikidata_id(inst):
            continue

        # Find best match using multiple strategies
        best_score = 0.0
        best_qid = None
        best_data = None
        best_method = "fuzzy_name_match"

        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue

            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue

            # Strategy 1: ISIL cross-reference (highest confidence)
            isil_score = isil_cross_reference_match(inst, wd_data)
            if isil_score:
                best_score = isil_score
                best_qid = qid
                best_data = wd_data
                best_method = "isil_cross_reference"
                break  # Perfect match, no need to continue

            # Strategy 2: VIAF cross-reference (very high confidence)
            viaf_score = viaf_cross_reference_match(inst, wd_data)
            if viaf_score and viaf_score > best_score:
                best_score = viaf_score
                best_qid = qid
                best_data = wd_data
                best_method = "viaf_cross_reference"

            # Strategy 3: Fuzzy name matching (check primary name and alternatives)
            # NOTE(review): a perfect name match (score 1.0) can override an
            # earlier VIAF match (0.98), so the reported method may read
            # "fuzzy_name_match" even when VIAF also agreed — confirm intended.
            names_to_check = [wd_name] + wd_data.get("alternative_names", [])
            for wd_name_variant in names_to_check:
                score = similarity_score(inst_name, wd_name_variant)
                if score > best_score:
                    best_score = score
                    best_qid = qid
                    best_data = wd_data
                    best_method = "fuzzy_name_match"

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data, best_method))

    return matches
|
|
|
|
|
|
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any], match_method: str, confidence: float) -> bool:
    """
    Enrich an institution record in place with Wikidata data.

    🚨 CRITICAL: This function ONLY adds REAL Wikidata Q-numbers.
    It NEVER generates synthetic Q-numbers (Q90000000 and above are rejected).

    Args:
        inst: Local institution record (mutated in place).
        wd_data: Wikidata metadata as produced by _parse_sparql_bindings().
        match_method: 'isil_cross_reference', 'viaf_cross_reference' or
            'fuzzy_name_match' (used for provenance text).
        confidence: Match confidence score (recorded for fuzzy matches).

    Returns:
        True if institution was enriched
    """
    enriched = False

    # Normalize the identifiers container (may be missing, None or empty).
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    # Snapshot schemes present BEFORE enrichment so Wikidata-sourced
    # identifiers never overwrite locally curated ones.
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Check if Q-number is REAL (not synthetic)
    qid = wd_data["qid"]
    try:
        qid_num = int(qid[1:])
        if qid_num >= 90000000:
            print(f"⚠️ WARNING: Attempted to add synthetic Q-number {qid} - REJECTED")
            return False
    except ValueError:
        print(f"⚠️ WARNING: Invalid Q-number format {qid} - REJECTED")
        return False

    # Add or replace Wikidata ID
    wikidata_idx = None
    for i, id_obj in enumerate(identifiers_list):
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "Wikidata":
            wikidata_idx = i
            break

    wikidata_entry = {
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    }
    if wikidata_idx is not None:
        # Replace existing (possibly synthetic) Wikidata ID
        if identifiers_list[wikidata_idx].get("identifier_value", "") != qid:
            identifiers_list[wikidata_idx] = wikidata_entry
            enriched = True
    else:
        # Add new Wikidata ID
        identifiers_list.append(wikidata_entry)
        enriched = True

    # Add other identifiers from Wikidata (only schemes not already present)
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in existing_schemes:
            continue
        id_obj = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value
        # ISIL gets no URL; it is only added when missing locally.
        identifiers_list.append(id_obj)
        enriched = True

    # Add founding date if missing
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing (only fills the first location in place)
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict):
                if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                    first_loc["latitude"] = wd_data["latitude"]
                    first_loc["longitude"] = wd_data["longitude"]
                    enriched = True

    # Update provenance
    if enriched:
        # Bug fix: the original used inst.get("provenance", {}), which silently
        # discarded the provenance update whenever the key was absent;
        # setdefault stores the new dict back on the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            match_type_desc = {
                "isil_cross_reference": "Wikidata enrichment (ISIL cross-reference)",
                "viaf_cross_reference": "Wikidata enrichment (VIAF cross-reference)",
                "fuzzy_name_match": f"Wikidata enrichment (fuzzy name match, confidence: {confidence:.3f})"
            }.get(match_method, "Wikidata enrichment")

            if existing_method:
                prov["extraction_method"] = f"{existing_method} + {match_type_desc}"
            else:
                prov["extraction_method"] = match_type_desc

            # Update extraction date
            prov["enrichment_date"] = datetime.now(timezone.utc).isoformat()

    return enriched
|
|
|
|
|
|
def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper,
    threshold: float = 0.85,
    dry_run: bool = False
) -> tuple[int, int, dict[str, int]]:
    """
    Process a single country's institutions: query Wikidata, fuzzy match,
    and (unless dry_run) apply matches to `institutions` in place.

    Args:
        institutions: Full list of institution records (mutated in place).
        country_code: Country code; must be a key of COUNTRY_CONFIGS.
        sparql: Configured SPARQLWrapper instance.
        threshold: Minimum fuzzy-match confidence (0.0-1.0).
        dry_run: If True, only preview matches; no records are modified.

    Returns:
        (institutions_without_wikidata, enriched_count, match_methods_stats)
    """
    country_info = COUNTRY_CONFIGS.get(country_code)
    if not country_info:
        print(f"\n⚠️ Unknown country code: {country_code}")
        return 0, 0, {}

    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country.
    # Bug fix: tolerate records whose 'locations' is missing, None, or an
    # empty list — the original indexed [0] on inst.get('locations', [{}])
    # and raised IndexError when the list existed but was empty.
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if (inst.get('locations') or [{}])[0].get('country') == country_code
    ]

    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    # Count those without real Wikidata
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not has_real_wikidata_id(institutions[idx])
    ]

    current_coverage = (len(country_institutions_idx) - len(without_wikidata)) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {len(country_institutions_idx) - len(without_wikidata):,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")

    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0, {}

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print(" (This may take 30-90 seconds)\n")

    # Query for museums, libraries, archives, galleries
    institution_types = ["Q33506", "Q7075", "Q166118", "Q2668072"]
    languages = country_info.get('languages', 'en')
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types, languages)

    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")

    if not wikidata_results:
        print("⚠️ No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0, {}

    # Fuzzy match
    print(f"🔗 Matching institutions (threshold: {threshold:.2f})...\n")

    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=threshold)

    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    # Track match methods
    match_methods_stats = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    if matches:
        # Show sample matches
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 10)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data, method) in enumerate(matches[:10]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Method: {method.upper()}, Confidence: {score:.3f}")
            print(f" Local: {inst.get('name')} ({inst.get('locations', [{}])[0].get('city', 'Unknown')})")
            print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f" Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f" ISIL: {wd_data['identifiers']['ISIL']}")
            if "VIAF" in wd_data.get("identifiers", {}):
                print(f" VIAF: {wd_data['identifiers']['VIAF']}")

        print(f"\n{'='*80}\n")

        if dry_run:
            print("🔍 DRY RUN: Would enrich the following institutions:\n")
            for local_idx, qid, score, wd_data, method in matches:
                inst = country_insts[local_idx]
                print(f" - {inst.get('name')} → {qid} (method: {method}, confidence: {score:.3f})")
            # Bug fix: there is no --no-dry-run flag; advise re-running
            # without --dry-run instead.
            print(f"\n✅ Dry run complete. Re-run without --dry-run to apply changes.\n")
            return len(without_wikidata), 0, {}

        # Apply all matches
        print("✅ Applying all matches...\n")
        enriched_count = 0

        for local_idx, qid, score, wd_data, method in matches:
            # local_idx indexes country_insts; map back to the global list.
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data, method, score):
                enriched_count += 1
                match_methods_stats[method] += 1

        new_coverage = (len(country_institutions_idx) - len(without_wikidata) + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
        print(f"\n📊 Match methods:")
        for method, count in match_methods_stats.items():
            if count > 0:
                print(f" {method}: {count:,}")
        print()

        return len(without_wikidata), enriched_count, match_methods_stats

    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0, {}
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, enrich per country, write output.

    Returns:
        0 on success, 1 on invalid arguments or a missing input file.
    """
    parser = argparse.ArgumentParser(
        description="Enrich heritage institutions with real Wikidata Q-numbers using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dutch and Chilean institutions (priority 1)
  %(prog)s --countries NL CL --threshold 0.85

  # All priority 1 and 2 countries
  %(prog)s --priority 1 2 --threshold 0.85

  # Dry run (preview matches without applying)
  %(prog)s --countries NL --dry-run

  # All countries (not recommended - use priority groups)
  %(prog)s --all-countries --threshold 0.85
"""
    )

    parser.add_argument(
        '--countries',
        nargs='+',
        metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )
    parser.add_argument(
        '--priority',
        nargs='+',
        type=int,
        metavar='N',
        help='Process countries by priority level (1-5)'
    )
    parser.add_argument(
        '--all-countries',
        action='store_true',
        help='Process all configured countries (use with caution)'
    )
    parser.add_argument(
        '--threshold',
        type=float,
        default=0.85,
        help='Fuzzy match threshold (0.0-1.0, default: 0.85)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview matches without applying changes'
    )
    parser.add_argument(
        '--input',
        type=Path,
        help='Input YAML file (default: data/instances/global/global_heritage_institutions_wikidata_enriched.yaml)'
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output YAML file (default: overwrites input or creates new file with _sparql_enriched suffix)'
    )

    args = parser.parse_args()

    # Determine countries to process: explicit list > priority groups >
    # all countries > default (priority 1 only).
    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f" Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # File paths
    base_dir = Path(__file__).parent.parent

    if args.input:
        input_file = args.input
    else:
        input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    # Bug fix: fail fast with a clear message instead of an unhandled
    # FileNotFoundError traceback when the input file is missing.
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        return 1

    if args.output:
        output_file = args.output
    elif args.dry_run:
        output_file = None  # No output for dry run
    else:
        output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA SPARQL ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading dataset: {input_file.name}\n")

    start_time = time.time()

    # Load dataset (an empty YAML file parses to None — normalize to []).
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL (POST avoids URL-length limits on the large queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Wikidata Enrichment)")

    # Process countries
    print(f"\n🌍 Processing {len(countries_to_process)} countries:")
    country_names = [COUNTRY_CONFIGS[c]['name'] for c in countries_to_process]
    print(f" {', '.join(country_names)}\n")

    if args.dry_run:
        print("🔍 DRY RUN MODE: No changes will be saved\n")

    total_without_wikidata = 0
    total_enriched = 0
    total_match_methods = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    for i, country_code in enumerate(countries_to_process):
        without, enriched, methods = process_country(
            institutions,
            country_code,
            sparql,
            threshold=args.threshold,
            dry_run=args.dry_run
        )
        total_without_wikidata += without
        total_enriched += enriched

        for method, count in methods.items():
            total_match_methods[method] += count

        # Rate limiting - be nice to Wikidata
        if i < len(countries_to_process) - 1:
            print("⏸️ Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output (unless dry run)
    if not args.dry_run and total_enriched > 0 and output_file:
        print("="*80)
        print("💾 WRITING ENRICHED DATASET")
        print("="*80 + "\n")

        # YAML comment header; lines must stay flush-left to remain valid YAML.
        header = f"""---
# Global Heritage Institutions - SPARQL Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(countries_to_process)}
# New Wikidata matches: {total_enriched:,}
# Match threshold: {args.threshold:.2f}

"""

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

        print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions enriched: {total_enriched:,}")
    print(f" Institutions still without Wikidata: {total_without_wikidata - total_enriched:,}")

    if total_enriched > 0:
        print(f"\n📊 Enrichment methods:")
        for method, count in total_match_methods.items():
            if count > 0:
                percentage = (count / total_enriched * 100) if total_enriched > 0 else 0
                print(f" {method}: {count:,} ({percentage:.1f}%)")

    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")

    if args.dry_run:
        # Bug fix: there is no --no-dry-run flag; advise re-running without --dry-run.
        print("\n🔍 This was a dry run. Re-run without --dry-run to apply changes.")

    print("="*80 + "\n")

    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s exit code (0 on success, 1 on
# invalid arguments or a missing input file) to the shell.
if __name__ == "__main__":
    sys.exit(main())
|