#!/usr/bin/env python3
"""
Enrich Dutch institutions using fuzzy name matching in Wikidata.

This script addresses the low Dutch coverage (4.8%) by querying Wikidata
for Dutch heritage institutions using name-based searches rather than ISIL codes.

Strategy:
1. Find Dutch institutions without Wikidata IDs
2. Query Wikidata for museums/archives/libraries in Netherlands
3. Fuzzy match names (normalized)
4. Print a sample of the high-confidence matches (>= 0.85) for review and
   apply all of them automatically
"""

import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional
from datetime import datetime, timezone
import time
import traceback
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

if TYPE_CHECKING:
    # Only needed for annotations; the runtime import happens in main() so the
    # pure matching helpers can be imported without the SPARQL/YAML packages.
    from SPARQLWrapper import SPARQLWrapper  # type: ignore


WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

# Minimum similarity for a match to be applied.
MATCH_THRESHOLD = 0.85

# A Wikidata identifier only counts as "present" when its numeric part is below
# this bound. NOTE(review): presumably larger numbers are synthetic placeholder
# QIDs produced by an earlier pipeline step - confirm.
MAX_REAL_QID = 100_000_000

ENRICHMENT_NOTE = "Wikidata enrichment (fuzzy name match)"

_QID_RE = re.compile(r"Q(\d+)", re.ASCII)
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_PREFIX_RE = re.compile(r'^(stichting|gemeentearchief|regionaal archief|museum)\s+')
_SUFFIX_RE = re.compile(r'\s+(archief|museum|bibliotheek|library|archive)$')

# Keyword groups used to detect museum / archive / library in names and labels.
_TYPE_KEYWORD_GROUPS = (
    ('museum', 'museo', 'museu'),
    ('archief', 'archive', 'archivo'),
    ('bibliotheek', 'library', 'biblioteca'),
)


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    The name is lowercased, punctuation is replaced by spaces, whitespace is
    collapsed, and then one common leading word (e.g. "stichting", "museum")
    and one common trailing word (e.g. "archief", "library") are removed.

    Punctuation and whitespace are cleaned up *before* the prefix/suffix
    stripping so that stray spaces or punctuation around the name do not
    prevent the prefix/suffix from being recognised.
    """
    cleaned = _PUNCTUATION_RE.sub(' ', name.lower())
    cleaned = ' '.join(cleaned.split())
    cleaned = _PREFIX_RE.sub('', cleaned)
    cleaned = _SUFFIX_RE.sub('', cleaned)
    return cleaned


def _normalized_ratio(norm1: str, norm2: str) -> float:
    """Return the SequenceMatcher ratio of two already-normalized names.

    An empty name carries no information, so it never matches anything
    (SequenceMatcher alone would score two empty strings as 1.0).
    """
    if not norm1 or not norm2:
        return 0.0
    return SequenceMatcher(None, norm1, norm2).ratio()


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) after normalization."""
    return _normalized_ratio(normalize_name(name1), normalize_name(name2))


def has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if *inst* carries a Wikidata QID numbered below MAX_REAL_QID.

    Missing/null identifier lists, non-dict entries and malformed values
    (e.g. ``None`` or ``"Qabc"``) are treated as "no Wikidata ID" instead of
    raising.
    """
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict):
            continue
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        number = _qid_number(id_obj.get("identifier_value"))
        if number is not None and number < MAX_REAL_QID:
            return True
    return False


def _qid_number(value: Any) -> Optional[int]:
    """Return the numeric part of a ``Q123``-style identifier, or None."""
    if not isinstance(value, str):
        return None
    match = _QID_RE.fullmatch(value)
    return int(match.group(1)) if match else None


def _first_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the first location dict of *inst*, or an empty dict if absent."""
    locations = inst.get("locations")
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def _parse_wkt_point(text: str) -> Optional[tuple[float, float]]:
    """Parse a ``Point(lon lat)`` literal into ``(lat, lon)``; None if malformed."""
    if not (text.startswith("Point(") and text.endswith(")")):
        return None
    try:
        lon, lat = (float(part) for part in text[6:-1].split())
    except ValueError:
        # Wrong number of components or non-numeric component.
        return None
    return lat, lon


def _parse_binding(binding: dict[str, Any]) -> Optional[dict[str, Any]]:
    """Convert one SPARQL result binding into a result record.

    Returns None when the binding has no usable ``Q...`` item identifier.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else ""
    if not qid.startswith("Q"):
        return None

    record: dict[str, Any] = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "type": binding.get("typeLabel", {}).get("value", ""),
        "identifiers": {},
    }

    for key, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
        if key in binding:
            record["identifiers"][scheme] = binding[key]["value"]

    if "inception" in binding:
        # Keep only the date part of an ISO timestamp.
        record["founding_date"] = binding["inception"]["value"].split("T")[0]

    if "coords" in binding:
        point = _parse_wkt_point(binding["coords"]["value"])
        if point is not None:
            record["latitude"], record["longitude"] = point

    return record


def query_dutch_institutions(
    sparql: "SPARQLWrapper",
    institution_types: list[str],
    limit: int = 2000,
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for Dutch heritage institutions.

    Args:
        sparql: Configured SPARQL client; must provide ``setQuery`` and
            ``query().convert()``.
        institution_types: List of Wikidata QIDs for institution types, e.g.
            Q33506 - museum
            Q7075 - library
            Q166118 - archive
        limit: Maximum number of result rows requested from the endpoint.

    Returns:
        Dict keyed by QID. Each value has ``qid``, ``name``, ``description``,
        ``type`` and ``identifiers`` and, when available, ``founding_date``,
        ``latitude`` and ``longitude``. If several rows share a QID the last
        row wins. Returns an empty dict when the query fails (the error is
        printed) or when *institution_types* is empty.
    """
    if not institution_types:
        return {}

    types_values = " ".join(f"wd:{qid}" for qid in institution_types)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {{
      VALUES ?type {{ {types_values} }}

      ?item wdt:P31 ?type .          # instance of museum/library/archive
      ?item wdt:P17 wd:Q55 .         # country: Netherlands

      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en" . }}
    }}
    LIMIT {int(limit)}
    """

    sparql.setQuery(query)

    # Only the network call is guarded: a broad handler is appropriate at this
    # boundary, but parsing problems must not be reported as query failures.
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        traceback.print_exc()
        return {}

    bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

    # Parse results into dict keyed by QID
    results: dict[str, dict[str, Any]] = {}
    for binding in bindings:
        record = _parse_binding(binding)
        if record is not None:
            results[record["qid"]] = record
    return results


def institution_type_compatible(inst_name: str, wd_type: str) -> bool:
    """Check if institution types are compatible (avoid museum/archive mismatches).

    For each type (museum, archive, library): if the institution name contains
    one of that type's keywords, the Wikidata type label must contain one too.
    Names without any type keyword are compatible with every Wikidata type.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()

    for keywords in _TYPE_KEYWORD_GROUPS:
        name_has_type = any(kw in inst_lower for kw in keywords)
        label_has_type = any(kw in wd_lower for kw in keywords)
        if name_has_type and not label_has_type:
            return False
    return True


def fuzzy_match_dutch_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match Dutch institutions with Wikidata results.

    Institutions without a name, or that already have a real Wikidata ID, are
    skipped. For every other institution the type-compatible Wikidata entry
    with the highest name similarity is selected (first one wins on ties) and
    kept if its score reaches *threshold*.

    NOTE(review): the same Wikidata entry can be returned for more than one
    institution; no de-duplication is performed.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data)
    """
    # Normalize every Wikidata name once instead of once per institution.
    candidates = [
        (qid, wd_data, wd_data.get("type") or "", normalize_name(wd_data["name"]))
        for qid, wd_data in wikidata_results.items()
        if wd_data.get("name")
    ]

    matches: list[tuple[int, str, float, dict[str, Any]]] = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        if not inst_name or has_real_wikidata_id(inst):
            continue

        inst_norm = normalize_name(inst_name)

        # Find best match
        best_score = 0.0
        best_qid: Optional[str] = None
        best_data: Optional[dict[str, Any]] = None

        for qid, wd_data, wd_type, wd_norm in candidates:
            if not institution_type_compatible(inst_name, wd_type):
                continue
            score = _normalized_ratio(inst_norm, wd_norm)
            if score > best_score:
                best_score, best_qid, best_data = score, qid, wd_data

        # Only include matches above threshold
        if best_qid is not None and best_data is not None and best_score >= threshold:
            matches.append((idx, best_qid, best_score, best_data))

    return matches


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution in place with Wikidata data.

    Adds the Wikidata identifier and any ISIL/VIAF/Website identifiers whose
    scheme is not present yet, the founding date if missing, and coordinates
    on the first location if it has no latitude. When anything was added, the
    enrichment is recorded in ``provenance.extraction_method``.

    Returns:
        True if at least one field was added.
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID.
    # NOTE(review): an existing Wikidata entry that fails has_real_wikidata_id()
    # (malformed or >= MAX_REAL_QID) is left untouched here, so such a record is
    # matched but does not receive the new QID - confirm this is intended.
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in existing_schemes:
            continue
        id_obj = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value
        identifiers_list.append(id_obj)
        enriched = True

    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Update provenance
    if enriched:
        prov = inst.get("provenance")
        if prov is None:
            # Attach a fresh dict so the note is stored on the record rather
            # than written to a temporary that is thrown away.
            prov = inst["provenance"] = {}
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + {ENRICHMENT_NOTE}"
            else:
                prov["extraction_method"] = ENRICHMENT_NOTE

    return enriched


def _print_sample_matches(
    matches: list[tuple[int, str, float, dict[str, Any]]],
    dutch_insts: list[dict[str, Any]],
    sample_size: int = 10,
) -> None:
    """Print the first *sample_size* matches so they can be eyeballed."""
    print("="*80)
    print("📋 SAMPLE MATCHES (Top 10)")
    print("="*80)

    for i, (local_idx, qid, score, wd_data) in enumerate(matches[:sample_size]):
        inst = dutch_insts[local_idx]
        print(f"\n{i+1}. Confidence: {score:.3f}")
        print(f" Local: {inst.get('name')} ({_first_location(inst).get('city', 'Unknown')})")
        print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f" Type: {wd_data.get('type', 'Unknown')}")
        if "ISIL" in wd_data.get("identifiers", {}):
            print(f" ISIL: {wd_data['identifiers']['ISIL']}")

    print("\n" + "="*80)


def main() -> None:
    """Load the dataset, match Dutch institutions against Wikidata, and write the result.

    Reads ``global_heritage_institutions_wikidata_enriched.yaml``, enriches
    every Dutch institution that lacks a real Wikidata ID and has a match at
    or above MATCH_THRESHOLD, and writes the full dataset to
    ``global_heritage_institutions_dutch_enriched.yaml``. Nothing is written
    when no match is found.
    """
    # Third-party dependencies are imported here, where they are used, so that
    # importing this module for its matching helpers does not require them.
    import yaml
    from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data" / "instances" / "global"
    input_file = data_dir / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = data_dir / "global_heritage_institutions_dutch_enriched.yaml"

    print("="*80)
    print("🇳🇱 DUTCH INSTITUTIONS FUZZY MATCHING")
    print("="*80)
    print("\n📖 Loading dataset...\n")

    start_time = time.time()

    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None.
        institutions = yaml.safe_load(f) or []

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")

    # Filter Dutch institutions (country is read from the first location)
    dutch_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if isinstance(inst, dict) and _first_location(inst).get('country') == 'NL'
    ]
    print(f"🇳🇱 Found {len(dutch_institutions_idx):,} Dutch institutions\n")

    # Count those without Wikidata
    dutch_without_wikidata = [
        idx for idx in dutch_institutions_idx
        if not has_real_wikidata_id(institutions[idx])
    ]
    print(f"❓ Dutch institutions without Wikidata: {len(dutch_without_wikidata):,}\n")

    # Setup SPARQL
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Query Wikidata for Dutch institutions
    print("🔍 Querying Wikidata for Dutch museums, libraries, and archives...")
    print(" (This may take 30-60 seconds)\n")

    institution_types = ["Q33506", "Q7075", "Q166118"]  # museum, library, archive
    wikidata_results = query_dutch_institutions(sparql, institution_types)
    print(f"✅ Found {len(wikidata_results):,} Dutch institutions in Wikidata\n")

    # Fuzzy match
    print(f"🔗 Fuzzy matching names (threshold: {MATCH_THRESHOLD})...\n")
    dutch_insts = [institutions[idx] for idx in dutch_without_wikidata]
    matches = fuzzy_match_dutch_institutions(dutch_insts, wikidata_results, threshold=MATCH_THRESHOLD)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold.\n")
        return

    # Show sample matches for verification
    _print_sample_matches(matches, dutch_insts)

    print("\n⚠️ AUTOMATIC APPLICATION")
    print("="*80)
    print(f"\nApplying all high-confidence matches (>{MATCH_THRESHOLD} similarity)...")

    # Apply all matches. dutch_insts holds the same dict objects as
    # institutions, so enriching them updates the full dataset in place.
    print("\n✅ Applying all matches...\n")
    enriched_count = sum(
        1 for local_idx, _qid, _score, wd_data in matches
        if enrich_institution(dutch_insts[local_idx], wd_data)
    )
    print(f"✨ Enriched {enriched_count:,} institutions\n")

    # Write output
    print("💾 Writing enriched dataset...")

    header = f"""---
# Global Heritage Institutions - Dutch Fuzzy Match Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Dutch institutions: {len(dutch_institutions_idx):,}
# New Dutch matches: {enriched_count:,}

"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

    print(f"✅ Complete! Output: {output_file}\n")

    # Final report. Coverage is derived from the loaded data; total_dutch is
    # non-zero here because at least one Dutch institution was matched.
    total_dutch = len(dutch_institutions_idx)
    linked_before = total_dutch - len(dutch_without_wikidata)
    coverage_before = linked_before / total_dutch * 100
    coverage_after = (linked_before + enriched_count) / total_dutch * 100

    print("="*80)
    print("📊 ENRICHMENT REPORT")
    print("="*80)
    print("\n✨ Results:")
    print(f" Dutch institutions enriched: {enriched_count:,}")
    print(f" New Dutch Wikidata coverage: {coverage_after:.1f}%")
    print(f" (was {coverage_before:.1f}%, now {coverage_after:.1f}%)")
    print(f"\n⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()