#!/usr/bin/env python3
"""
Phase 2 Enrichment: Netherlands (NL)

Target: 622 institutions, 31.0% Wikidata coverage → 62%+ (385+ institutions)
Strategy: SPARQL batch query + fuzzy name matching (Dutch normalization)
Based on: Mexico Phase 2 methodology (achieved 50.0% coverage from 17.7%)

GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
"""
from __future__ import annotations

import re
import sys
import time
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# NOTE: third-party dependencies (PyYAML, SPARQLWrapper) are imported lazily
# inside main() so the pure matching helpers below stay importable — and
# testable — without those packages installed. The SPARQLWrapper type hint
# below is lazy thanks to `from __future__ import annotations`.


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching (Dutch + English + German).

    Lower-cases the name, strips common GLAM-sector prefixes/suffixes in
    Dutch, English and German, removes parenthesised abbreviations and
    punctuation, and collapses runs of whitespace.
    """
    name = name.lower()
    # Remove common prefixes/suffixes (Dutch + English + German).
    # Each substitution is applied once, so e.g. "stichting museum x"
    # only loses the leading "stichting".
    name = re.sub(r'^(stichting|museum|bibliotheek|archief|centrum|galerie|verzameling)\s+', '', name)
    name = re.sub(r'\s+(museum|bibliotheek|archief|nationaal|regionaal|gemeentelijk|provinciaal|stedelijk)$', '', name)
    name = re.sub(r'^(foundation|museum|library|archive|center|centre|gallery|collection)\s+', '', name)
    name = re.sub(r'\s+(museum|library|archive|national|regional|municipal|provincial|city)$', '', name)
    name = re.sub(r'^(stiftung|bibliothek|archiv|zentrum|galerie|sammlung)\s+', '', name)
    name = re.sub(r'\s+(national|regional|städtisch)$', '', name)
    # Remove abbreviations in parentheses
    name = re.sub(r'\s*\([^)]*\)\s*', ' ', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) on their normalized forms."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def query_wikidata_dutch_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in the Netherlands.

    Institution types: museums, libraries, archives, galleries, universities
    with collections.

    Args:
        sparql: A configured SPARQLWrapper pointed at the Wikidata endpoint
            (JSON return format expected).

    Returns:
        Dict mapping QID -> parsed record with name, description, type,
        identifiers (ISIL/VIAF/Website), and optional founding date and
        coordinates. Empty dict if the query fails.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 }
      ?item wdt:P31/wdt:P279* ?type .  # instance of (or subclass of) institution type
      ?item wdt:P17 wd:Q55 .  # country = Netherlands (Q55)
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en,de" . }
    }
    LIMIT 5000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse results into dict keyed by QID
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    # WKT point literals are "Point(lon lat)" — longitude first.
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[qid] = result
        return results
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}


def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    Local type is inferred from keywords in the institution name OR the
    explicit `inst_type` code; the Wikidata type is inferred from keywords
    in its (nl/en/de) type label. MIXED/OFFICIAL_INSTITUTION/
    EDUCATION_PROVIDER institutions are allowed to match anything.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()

    museum_kw = ['museum', 'museu', 'museo']
    archive_kw = ['archief', 'archive', 'archivo', 'arquivo']
    library_kw = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek']
    gallery_kw = ['galerie', 'gallery', 'galería', 'galeria']

    inst_is_museum = any(kw in inst_lower for kw in museum_kw) or inst_type == "MUSEUM"
    inst_is_archive = any(kw in inst_lower for kw in archive_kw) or inst_type == "ARCHIVE"
    inst_is_library = any(kw in inst_lower for kw in library_kw) or inst_type == "LIBRARY"
    inst_is_gallery = any(kw in inst_lower for kw in gallery_kw) or inst_type == "GALLERY"

    wd_is_museum = any(kw in wd_lower for kw in museum_kw)
    wd_is_archive = any(kw in wd_lower for kw in archive_kw)
    wd_is_library = any(kw in wd_lower for kw in library_kw)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_kw)

    # MIXED, OFFICIAL_INSTITUTION, EDUCATION_PROVIDER can match any type
    if inst_type in ["MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"]:
        return True

    # If both have explicit types, they must match
    if inst_is_museum and not wd_is_museum:
        return False
    if inst_is_archive and not wd_is_archive:
        return False
    if inst_is_library and not wd_is_library:
        return False
    if inst_is_gallery and not wd_is_gallery:
        return False
    return True


def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Dutch institutions with Wikidata results.

    Institutions that already carry a real (Q-prefixed) Wikidata identifier
    are skipped. For each remaining institution the single best
    type-compatible candidate is kept if it reaches `threshold`.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        has_wikidata = any(
            id_obj.get("identifier_scheme") == "Wikidata"
            and id_obj.get("identifier_value", "").startswith("Q")
            for id_obj in inst.get("identifiers", []) or []
        )
        if has_wikidata:
            continue

        # Find best match
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue
            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))
    return matches


def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    Adds the Wikidata QID plus any ISIL/VIAF/Website identifiers not already
    present, and records the enrichment in the institution's provenance.
    """
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Update provenance with enrichment metadata
    if enriched:
        # BUG FIX: was `inst.get("provenance", {})`, which silently discarded
        # the enrichment history whenever the institution had no provenance
        # dict yet. setdefault attaches the dict to the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Netherlands Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Netherlands Wikidata enrichment"

            # Add enrichment history
            if "enrichment_history" not in prov:
                prov["enrichment_history"] = []
            prov["enrichment_history"].append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Dutch normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })
    return enriched


def main():
    """Run the Phase 2 Netherlands enrichment pipeline end to end.

    Loads the master YAML dataset, backs it up, queries Wikidata for Dutch
    heritage institutions, fuzzy-matches them against local records, applies
    the matches in place, and rewrites the master file.
    """
    # Third-party imports deferred here (see module-level NOTE).
    import yaml
    from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON

    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_netherlands_backup"

    print("="*80)
    print("🇳🇱 PHASE 2 NETHERLANDS WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")
    start_time = time.time()

    # Load master dataset
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Locate Netherlands institutions in a single pass; keep both the master
    # indices (for writing back) and the record references (for matching).
    # (Previously this predicate was duplicated in two list comprehensions.)
    netherlands_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'NL' for loc in inst['locations'])
    ]
    netherlands_institutions = [all_institutions[i] for i in netherlands_indices]

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(netherlands_institutions):,} Dutch institutions\n")

    # Count Wikidata coverage
    with_wikidata = sum(
        1 for inst in netherlands_institutions
        if inst.get('identifiers') and any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in inst['identifiers']
        )
    )
    without_wikidata = len(netherlands_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(netherlands_institutions) * 100) if netherlands_institutions else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")

    if without_wikidata == 0:
        print("✨ All Dutch institutions already have Wikidata IDs!")
        return

    # Create backup
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    print(f"✅ Backup created\n")

    # Query Wikidata
    print("🔍 Querying Wikidata for Dutch heritage institutions...")
    print("   (This may take 30-60 seconds)\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Netherlands Enrichment)")

    query_start = time.time()
    wikidata_results = query_wikidata_dutch_institutions(sparql)
    query_time = time.time() - query_start
    print(f"✅ Found {len(wikidata_results):,} Dutch institutions in Wikidata (query took {query_time:.1f}s)\n")

    if not wikidata_results:
        print("⚠️ No Wikidata results, aborting enrichment")
        return

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.70, Dutch normalization)...\n")
    matches = fuzzy_match_institutions(netherlands_institutions, wikidata_results, threshold=0.70)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return

    # Show sample matches
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")
    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)
    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = netherlands_institutions[idx]
        loc = inst.get("locations", [{}])[0]
        city = loc.get("city", loc.get("region", "Unknown"))
        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local:    {inst.get('name')} ({city})")
        print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type:     {wd_data.get('type', 'Unknown')}")
    print(f"\n{'='*80}\n")

    # Apply matches to master dataset
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0
    for local_idx, qid, score, wd_data in matches:
        master_idx = netherlands_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1

    new_coverage = (with_wikidata + enriched_count) / len(netherlands_institutions) * 100
    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")

    # Write updated master dataset
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")
    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)
    write_time = time.time() - write_start
    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")

    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Dutch institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 62%+ coverage (385+ institutions)")
    if new_coverage >= 62:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 62%)")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()