#!/usr/bin/env python3
"""
Enrich Swiss custodian files with Wikidata Q-numbers using fuzzy name matching.

Uses Wikidata SPARQL endpoint to find matching institutions by name + location.
Writes enrichment data directly to individual custodian YAML files.

Process:
1. Query Wikidata for ALL Swiss heritage institutions
2. Load CH-*.yaml files without wikidata_enrichment
3. Fuzzy match by name + city location
4. Add Wikidata identifiers to matched files
5. Mark with enrichment_version: 2.1_generic

Usage:
    python scripts/enrich_swiss_wikidata_fuzzy.py [--limit N] [--dry-run] [--threshold N]
"""

import argparse
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import yaml
from rapidfuzz import fuzz

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Switzerland Wikidata ID
SWITZERLAND_QID = "Q39"

# Languages for Swiss institutions (German, French, Italian, Romansh, English)
SWISS_LANGUAGES = "de,fr,it,rm,en"

# Default similarity threshold
DEFAULT_THRESHOLD = 85.0


def query_wikidata_swiss_institutions() -> List[Dict]:
    """
    Query Wikidata for ALL Swiss heritage institutions.

    Returns:
        List of dicts with: qid, label, type, location, coordinates, isil, viaf.
        Empty list on any query failure (timeout, HTTP error, parse error).
    """
    # Simplified SPARQL query - direct instance of, no subclass traversal.
    # This is much faster and avoids timeouts.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?typeLabel ?locationLabel ?coords ?isil ?viaf
    WHERE {{
      # Direct instance of heritage institution types (faster than subclass traversal)
      VALUES ?type {{
        wd:Q33506     # museum
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q207694    # art museum
        wd:Q17431399  # natural history museum
        wd:Q3329412   # cantonal archive
        wd:Q2668072   # cantonal library
        wd:Q856584    # research library
      }}

      # Direct instance of (no subclass traversal for speed)
      ?item wdt:P31 ?type .

      # Located in Switzerland
      ?item wdt:P17 wd:{SWITZERLAND_QID} .

      # Optional: specific location (city/town/canton)
      OPTIONAL {{ ?item wdt:P131 ?location }}

      # Optional: coordinates
      OPTIONAL {{ ?item wdt:P625 ?coords }}

      # Optional: ISIL code
      OPTIONAL {{ ?item wdt:P791 ?isil }}

      # Optional: VIAF ID
      OPTIONAL {{ ?item wdt:P214 ?viaf }}

      # Get labels in Swiss languages + English
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{SWISS_LANGUAGES}" }}
    }}
    LIMIT 10000
    """

    print("Querying Wikidata for Swiss heritage institutions...")
    print(f" Endpoint: {WIKIDATA_SPARQL}")
    print(f" Languages: {SWISS_LANGUAGES}")

    # WDQS requires a descriptive User-Agent; ask for JSON results explicitly.
    headers = {
        'User-Agent': 'GLAM-Data-Extraction/0.2.1 (Swiss heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=120  # Generous timeout for large query
        )
        response.raise_for_status()
        data = response.json()

        # Parse results
        institutions = []
        seen_qids = set()  # Deduplicate by QID

        for binding in data['results']['bindings']:
            # Item URI looks like http://www.wikidata.org/entity/Q123 -> take "Q123"
            qid = binding['item']['value'].split('/')[-1]

            # Skip duplicates (same institution may have multiple types)
            if qid in seen_qids:
                continue
            seen_qids.add(qid)

            label = binding['itemLabel']['value']
            # OPTIONAL clauses may be absent from a binding; default to ''
            inst_type = binding.get('typeLabel', {}).get('value', '')
            location = binding.get('locationLabel', {}).get('value', '')
            coords = binding.get('coords', {}).get('value', '')
            isil = binding.get('isil', {}).get('value', '')
            viaf = binding.get('viaf', {}).get('value', '')

            institutions.append({
                'qid': qid,
                'label': label,
                'type': inst_type,
                'location': location,
                'coordinates': coords,
                'isil': isil,
                'viaf': viaf
            })

        print(f" Found {len(institutions)} unique institutions in Wikidata")
        return institutions

    except requests.exceptions.Timeout:
        print("ERROR: Wikidata query timed out. Try again later.")
        return []
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to query Wikidata: {e}")
        return []
    except Exception as e:
        # Catch-all so a malformed response doesn't crash the whole run
        print(f"ERROR: Unexpected error: {e}")
        return []


def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    wikidata_results: List[Dict],
    threshold: float = DEFAULT_THRESHOLD
) -> Optional[Tuple[Dict, float]]:
    """
    Fuzzy match institution to Wikidata results.

    Uses a two-pass algorithm:
    1. First try to find matches with BOTH name and location match (high confidence)
    2. If no location match, fall back to name-only match with higher threshold

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location ('' or None when unknown)
        wikidata_results: List of Wikidata query results
        threshold: Minimum similarity threshold (0-100)

    Returns:
        Tuple of (matched_wikidata_record, confidence_score) or None
    """
    best_match = None
    best_score = 0.0
    best_has_location_match = False

    # Normalize our institution name
    inst_name_lower = inst_name.lower().strip()
    inst_city_lower = inst_city.lower().strip() if inst_city else ''

    for wd in wikidata_results:
        wd_label_lower = wd['label'].lower().strip()
        wd_location_lower = wd.get('location', '').lower()

        # Name similarity using token sort ratio (handles word reordering)
        name_score = fuzz.token_sort_ratio(inst_name_lower, wd_label_lower)

        # Check for location match
        location_match = False
        location_boost = 0
        # BUGFIX: the label-based check below used to be unreachable for
        # Wikidata records without a P131 location, because the whole branch
        # was guarded on wd_location_lower being non-empty. Only the
        # location-field comparisons need that guard.
        if inst_city_lower:
            # Exact city name match in location
            if wd_location_lower and inst_city_lower in wd_location_lower:
                location_match = True
                location_boost = 10
            # Also check if city name is IN the Wikidata label itself (e.g., "Stadtarchiv Aarau")
            elif inst_city_lower in wd_label_lower:
                location_match = True
                location_boost = 8
            # Fuzzy location match
            elif wd_location_lower and fuzz.partial_ratio(inst_city_lower, wd_location_lower) > 90:
                location_match = True
                location_boost = 5

        # If we have a city but Wikidata label contains a DIFFERENT city, penalize
        if inst_city_lower and not location_match:
            # Check if Wikidata label contains a different Swiss city
            # Common Swiss cities that might cause false matches
            swiss_cities = ['zürich', 'zurich', 'bern', 'basel', 'geneva', 'genf',
                            'lausanne', 'luzern', 'lucerne', 'aarau', 'aarburg',
                            'winterthur', 'st. gallen', 'lugano', 'biel', 'thun',
                            'köniz', 'chur', 'schaffhausen', 'fribourg']
            for city in swiss_cities:
                if city in wd_label_lower and city != inst_city_lower:
                    # Different city mentioned in Wikidata label - big penalty
                    name_score = max(0, name_score - 20)
                    break

        # Combined score (capped at 100)
        total_score = min(name_score + location_boost, 100)

        # Prefer matches with location confirmation
        is_better = False
        if total_score >= threshold:
            if location_match and not best_has_location_match:
                # Location match beats non-location match
                is_better = True
            elif location_match == best_has_location_match and total_score > best_score:
                # Same location status, higher score wins
                is_better = True

        if is_better:
            best_score = total_score
            best_match = wd
            best_has_location_match = location_match

    # For matches without location confirmation, require higher threshold
    if best_match and not best_has_location_match:
        # Require 95% name match if no location confirmation
        if best_score < 95:
            return None

    if best_match:
        return (best_match, best_score)
    return None


def load_unenriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Tuple[Path, Dict]]:
    """
    Load CH-*.yaml files that don't have wikidata_enrichment.

    Args:
        custodian_dir: Path to data/custodian directory
        limit: Optional limit on number of files to load

    Returns:
        List of (file_path, data_dict) tuples
    """
    files = []
    ch_files = sorted(custodian_dir.glob("CH-*.yaml"))

    print(f"Scanning {len(ch_files)} CH-*.yaml files...")

    for filepath in ch_files:
        if limit and len(files) >= limit:
            break

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Skip if already has wikidata_enrichment
            if data.get('wikidata_enrichment'):
                continue

            # Skip if already has Wikidata identifier
            has_wikidata = any(
                identifier.get('identifier_scheme') == 'Wikidata'
                for identifier in data.get('identifiers', [])
            )
            if has_wikidata:
                continue

            files.append((filepath, data))

        except Exception as e:
            # Best-effort: a malformed file is reported and skipped, not fatal
            print(f" Warning: Could not load {filepath.name}: {e}")

    print(f" Found {len(files)} files needing Wikidata enrichment")
    return files


def save_enriched_file(filepath: Path, data: Dict) -> bool:
    """Save enriched data back to YAML file.

    Returns True on success, False (with an error message) on failure.
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(
                data, f,
                allow_unicode=True,       # keep umlauts/accents readable
                sort_keys=False,          # preserve insertion order
                default_flow_style=False,
                width=120
            )
        return True
    except Exception as e:
        print(f" ERROR saving {filepath.name}: {e}")
        return False


def enrich_with_wikidata(
    limit: Optional[int] = None,
    dry_run: bool = False,
    threshold: float = DEFAULT_THRESHOLD
):
    """Main enrichment workflow.

    Args:
        limit: Optional cap on number of files to process (for testing).
        dry_run: When True, report matches but write nothing.
        threshold: Minimum fuzzy-match similarity (0-100).
    """
    print("=" * 80)
    print("SWISS INSTITUTIONS - WIKIDATA FUZZY MATCHING ENRICHMENT")
    print("=" * 80)
    print()

    # Setup paths
    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"
    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        sys.exit(1)

    # Query Wikidata for Swiss institutions
    wikidata_results = query_wikidata_swiss_institutions()
    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        sys.exit(1)

    print()

    # Load unenriched files
    files_to_enrich = load_unenriched_files(custodian_dir, limit)
    if not files_to_enrich:
        print("No files need enrichment. Exiting.")
        return

    print()
    print(f"Fuzzy matching {len(files_to_enrich)} institutions...")
    print(f" Match threshold: {threshold}%")
    print(f" Dry run: {dry_run}")
    print()

    # Statistics
    matched = 0
    high_confidence = 0
    low_confidence = 0
    saved = 0
    errors = 0

    # One timestamp for the whole run, timezone-aware (UTC)
    timestamp = datetime.now(timezone.utc).isoformat()

    for idx, (filepath, data) in enumerate(files_to_enrich, 1):
        # Progress indicator
        if idx % 50 == 0 or idx == len(files_to_enrich):
            print(f" [{idx}/{len(files_to_enrich)}] Matched: {matched}, Saved: {saved}")

        # Extract institution name (preferred claim first, raw entry as fallback)
        inst_name = None
        if data.get('custodian_name', {}).get('claim_value'):
            inst_name = data['custodian_name']['claim_value']
        elif data.get('original_entry', {}).get('name'):
            inst_name = data['original_entry']['name']

        if not inst_name:
            continue

        # Extract city from the first available of three known locations in the record
        inst_city = ''
        if data.get('location', {}).get('city'):
            inst_city = data['location']['city']
        elif data.get('ghcid', {}).get('location_resolution', {}).get('city_name'):
            inst_city = data['ghcid']['location_resolution']['city_name']
        elif data.get('original_entry', {}).get('locations'):
            locs = data['original_entry']['locations']
            if locs and isinstance(locs, list) and locs[0].get('city'):
                inst_city = locs[0]['city']

        # Fuzzy match
        match_result = fuzzy_match_institution(
            inst_name, inst_city, wikidata_results, threshold=threshold
        )

        if not match_result:
            continue

        matched_wd, confidence = match_result
        matched += 1
        if confidence >= 95:
            high_confidence += 1
        else:
            low_confidence += 1

        if dry_run:
            print(f" [DRY RUN] Would match: {inst_name}")
            print(f" -> {matched_wd['qid']} ({matched_wd['label']}) [{confidence:.1f}%]")
            continue

        # Add Wikidata enrichment
        data['wikidata_enrichment'] = {
            'wikidata_id': matched_wd['qid'],
            'wikidata_label': matched_wd['label'],
            'wikidata_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}",
            'enrichment_date': timestamp,
            'enrichment_version': '2.1_generic',
            'enrichment_method': 'wikidata_fuzzy_match',
            'match_confidence': round(confidence, 1),
            'match_location': matched_wd.get('location', ''),
        }

        # Add ISIL if available from Wikidata
        if matched_wd.get('isil'):
            # Check if already has this exact ISIL
            has_isil = any(
                i.get('identifier_scheme') == 'ISIL'
                and i.get('identifier_value') == matched_wd['isil']
                for i in data.get('identifiers', [])
            )
            if not has_isil:
                if 'identifiers' not in data:
                    data['identifiers'] = []
                data['identifiers'].append({
                    'identifier_scheme': 'ISIL',
                    'identifier_value': matched_wd['isil'],
                    'identifier_source': 'wikidata'
                })

        # Add VIAF if available
        if matched_wd.get('viaf'):
            has_viaf = any(
                i.get('identifier_scheme') == 'VIAF'
                for i in data.get('identifiers', [])
            )
            if not has_viaf:
                if 'identifiers' not in data:
                    data['identifiers'] = []
                data['identifiers'].append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': matched_wd['viaf'],
                    'identifier_url': f"https://viaf.org/viaf/{matched_wd['viaf']}",
                    'identifier_source': 'wikidata'
                })

        # Add Wikidata identifier
        has_wd_id = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in data.get('identifiers', [])
        )
        if not has_wd_id:
            if 'identifiers' not in data:
                data['identifiers'] = []
            data['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': matched_wd['qid'],
                'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}",
                'identifier_source': 'wikidata_fuzzy_match'
            })

        # Update provenance notes
        if 'provenance' not in data:
            data['provenance'] = {}
        if 'notes' not in data['provenance']:
            data['provenance']['notes'] = []
        data['provenance']['notes'].append(
            f"Wikidata fuzzy match enrichment {timestamp}: "
            f"Matched to {matched_wd['qid']} ({matched_wd['label']}) "
            f"with {confidence:.1f}% confidence"
        )

        # Save file
        if save_enriched_file(filepath, data):
            saved += 1
        else:
            errors += 1

    # Final summary
    print()
    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f" Files scanned: {len(files_to_enrich)}")
    print(f" Matched: {matched} ({matched/len(files_to_enrich)*100:.1f}%)")
    print(f" High confidence (>=95%): {high_confidence}")
    print(f" Low confidence (<95%): {low_confidence}")
    if not dry_run:
        print(f" Saved: {saved}")
        print(f" Errors: {errors}")
    else:
        print(" [DRY RUN - no files modified]")
    print()


def main():
    """Parse CLI arguments and run the enrichment workflow."""
    parser = argparse.ArgumentParser(
        description="Enrich Swiss custodian files with Wikidata via fuzzy matching"
    )
    parser.add_argument(
        '--limit', '-l',
        type=int,
        default=None,
        help='Limit number of files to process (for testing)'
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Show what would be matched without saving'
    )
    parser.add_argument(
        '--threshold', '-t',
        type=float,
        default=DEFAULT_THRESHOLD,
        help=f'Minimum similarity threshold (default: {DEFAULT_THRESHOLD})'
    )

    args = parser.parse_args()

    enrich_with_wikidata(
        limit=args.limit,
        dry_run=args.dry_run,
        threshold=args.threshold
    )


if __name__ == '__main__':
    main()