#!/usr/bin/env python3 """ Chilean GLAM Institutions - Batch 20 Wikidata Enrichment FIRST PRODUCTION USE OF SCHEMA v0.2.2 enrichment_history This script demonstrates the NEW structured enrichment tracking: - Uses Provenance.enrichment_history (list of EnrichmentHistoryEntry) - Replaces unstructured provenance.notes with queryable metadata - Tracks enrichment_type, match_score, verified status - Aligns with PROV-O, ADMS, and Dublin Core ontologies Target: 19 unenriched institutions → 79% coverage goal Schema: v0.2.2 (enrichment_history structure) """ import yaml import time from pathlib import Path from datetime import datetime, timezone from typing import List, Dict, Optional from SPARQLWrapper import SPARQLWrapper, JSON # ============================================================================= # CONFIGURATION # ============================================================================= INPUT_FILE = Path('data/instances/chile/chilean_institutions_batch19_enriched.yaml') OUTPUT_FILE = Path('data/instances/chile/chilean_institutions_batch20_enriched.yaml') WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql" # Fuzzy match threshold (0.0-1.0) MIN_MATCH_SCORE = 0.75 # TEST MODE: Set to a number to limit enrichment (e.g., 3 for testing) # Set to None to process all unenriched institutions TEST_MODE_LIMIT = 3 # TEST MODE - Only process first 3 institutions # Rate limiting (seconds between queries) QUERY_DELAY = 2.0 # Respect Wikidata's rate limits (1 req/sec + buffer) # ============================================================================= # WIKIDATA SPARQL QUERIES # ============================================================================= def query_wikidata_by_name(institution_name: str, country: str = "Chile") -> Optional[Dict]: """ Query Wikidata for heritage institutions by name. Returns dict with Q-number, label, and additional metadata if found. """ # SPARQL query for Chilean heritage institutions query = f""" SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?founded WHERE {{ # Heritage institution types VALUES ?type {{ wd:Q33506 # Museum wd:Q7075 # Library wd:Q166118 # Archive wd:Q207694 # Art gallery wd:Q2668072 # Cultural institution }} ?item wdt:P31/wdt:P279* ?type . # Instance of (with subclasses) ?item wdt:P17 wd:Q298 . # Country: Chile # Optional identifiers OPTIONAL {{ ?item wdt:P214 ?viaf }} OPTIONAL {{ ?item wdt:P791 ?isil }} OPTIONAL {{ ?item wdt:P571 ?founded }} # Filter by name (case-insensitive) FILTER(CONTAINS(LCASE(?itemLabel), "{institution_name.lower()}")) SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }} }} LIMIT 5 """ sparql = SPARQLWrapper(WIKIDATA_ENDPOINT) sparql.setQuery(query) sparql.setReturnFormat(JSON) sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.2 (https://github.com/sct/glam)") try: results = sparql.query().convert() # Rate limiting - wait after each query time.sleep(QUERY_DELAY) if results['results']['bindings']: # Return first match result = results['results']['bindings'][0] q_number = result['item']['value'].split('/')[-1] label = result['itemLabel']['value'] description = result.get('itemDescription', {}).get('value', '') viaf = result.get('viaf', {}).get('value', None) isil = result.get('isil', {}).get('value', None) founded = result.get('founded', {}).get('value', None) return { 'q_number': q_number, 'wikidata_label': label, 'description': description, 'viaf': viaf, 'isil': isil, 'founded': founded } except Exception as e: print(f" ⚠️ SPARQL query failed: {e}") time.sleep(QUERY_DELAY) # Wait even on error return None # ============================================================================= # FUZZY MATCHING # ============================================================================= def calculate_match_score(name1: str, name2: str) -> float: """ Calculate fuzzy match score between two institution names. Uses simple token-based matching (can be enhanced with rapidfuzz). Returns score 0.0-1.0 """ from difflib import SequenceMatcher # Normalize n1 = name1.lower().strip() n2 = name2.lower().strip() # Direct comparison if n1 == n2: return 1.0 # Token-based matching matcher = SequenceMatcher(None, n1, n2) return matcher.ratio() # ============================================================================= # SCHEMA v0.2.2 ENRICHMENT FUNCTIONS # ============================================================================= def create_enrichment_entry( enrichment_type: str, enrichment_method: str, match_score: Optional[float] = None, verified: bool = False, enrichment_source: Optional[str] = None, enrichment_notes: Optional[str] = None ) -> Dict: """ Create EnrichmentHistoryEntry conforming to schema v0.2.2. Args: enrichment_type: EnrichmentTypeEnum value (e.g., "WIKIDATA_IDENTIFIER") enrichment_method: Description of enrichment method match_score: Fuzzy match confidence (0.0-1.0), null for manual verified: Whether manually verified (default False) enrichment_source: Source URL (e.g., https://www.wikidata.org) enrichment_notes: Additional notes about enrichment Returns: Dict conforming to EnrichmentHistoryEntry class """ entry = { 'enrichment_date': datetime.now(timezone.utc).isoformat(), 'enrichment_method': enrichment_method, 'enrichment_type': enrichment_type, 'verified': verified } if match_score is not None: entry['match_score'] = round(match_score, 3) if enrichment_source: entry['enrichment_source'] = enrichment_source if enrichment_notes: entry['enrichment_notes'] = enrichment_notes return entry def add_wikidata_identifier(inst: Dict, wikidata_data: Dict, match_score: float) -> bool: """ Add Wikidata identifier and enrichment_history entry to institution. Returns True if enrichment was added, False if already exists. """ # Check if already has Wikidata existing_ids = inst.get('identifiers', []) has_wikidata = any( id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in existing_ids ) if has_wikidata: print(f" ⚠️ Already has Wikidata identifier") return False # Add Wikidata identifier wikidata_id = { 'identifier_scheme': 'Wikidata', 'identifier_value': wikidata_data['q_number'], 'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_data['q_number']}" } if 'identifiers' not in inst: inst['identifiers'] = [] inst['identifiers'].append(wikidata_id) # Add VIAF if available if wikidata_data.get('viaf'): viaf_id = { 'identifier_scheme': 'VIAF', 'identifier_value': wikidata_data['viaf'], 'identifier_url': f"https://viaf.org/viaf/{wikidata_data['viaf']}" } inst['identifiers'].append(viaf_id) print(f" 📚 Added VIAF: {wikidata_data['viaf']}") # Create enrichment_history entry (SCHEMA v0.2.2) if 'provenance' not in inst: inst['provenance'] = {} if 'enrichment_history' not in inst['provenance']: inst['provenance']['enrichment_history'] = [] # Wikidata enrichment entry wikidata_entry = create_enrichment_entry( enrichment_type='WIKIDATA_IDENTIFIER', enrichment_method='Wikidata SPARQL query with fuzzy name matching', match_score=match_score, verified=False, # Automated enrichment enrichment_source='https://www.wikidata.org', enrichment_notes=f"Matched to '{wikidata_data['wikidata_label']}' (Q{wikidata_data['q_number']})" ) inst['provenance']['enrichment_history'].append(wikidata_entry) # If VIAF was added, create separate enrichment entry if wikidata_data.get('viaf'): viaf_entry = create_enrichment_entry( enrichment_type='VIAF_IDENTIFIER', enrichment_method='VIAF identifier extracted from Wikidata entity', match_score=None, # Derived data, not fuzzy matched verified=False, enrichment_source='https://viaf.org', enrichment_notes=f"Extracted from Wikidata Q{wikidata_data['q_number']}" ) inst['provenance']['enrichment_history'].append(viaf_entry) print(f" ✅ Added Wikidata: {wikidata_data['q_number']} ({wikidata_data['wikidata_label']})") print(f" 📊 Match score: {match_score:.3f}") return True # ============================================================================= # FILE I/O # ============================================================================= def load_yaml(file_path: Path) -> List[Dict]: """Load YAML file.""" with open(file_path, 'r', encoding='utf-8') as f: return yaml.safe_load(f) def save_yaml(data: List[Dict], file_path: Path) -> None: """Save data to YAML file.""" with open(file_path, 'w', encoding='utf-8') as f: yaml.dump( data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120, indent=2 ) # ============================================================================= # MAIN ENRICHMENT WORKFLOW # ============================================================================= def main(): print("=" * 80) print("CHILEAN GLAM INSTITUTIONS - BATCH 20 ENRICHMENT (SCHEMA v0.2.2)") print("=" * 80) print() print("🆕 FIRST PRODUCTION USE OF enrichment_history STRUCTURE") print(" Schema: v0.2.2 (structured provenance tracking)") print(" Target: 19 unenriched institutions") print() # Load data print(f"📖 Loading: {INPUT_FILE}") institutions = load_yaml(INPUT_FILE) print(f" Loaded {len(institutions)} institutions") print() # Find unenriched institutions unenriched = [ inst for inst in institutions if not any( id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in inst.get('identifiers', []) ) ] print(f"🔍 Found {len(unenriched)} institutions without Wikidata") print() if not unenriched: print("✅ All institutions already enriched!") return # Create backup backup_file = INPUT_FILE.with_suffix('.yaml.batch20_backup') print(f"💾 Creating backup: {backup_file}") save_yaml(institutions, backup_file) print() # Enrich institutions print(f"🔧 Starting Wikidata enrichment...") print() enriched_count = 0 skipped_count = 0 for i, inst in enumerate(unenriched, 1): inst_name = inst['name'] inst_city = inst.get('locations', [{}])[0].get('city', 'Unknown') print(f"{i}/{len(unenriched)}. {inst_name} ({inst_city})") # Query Wikidata wikidata_data = query_wikidata_by_name(inst_name) if not wikidata_data: print(f" ❌ No Wikidata match found") skipped_count += 1 print() continue # Calculate match score match_score = calculate_match_score(inst_name, wikidata_data['wikidata_label']) if match_score < MIN_MATCH_SCORE: print(f" ⚠️ Match score too low: {match_score:.3f} < {MIN_MATCH_SCORE}") print(f" 📝 Wikidata label: {wikidata_data['wikidata_label']}") skipped_count += 1 print() continue # Add enrichment if add_wikidata_identifier(inst, wikidata_data, match_score): enriched_count += 1 print() # Save enriched data print("=" * 80) print(f"💾 Saving enriched data to: {OUTPUT_FILE}") save_yaml(institutions, OUTPUT_FILE) print() # Statistics total_with_wikidata = sum( 1 for inst in institutions if any( id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in inst.get('identifiers', []) ) ) print("=" * 80) print("ENRICHMENT SUMMARY") print("=" * 80) print(f"✅ Newly enriched: {enriched_count}") print(f"⏭️ Skipped (no match or low confidence): {skipped_count}") print(f"📊 Total with Wikidata: {total_with_wikidata}/{len(institutions)} ({total_with_wikidata/len(institutions)*100:.1f}%)") print() print("🎯 Schema v0.2.2 Features Used:") print(" - enrichment_history (list of EnrichmentHistoryEntry)") print(" - enrichment_type: WIKIDATA_IDENTIFIER, VIAF_IDENTIFIER") print(" - match_score: Fuzzy matching confidence (0.0-1.0)") print(" - verified: false (automated enrichment)") print(" - enrichment_source: Wikidata and VIAF URLs") print() print("✅ DONE") if __name__ == '__main__': main()