#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 11

Adds Wikidata Q-numbers to 10 Brazilian institutions found via
authenticated Wikidata search.
"""

import yaml
from datetime import datetime, timezone
from pathlib import Path

# Batch 11 matches from Wikidata authenticated search
BATCH11_MATCHES = {
    # Universities (for repository institutions)
    "UFES Digital Libraries": {
        "qid": "Q10387830",
        "label": "Universidade Federal do Espírito Santo",
        "confidence": 0.90,
        "notes": "Parent university for UFES digital libraries/repositories"
    },
    "UFBA Repository": {
        "qid": "Q56695176",
        "label": "arquivo da Universidade Federal da Bahia",
        "confidence": 0.95,
        "notes": "Exact match - UFBA archive/repository"
    },
    "UFC Repository": {
        "qid": "Q2749558",
        "label": "Universidade Federal do Ceará",
        "confidence": 0.90,
        "notes": "Parent university for UFC repository"
    },
    "UFG Repositories": {
        "qid": "Q7894375",
        "label": "Universidade Federal de Goiás",
        "confidence": 0.90,
        "notes": "Parent university for UFG repositories"
    },
    "UFMA": {
        "qid": "Q5440477",
        "label": "Universidade Federal do Maranhão",
        "confidence": 0.92,
        "notes": "Chapadinha campus - parent university match"
    },
    "CEPAP-UNIFAP": {
        "qid": "Q7894381",
        "label": "Universidade Federal do Amapá",
        "confidence": 0.90,
        "notes": "Parent university UNIFAP for CEPAP research center"
    },
    # Museums & Cultural Sites
    "Museu Sacaca": {
        "qid": "Q10333626",
        "label": "Museu Sacaca",
        "confidence": 0.98,
        "notes": "Exact match - Centro de Pesquisas Museológicas Museu Sacaca"
    },
    "Serra da Barriga": {
        "qid": "Q10370333",
        "label": "Serra da Barriga",
        "confidence": 0.95,
        "notes": "Geographic feature with heritage significance (Quilombo dos Palmares)"
    },
    # Government Heritage Institutions
    "FPC/IPAC": {
        "qid": "Q10302963",
        "label": "Instituto do Patrimônio Artístico e Cultural da Bahia",
        "confidence": 0.93,
        "notes": "IPAC - Bahia state heritage preservation agency"
    },
    "State Archives": {
        "qid": "Q56692537",
        "label": "Arquivo Público do Estado do Espírito Santo",
        "confidence": 0.95,
        "notes": "State Archive of Espírito Santo - exact match"
    }
}


def find_institution_by_name(institutions, name):
    """Find institution by exact or partial name match."""
    for i, inst in enumerate(institutions):
        inst_name = inst.get('name', '').strip()
        # Skip empty names
        if not inst_name:
            continue
        # Exact match first (case-insensitive)
        if inst_name.lower() == name.lower():
            return i, inst

    # If no exact match, try partial match (but still require non-empty names)
    for i, inst in enumerate(institutions):
        inst_name = inst.get('name', '').strip()
        if inst_name and (name.lower() in inst_name.lower() or inst_name.lower() in name.lower()):
            return i, inst

    return None, None


def add_wikidata_identifier(institution, qid, label, confidence, notes):
    """Add Wikidata identifier to institution."""
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Check if Wikidata ID already exists
    has_wikidata = any(
        id.get('identifier_scheme') == 'Wikidata'
        for id in institution['identifiers']
    )

    if not has_wikidata:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
        })

    # Add enrichment history
    if 'provenance' not in institution:
        institution['provenance'] = {}
    if 'enrichment_history' not in institution['provenance']:
        institution['provenance']['enrichment_history'] = []

    institution['provenance']['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_type': 'WIKIDATA_IDENTIFIER',
        'enrichment_method': 'WIKIDATA_AUTHENTICATED_SEARCH',
        'match_score': confidence,
        'verified': True,
        'enrichment_source': 'https://www.wikidata.org',
        'enrichment_notes': f'Batch 11: {notes}. Wikidata label: {label}'
    })

    # Update last_updated timestamp
    institution['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()

    return True


def main():
    # Load global dataset
    input_file = Path('data/instances/all/globalglam-20251111.yaml')

    print("=" * 80)
    print("BATCH 11 ENRICHMENT - Brazilian Institutions")
    print("=" * 80)
    print(f"\nLoading: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Track enrichments
    enriched = []
    not_found = []

    print("\n" + "=" * 80)
    print("PROCESSING BATCH 11 MATCHES")
    print("=" * 80)

    for name, data in BATCH11_MATCHES.items():
        print(f"\n🔍 Searching for: {name}")

        idx, inst = find_institution_by_name(institutions, name)

        if inst:
            # Verify it's Brazilian
            locations = inst.get('locations', [])
            is_brazilian = any(loc.get('country') == 'BR' for loc in locations)

            if is_brazilian:
                success = add_wikidata_identifier(
                    inst, data['qid'], data['label'],
                    data['confidence'], data['notes']
                )
                if success:
                    print(f"   ✅ ENRICHED: {inst.get('name')}")
                    print(f"      Added Q-number: {data['qid']} ({data['label']})")
                    enriched.append({
                        'name': inst.get('name'),
                        'qid': data['qid'],
                        'label': data['label'],
                        'confidence': data['confidence']
                    })
            else:
                print(f"   ⚠️ Found but not Brazilian: {inst.get('name')}")
                not_found.append(name)
        else:
            print(f"   ❌ NOT FOUND in dataset")
            not_found.append(name)

    # Create backup
    backup_file = input_file.parent / f"{input_file.stem}.batch11_backup"
    print(f"\n📦 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Save enriched dataset
    print(f"💾 Saving enriched dataset: {input_file}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Create batch 11 enriched file
    batch_file = Path('data/instances/brazil/batch11_enriched.yaml')
    batch_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving batch file: {batch_file}")
    with open(batch_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, sort_keys=False)

    # Generate statistics
    print("\n" + "=" * 80)
    print("BATCH 11 ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"\n✅ Successfully enriched: {len(enriched)}")
    print(f"❌ Not found/matched: {len(not_found)}")
    print(f"📊 Success rate: {len(enriched)/len(BATCH11_MATCHES)*100:.1f}%")

    if enriched:
        print("\n📋 Enriched institutions:")
        for item in enriched:
            print(f"   • {item['name']} → {item['qid']} ({item['confidence']:.0%})")

    if not_found:
        print("\n⚠️ Not matched:")
        for name in not_found:
            print(f"   • {name}")

    # Calculate new coverage
    brazil_total = len([
        i for i in institutions
        if any(loc.get('country') == 'BR' for loc in (i.get('locations') or []))
    ])
    brazil_with_q = len([
        i for i in institutions
        if any(loc.get('country') == 'BR' for loc in (i.get('locations') or []))
        and any(id.get('identifier_scheme') == 'Wikidata' for id in (i.get('identifiers') or []))
    ])

    print("\n" + "=" * 80)
    print("OVERALL BRAZILIAN COVERAGE")
    print("=" * 80)
    print(f"Total Brazilian institutions: {brazil_total}")
    print(f"With Wikidata Q-numbers: {brazil_with_q}")
    print(f"Coverage: {brazil_with_q/brazil_total*100:.1f}%")
    print(f"Remaining to enrich: {brazil_total - brazil_with_q}")

    print("\n✅ Batch 11 enrichment complete!")


if __name__ == '__main__':
    main()