#!/usr/bin/env python3
"""
Analyze remaining 41 Brazilian institutions without Wikidata identifiers.

This script assesses enrichment potential for Batch 17 decision-making.
"""

import math
import sys
from collections import defaultdict
from pathlib import Path

import yaml


def analyze_brazilian_institutions(yaml_path: Path):
    """Analyze Brazilian institutions without Wikidata identifiers."""
    print("=" * 80)
    print("BRAZILIAN ENRICHMENT BATCH 17 ANALYSIS")
    print("=" * 80)
    print()

    # Load the dataset (an empty file makes safe_load return None, so fall
    # back to an empty list)
    print(f"📂 Loading dataset: {yaml_path.name}")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []

    # Filter Brazilian institutions. `or [{}]` guards against records whose
    # `locations` key is missing *or* an empty list (plain .get() with a
    # default only covers the missing-key case and would raise IndexError).
    brazilian = [
        i for i in institutions
        if (i.get('locations') or [{}])[0].get('country') == 'BR'
    ]

    # Separate by Wikidata presence
    with_wikidata = []
    without_wikidata = []
    for inst in brazilian:
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in identifiers
        )
        if has_wikidata:
            with_wikidata.append(inst)
        else:
            without_wikidata.append(inst)

    # Summary statistics
    total = len(brazilian)
    enriched_count = len(with_wikidata)
    remaining_count = len(without_wikidata)
    coverage_pct = (enriched_count / total * 100) if total > 0 else 0

    print("📊 Current Coverage:")
    print(f"   Total Brazilian institutions: {total}")
    print(f"   With Wikidata: {enriched_count} ({coverage_pct:.1f}%)")
    print(f"   Without Wikidata: {remaining_count} ({100 - coverage_pct:.1f}%)")
    print()

    # 70% goal calculation. Round the target *up*: truncating with int()
    # can leave the final count just under 70%.
    target_70pct = math.ceil(total * 0.70)
    institutions_needed_70pct = max(0, target_70pct - enriched_count)
    print("🎯 70% Stretch Goal:")
    print(f"   Need {institutions_needed_70pct} more institutions to reach 70%")
    print(f"   Target: {target_70pct}/{total} institutions")
    print()

    # Analyze remaining institutions
    print("=" * 80)
    print(f"DETAILED ANALYSIS OF {remaining_count} REMAINING INSTITUTIONS")
    print("=" * 80)
    print()

    # Group by institution type
    by_type = defaultdict(list)
    for inst in without_wikidata:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type].append(inst)

    print("📋 Breakdown by Institution Type:")
    for inst_type, insts in sorted(by_type.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"   {inst_type}: {len(insts)}")
    print()

    # Analyze enrichment potential
    print("=" * 80)
    print("ENRICHMENT POTENTIAL ANALYSIS")
    print("=" * 80)
    print()

    high_potential = []
    medium_potential = []
    low_potential = []

    for inst in without_wikidata:
        name = inst.get('name', 'Unnamed')
        desc = inst.get('description', '')
        inst_type = inst.get('institution_type', 'UNKNOWN')
        city = (inst.get('locations') or [{}])[0].get('city', 'Unknown')

        # Scoring criteria: one point each for a detailed description,
        # a known city, and a major institution type
        has_detailed_desc = len(desc) > 100
        has_location = city != 'Unknown'
        is_major_type = inst_type in ['MUSEUM', 'ARCHIVE', 'LIBRARY', 'OFFICIAL_INSTITUTION']
        score = sum([has_detailed_desc, has_location, is_major_type])

        entry = {
            'name': name,
            'type': inst_type,
            'city': city,
            'desc_length': len(desc),
            'score': score,
        }

        if score >= 2:
            high_potential.append(entry)
        elif score == 1:
            medium_potential.append(entry)
        else:
            low_potential.append(entry)

    print(f"🟢 HIGH POTENTIAL ({len(high_potential)} institutions):")
    print("   (Detailed description + location + major type)")
    for i, entry in enumerate(high_potential[:10], 1):  # Show top 10
        print(f"   {i}. {entry['name']}")
        print(f"      Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
    if len(high_potential) > 10:
        print(f"   ... and {len(high_potential) - 10} more")
    print()

    print(f"🟡 MEDIUM POTENTIAL ({len(medium_potential)} institutions):")
    for i, entry in enumerate(medium_potential[:5], 1):  # Show top 5
        print(f"   {i}. {entry['name']}")
        print(f"      Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
    if len(medium_potential) > 5:
        print(f"   ... and {len(medium_potential) - 5} more")
    print()

    print(f"🔴 LOW POTENTIAL ({len(low_potential)} institutions):")
    print("   (Minimal metadata, difficult to match confidently)")
    print()

    # Recommendation
    print("=" * 80)
    print("BATCH 17 RECOMMENDATION")
    print("=" * 80)
    print()

    if len(high_potential) >= institutions_needed_70pct:
        print("✅ PROCEED WITH BATCH 17")
        print(f"   Rationale: {len(high_potential)} high-potential candidates available")
        print(f"   Need only {institutions_needed_70pct} institutions to reach 70%")
        print("   Recommended approach: Target high-potential institutions first")
    elif len(high_potential) + len(medium_potential) >= institutions_needed_70pct:
        print("⚠️ CONDITIONAL BATCH 17")
        print(f"   Rationale: {len(high_potential)} high + {len(medium_potential)} medium potential")
        print("   May require more research and verification effort")
        print("   Risk: Medium-potential matches may be ambiguous")
    else:
        print("❌ CONCLUDE AT 67.5%")
        print("   Rationale: Insufficient high-quality candidates")
        print("   Risk: Pursuing 70% may compromise data quality")
        print("   Current coverage (67.5%) exceeds minimum goal (65%)")
    print()

    # Export high-potential candidates
    output_path = Path(__file__).parent.parent / 'reports' / 'brazil' / 'batch17_candidates.yaml'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump({
            'summary': {
                'total_candidates': len(high_potential),
                'institutions_needed_70pct': institutions_needed_70pct,
                'current_coverage': f"{coverage_pct:.1f}%",
            },
            'high_potential_candidates': high_potential,
        }, f, allow_unicode=True, sort_keys=False)

    print(f"📄 Candidate list exported: {output_path}")
    print()


if __name__ == '__main__':
    yaml_path = Path(__file__).parent.parent / 'data' / 'instances' / 'all' / 'globalglam-20251111-batch16-fixed.yaml'
    if not yaml_path.exists():
        print(f"❌ Error: Dataset not found at {yaml_path}")
        sys.exit(1)

    analyze_brazilian_institutions(yaml_path)
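
# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the YAML record shape this script
# assumes. The field names (`name`, `institution_type`, `description`,
# `locations`, `identifiers`, `identifier_scheme`) are exactly those read by
# the lookups above; the values below are hypothetical, not dataset entries.
#
# - name: Example Museum of Natural History
#   institution_type: MUSEUM
#   description: >
#     A long-form description; anything over 100 characters earns a point
#     in the enrichment-potential score.
#   locations:
#     - country: BR
#       city: São Paulo
#   identifiers:
#     - identifier_scheme: Wikidata
#       identifier: Q0000000  # hypothetical Q-id
# ---------------------------------------------------------------------------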