glam/archive/scripts/brazil/analyze_brazil_remaining.py

#!/usr/bin/env python3
"""
Analyze the 41 remaining Brazilian institutions without Wikidata identifiers.
This script assesses enrichment potential for Batch 17 decision-making.
"""
import math
import sys
from collections import defaultdict
from pathlib import Path

import yaml


def analyze_brazilian_institutions(yaml_path: Path):
"""Analyze Brazilian institutions without Wikidata identifiers."""
print("=" * 80)
print("BRAZILIAN ENRICHMENT BATCH 17 ANALYSIS")
print("=" * 80)
print()
# Load the dataset
print(f"📂 Loading dataset: {yaml_path.name}")
with open(yaml_path, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
# Filter Brazilian institutions
brazilian = [i for i in institutions if i.get('locations', [{}])[0].get('country') == 'BR']
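    # Example of the record shape this script assumes (hypothetical values,
    # inferred from the fields accessed below, not taken from the dataset):
    #   - name: Museu Exemplo
    #     institution_type: MUSEUM
    #     description: "..."
    #     locations:
    #       - country: BR
    #         city: São Paulo
    #     identifiers:
    #       - identifier_scheme: Wikidata
    #         identifier: Q00000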
    # Separate by Wikidata presence
    with_wikidata = []
    without_wikidata = []
    for inst in brazilian:
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in identifiers)
        if has_wikidata:
            with_wikidata.append(inst)
        else:
            without_wikidata.append(inst)

    # Summary statistics
    total = len(brazilian)
    enriched_count = len(with_wikidata)
    remaining_count = len(without_wikidata)
    coverage_pct = (enriched_count / total * 100) if total > 0 else 0
print(f"📊 Current Coverage:")
print(f" Total Brazilian institutions: {total}")
print(f" With Wikidata: {enriched_count} ({coverage_pct:.1f}%)")
print(f" Without Wikidata: {remaining_count} ({100-coverage_pct:.1f}%)")
print()
# 70% goal calculation
institutions_needed_70pct = int(total * 0.70) - enriched_count
print(f"🎯 70% Stretch Goal:")
print(f" Need {institutions_needed_70pct} more institutions to reach 70%")
print(f" Target: {int(total * 0.70)}/{total} institutions")
print()
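    # Sanity check for the arithmetic above (illustrative numbers, not from the
    # dataset): with total=120 and enriched_count=81, target_70pct = ceil(84.0)
    # = 84, so institutions_needed_70pct = max(0, 84 - 81) = 3.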
    # Analyze remaining institutions
    print("=" * 80)
    print(f"DETAILED ANALYSIS OF {remaining_count} REMAINING INSTITUTIONS")
    print("=" * 80)
    print()

    # Group by institution type
    by_type = defaultdict(list)
    for inst in without_wikidata:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        by_type[inst_type].append(inst)

    print("📋 Breakdown by Institution Type:")
    # 'insts' avoids shadowing the 'institutions' list loaded above
    for inst_type, insts in sorted(by_type.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"   {inst_type}: {len(insts)}")
    print()
    # Analyze enrichment potential
    print("=" * 80)
    print("ENRICHMENT POTENTIAL ANALYSIS")
    print("=" * 80)
    print()

    high_potential = []
    medium_potential = []
    low_potential = []
    for inst in without_wikidata:
        name = inst.get('name', 'Unnamed')
        desc = inst.get('description', '')
        inst_type = inst.get('institution_type', 'UNKNOWN')
        city = (inst.get('locations') or [{}])[0].get('city') or 'Unknown'

        # Scoring criteria: one point each, so score is in 0..3
        has_detailed_desc = len(desc) > 100
        has_location = city != 'Unknown'
        is_major_type = inst_type in ['MUSEUM', 'ARCHIVE', 'LIBRARY', 'OFFICIAL_INSTITUTION']
        score = sum([has_detailed_desc, has_location, is_major_type])

        entry = {
            'name': name,
            'type': inst_type,
            'city': city,
            'desc_length': len(desc),
            'score': score,
        }
        if score >= 2:
            high_potential.append(entry)
        elif score == 1:
            medium_potential.append(entry)
        else:
            low_potential.append(entry)
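    # Example of the scoring above (hypothetical records): a MUSEUM with a
    # 150-character description and a known city scores 3 → high potential;
    # a record of unknown type whose only usable field is its city scores 1
    # → medium potential.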
print(f"🟢 HIGH POTENTIAL ({len(high_potential)} institutions):")
print(f" (Detailed description + location + major type)")
for i, entry in enumerate(high_potential[:10], 1): # Show top 10
print(f" {i}. {entry['name']}")
print(f" Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
if len(high_potential) > 10:
print(f" ... and {len(high_potential) - 10} more")
print()
print(f"🟡 MEDIUM POTENTIAL ({len(medium_potential)} institutions):")
for i, entry in enumerate(medium_potential[:5], 1): # Show top 5
print(f" {i}. {entry['name']}")
print(f" Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
if len(medium_potential) > 5:
print(f" ... and {len(medium_potential) - 5} more")
print()
print(f"🔴 LOW POTENTIAL ({len(low_potential)} institutions):")
print(f" (Minimal metadata, difficult to match confidently)")
print()
    # Recommendation
    print("=" * 80)
    print("BATCH 17 RECOMMENDATION")
    print("=" * 80)
    print()

    if len(high_potential) >= institutions_needed_70pct:
        print("✅ PROCEED WITH BATCH 17")
        print(f"   Rationale: {len(high_potential)} high-potential candidates available")
        print(f"   Need only {institutions_needed_70pct} institutions to reach 70%")
        print("   Recommended approach: Target high-potential institutions first")
    elif len(high_potential) + len(medium_potential) >= institutions_needed_70pct:
        print("⚠️ CONDITIONAL BATCH 17")
        print(f"   Rationale: {len(high_potential)} high + {len(medium_potential)} medium potential")
        print("   May require more research and verification effort")
        print("   Risk: Medium-potential matches may be ambiguous")
    else:
        # Report the computed coverage rather than a hard-coded 67.5%
        print(f"❌ CONCLUDE AT {coverage_pct:.1f}%")
        print("   Rationale: Insufficient high-quality candidates")
        print("   Risk: Pursuing 70% may compromise data quality")
        print(f"   Current coverage ({coverage_pct:.1f}%) exceeds minimum goal (65%)")
    print()
    # Export high-potential candidates
    output_path = Path(__file__).parent.parent / 'reports' / 'brazil' / 'batch17_candidates.yaml'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump({
            'summary': {
                'total_candidates': len(high_potential),
                'institutions_needed_70pct': institutions_needed_70pct,
                'current_coverage': f"{coverage_pct:.1f}%",
            },
            'high_potential_candidates': high_potential,
        }, f, allow_unicode=True, sort_keys=False)
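    # With sort_keys=False the exported file keeps this order and looks roughly
    # like the following (illustrative values):
    #   summary:
    #     total_candidates: 12
    #     institutions_needed_70pct: 3
    #     current_coverage: "67.5%"
    #   high_potential_candidates:
    #     - name: Museu Exemplo
    #       type: MUSEUM
    #       city: São Paulo
    #       desc_length: 150
    #       score: 3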
print(f"📄 Candidate list exported: {output_path}")
print()


if __name__ == '__main__':
    yaml_path = Path(__file__).parent.parent / 'data' / 'instances' / 'all' / 'globalglam-20251111-batch16-fixed.yaml'
    if not yaml_path.exists():
        print(f"❌ Error: Dataset not found at {yaml_path}")
        sys.exit(1)
    analyze_brazilian_institutions(yaml_path)
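
# Typical invocation, assuming the repository layout implied by the header path:
#   python glam/archive/scripts/brazil/analyze_brazil_remaining.py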