#!/usr/bin/env python3
"""
Analyze remaining 41 Brazilian institutions without Wikidata identifiers.

This script assesses enrichment potential for Batch 17 decision-making.
"""
import math
import sys
from collections import defaultdict
from pathlib import Path

import yaml
def analyze_brazilian_institutions(yaml_path: Path):
|
|
"""Analyze Brazilian institutions without Wikidata identifiers."""
|
|
|
|
print("=" * 80)
|
|
print("BRAZILIAN ENRICHMENT BATCH 17 ANALYSIS")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
# Load the dataset
|
|
print(f"📂 Loading dataset: {yaml_path.name}")
|
|
with open(yaml_path, 'r', encoding='utf-8') as f:
|
|
institutions = yaml.safe_load(f)
|
|
|
|
# Filter Brazilian institutions
|
|
brazilian = [i for i in institutions if i.get('locations', [{}])[0].get('country') == 'BR']
|
|
|
|
# Separate by Wikidata presence
|
|
with_wikidata = []
|
|
without_wikidata = []
|
|
|
|
for inst in brazilian:
|
|
identifiers = inst.get('identifiers', [])
|
|
has_wikidata = any(id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in identifiers)
|
|
|
|
if has_wikidata:
|
|
with_wikidata.append(inst)
|
|
else:
|
|
without_wikidata.append(inst)
|
|
|
|
# Summary statistics
|
|
total = len(brazilian)
|
|
enriched_count = len(with_wikidata)
|
|
remaining_count = len(without_wikidata)
|
|
coverage_pct = (enriched_count / total * 100) if total > 0 else 0
|
|
|
|
print(f"📊 Current Coverage:")
|
|
print(f" Total Brazilian institutions: {total}")
|
|
print(f" With Wikidata: {enriched_count} ({coverage_pct:.1f}%)")
|
|
print(f" Without Wikidata: {remaining_count} ({100-coverage_pct:.1f}%)")
|
|
print()
|
|
|
|
# 70% goal calculation
|
|
institutions_needed_70pct = int(total * 0.70) - enriched_count
|
|
print(f"🎯 70% Stretch Goal:")
|
|
print(f" Need {institutions_needed_70pct} more institutions to reach 70%")
|
|
print(f" Target: {int(total * 0.70)}/{total} institutions")
|
|
print()
|
|
|
|
# Analyze remaining institutions
|
|
print("=" * 80)
|
|
print(f"DETAILED ANALYSIS OF {remaining_count} REMAINING INSTITUTIONS")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
# Group by institution type
|
|
by_type = defaultdict(list)
|
|
for inst in without_wikidata:
|
|
inst_type = inst.get('institution_type', 'UNKNOWN')
|
|
by_type[inst_type].append(inst)
|
|
|
|
print("📋 Breakdown by Institution Type:")
|
|
for inst_type, institutions in sorted(by_type.items(), key=lambda x: len(x[1]), reverse=True):
|
|
print(f" {inst_type}: {len(institutions)}")
|
|
print()
|
|
|
|
# Analyze enrichment potential
|
|
print("=" * 80)
|
|
print("ENRICHMENT POTENTIAL ANALYSIS")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
high_potential = []
|
|
medium_potential = []
|
|
low_potential = []
|
|
|
|
for inst in without_wikidata:
|
|
name = inst.get('name', 'Unnamed')
|
|
desc = inst.get('description', '')
|
|
inst_type = inst.get('institution_type', 'UNKNOWN')
|
|
city = inst.get('locations', [{}])[0].get('city', 'Unknown')
|
|
|
|
# Scoring criteria
|
|
has_detailed_desc = len(desc) > 100
|
|
has_location = city != 'Unknown'
|
|
is_major_type = inst_type in ['MUSEUM', 'ARCHIVE', 'LIBRARY', 'OFFICIAL_INSTITUTION']
|
|
|
|
score = sum([has_detailed_desc, has_location, is_major_type])
|
|
|
|
entry = {
|
|
'name': name,
|
|
'type': inst_type,
|
|
'city': city,
|
|
'desc_length': len(desc),
|
|
'score': score
|
|
}
|
|
|
|
if score >= 2:
|
|
high_potential.append(entry)
|
|
elif score == 1:
|
|
medium_potential.append(entry)
|
|
else:
|
|
low_potential.append(entry)
|
|
|
|
print(f"🟢 HIGH POTENTIAL ({len(high_potential)} institutions):")
|
|
print(f" (Detailed description + location + major type)")
|
|
for i, entry in enumerate(high_potential[:10], 1): # Show top 10
|
|
print(f" {i}. {entry['name']}")
|
|
print(f" Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
|
|
if len(high_potential) > 10:
|
|
print(f" ... and {len(high_potential) - 10} more")
|
|
print()
|
|
|
|
print(f"🟡 MEDIUM POTENTIAL ({len(medium_potential)} institutions):")
|
|
for i, entry in enumerate(medium_potential[:5], 1): # Show top 5
|
|
print(f" {i}. {entry['name']}")
|
|
print(f" Type: {entry['type']} | City: {entry['city']} | Desc: {entry['desc_length']} chars")
|
|
if len(medium_potential) > 5:
|
|
print(f" ... and {len(medium_potential) - 5} more")
|
|
print()
|
|
|
|
print(f"🔴 LOW POTENTIAL ({len(low_potential)} institutions):")
|
|
print(f" (Minimal metadata, difficult to match confidently)")
|
|
print()
|
|
|
|
# Recommendation
|
|
print("=" * 80)
|
|
print("BATCH 17 RECOMMENDATION")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
if len(high_potential) >= institutions_needed_70pct:
|
|
print("✅ PROCEED WITH BATCH 17")
|
|
print(f" Rationale: {len(high_potential)} high-potential candidates available")
|
|
print(f" Need only {institutions_needed_70pct} institutions to reach 70%")
|
|
print(f" Recommended approach: Target high-potential institutions first")
|
|
elif len(high_potential) + len(medium_potential) >= institutions_needed_70pct:
|
|
print("⚠️ CONDITIONAL BATCH 17")
|
|
print(f" Rationale: {len(high_potential)} high + {len(medium_potential)} medium potential")
|
|
print(f" May require more research and verification effort")
|
|
print(f" Risk: Medium-potential matches may be ambiguous")
|
|
else:
|
|
print("❌ CONCLUDE AT 67.5%")
|
|
print(f" Rationale: Insufficient high-quality candidates")
|
|
print(f" Risk: Pursuing 70% may compromise data quality")
|
|
print(f" Current coverage (67.5%) exceeds minimum goal (65%)")
|
|
print()
|
|
|
|
# Export high-potential candidates
|
|
output_path = Path(__file__).parent.parent / 'reports' / 'brazil' / 'batch17_candidates.yaml'
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
yaml.dump({
|
|
'summary': {
|
|
'total_candidates': len(high_potential),
|
|
'institutions_needed_70pct': institutions_needed_70pct,
|
|
'current_coverage': f"{coverage_pct:.1f}%"
|
|
},
|
|
'high_potential_candidates': high_potential
|
|
}, f, allow_unicode=True, sort_keys=False)
|
|
|
|
print(f"📄 Candidate list exported: {output_path}")
|
|
print()
|
|
|
|
if __name__ == '__main__':
    # Default dataset location, resolved relative to this script's parent repo.
    dataset = (
        Path(__file__).parent.parent
        / 'data' / 'instances' / 'all'
        / 'globalglam-20251111-batch16-fixed.yaml'
    )

    # Fail fast with a clear message when the dataset file is missing.
    if not dataset.exists():
        print(f"❌ Error: Dataset not found at {dataset}")
        sys.exit(1)

    analyze_brazilian_institutions(dataset)