glam/analyze_brazil_batch13_candidates.py
2025-11-19 23:25:22 +01:00

243 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Analyze Brazilian institutions for Batch 13 Wikidata enrichment.
Identifies institutions without Wikidata Q-numbers and prioritizes them
for enrichment based on institutional significance and metadata completeness.
"""
import yaml
from pathlib import Path
from typing import List, Dict, Any
from collections import defaultdict
def load_institutions(yaml_path: str) -> List[Dict[str, Any]]:
    """Parse the YAML dataset at *yaml_path* and return its list of institution records."""
    print(f"Loading institutions from {yaml_path}...")
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        records = yaml.safe_load(handle)
    print(f"Loaded {len(records)} total institutions")
    return records
def extract_brazilian_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the subset of institutions with at least one location in country 'BR'."""
    brazilian = [
        inst
        for inst in institutions
        # `or []` tolerates a missing or explicitly-null `locations` key.
        if any(loc.get('country') == 'BR' for loc in (inst.get('locations') or []))
    ]
    print(f"Found {len(brazilian)} Brazilian institutions")
    return brazilian
def has_wikidata(institution: Dict[str, Any]) -> bool:
    """Return True if the institution carries a usable Wikidata Q-number identifier."""
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') != 'Wikidata':
            continue
        value = entry.get('identifier_value', '')
        # A usable value is truthy, non-blank, and looks like a Q-number.
        if value and value.strip() and value.startswith('Q'):
            return True
    return False
def get_location_info(institution: Dict[str, Any]) -> Dict[str, str]:
    """Extract city and state from the institution's first location.

    Falls back to 'Unknown' when the locations list is missing or empty, and
    also when a field is absent or explicitly null in the YAML: yaml.safe_load
    turns a bare `city:` into None, which `dict.get`'s default does not catch,
    so we use `or 'Unknown'` instead.
    """
    locations = institution.get('locations') or []
    if not locations:
        return {'city': 'Unknown', 'state': 'Unknown'}
    loc = locations[0]
    return {
        'city': loc.get('city') or 'Unknown',
        'state': loc.get('region') or 'Unknown',
    }
def calculate_priority_score(institution: Dict[str, Any]) -> int:
    """
    Calculate enrichment priority score (higher = better candidate).

    Scoring criteria:
    - State/national institution: +50 points
    - University: +40 points
    - Major municipal museum: +30 points
    - Has description (> 50 chars): +20 points
    - Has complete location (city + state): +10 points
    - Has website: +10 points
    - Generic/common name (<= 2 words): -30 points
    """
    score = 0
    # `or ''` guards against explicit nulls in the YAML: a record with
    # `name: null` would otherwise crash on .lower(), since dict.get's
    # default only applies when the key is absent entirely.
    name = (institution.get('name') or '').lower()
    description = institution.get('description') or ''
    inst_type = institution.get('institution_type') or ''
    # Institutional significance
    if any(term in name for term in ('nacional', 'federal', 'estadual', 'estado')):
        score += 50
    if inst_type == 'UNIVERSITY' or 'universidade' in name:
        score += 40
    if 'museu' in name and ('municipal' in name or 'histórico' in name):
        score += 30
    # Metadata completeness
    if len(description) > 50:
        score += 20
    loc_info = get_location_info(institution)
    if loc_info['city'] != 'Unknown' and loc_info['state'] != 'Unknown':
        score += 10
    identifiers = institution.get('identifiers') or []
    if any(i.get('identifier_scheme') == 'Website' for i in identifiers):
        score += 10
    # Penalize generic names: a short name built from a common term
    # (e.g. just "Museu Municipal") is hard to match on Wikidata.
    generic_terms = ('arquivo', 'biblioteca', 'museu', 'centro')
    if any(term in name for term in generic_terms) and len(name.split()) <= 2:
        score -= 30
    return score
def categorize_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group institutions into a plain dict keyed by their institution_type."""
    buckets: Dict[str, List[Dict[str, Any]]] = {}
    for record in institutions:
        # Records lacking a type are grouped under 'UNKNOWN'.
        bucket_key = record.get('institution_type', 'UNKNOWN')
        buckets.setdefault(bucket_key, []).append(record)
    return buckets
def analyze_candidates(brazilian_institutions: List[Dict[str, Any]]):
    """Analyze Brazilian institutions and report Wikidata enrichment candidates.

    Prints a coverage summary, per-type and per-state breakdowns, and the top
    15 candidates ranked by priority score; writes the full ranked list to
    data/instances/brazil/batch13_candidates.yaml next to this script.

    Returns the list of scored candidate dicts, sorted best-first.
    """
    total = len(brazilian_institutions)
    # Separate institutions with/without Wikidata
    with_wikidata = [inst for inst in brazilian_institutions if has_wikidata(inst)]
    without_wikidata = [inst for inst in brazilian_institutions if not has_wikidata(inst)]
    # Guard the percentage math against an empty dataset (ZeroDivisionError).
    with_pct = len(with_wikidata) / total * 100 if total else 0.0
    without_pct = len(without_wikidata) / total * 100 if total else 0.0
    print("\n" + "="*80)
    print("BATCH 13 CANDIDATE ANALYSIS")
    print("="*80)
    print(f"\nTotal Brazilian institutions: {total}")
    print(f"With Wikidata Q-numbers: {len(with_wikidata)} ({with_pct:.1f}%)")
    print(f"Without Wikidata Q-numbers: {len(without_wikidata)} ({without_pct:.1f}%)")
    # Categorize candidates without Wikidata
    categories = categorize_institutions(without_wikidata)
    print("\n" + "-"*80)
    print("CANDIDATES BY TYPE")
    print("-"*80)
    for inst_type, insts in sorted(categories.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"{inst_type}: {len(insts)}")
    # Calculate priority scores for every candidate lacking a Q-number
    scored_candidates = []
    for inst in without_wikidata:
        score = calculate_priority_score(inst)
        loc_info = get_location_info(inst)
        scored_candidates.append({
            'institution': inst,
            'score': score,
            'name': inst.get('name', 'Unknown'),
            'type': inst.get('institution_type', 'UNKNOWN'),
            'city': loc_info['city'],
            'state': loc_info['state']
        })
    # Sort by priority score, best candidates first
    scored_candidates.sort(key=lambda x: x['score'], reverse=True)
    # Print top 15 candidates
    print("\n" + "-"*80)
    print("TOP 15 ENRICHMENT CANDIDATES (by priority score)")
    print("-"*80)
    print(f"{'Rank':<6}{'Score':<8}{'Type':<12}{'Name':<40}{'Location':<25}")
    print("-"*80)
    for rank, candidate in enumerate(scored_candidates[:15], 1):
        # Truncate long names/locations so the table columns stay aligned
        name = candidate['name'][:38] + '..' if len(candidate['name']) > 40 else candidate['name']
        location = f"{candidate['city']}, {candidate['state']}"
        location = location[:23] + '..' if len(location) > 25 else location
        print(f"{rank:<6}{candidate['score']:<8}{candidate['type']:<12}{name:<40}{location:<25}")
    # State distribution (top 10 states only)
    state_dist = defaultdict(int)
    for candidate in without_wikidata:
        loc_info = get_location_info(candidate)
        state_dist[loc_info['state']] += 1
    print("\n" + "-"*80)
    print("CANDIDATES BY STATE")
    print("-"*80)
    for state, count in sorted(state_dist.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{state}: {count}")
    # Save full candidate list to file
    output_path = Path(__file__).parent / 'data' / 'instances' / 'brazil' / 'batch13_candidates.yaml'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    candidate_data = []
    for candidate in scored_candidates:
        candidate_data.append({
            'name': candidate['name'],
            'type': candidate['type'],
            'city': candidate['city'],
            'state': candidate['state'],
            'priority_score': candidate['score'],
            # .get: don't abort the whole report over one record missing 'id'
            'id': candidate['institution'].get('id', ''),
            'description': (candidate['institution'].get('description') or '')[:200]
        })
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(candidate_data, f, allow_unicode=True, sort_keys=False)
    print(f"\n✓ Full candidate list saved to: {output_path}")
    # Recommendations for Batch 13
    print("\n" + "="*80)
    print("BATCH 13 RECOMMENDATIONS")
    print("="*80)
    print("Target: 10-12 institutions (aiming for ~60-65% coverage)")
    print("\nRecommended focus:")
    print(" 1. State museums (Museu Estadual de...)")
    print(" 2. Federal universities without Q-numbers")
    print(" 3. National institution branches (Biblioteca Nacional, IPHAN)")
    print(" 4. Major municipal museums in capital cities")
    print("\nExpected success rate: 70-80% (7-10 successful enrichments)")
    print("Projected coverage after Batch 13: 61-65%")
    return scored_candidates
def main():
    """Main analysis workflow."""
    # The dataset is expected at a fixed path relative to this script.
    yaml_path = Path(__file__).parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    if not yaml_path.exists():
        print(f"Error: Dataset not found at {yaml_path}")
        return
    # Load everything, narrow to Brazil, then run the candidate analysis.
    all_institutions = load_institutions(str(yaml_path))
    brazilian_institutions = extract_brazilian_institutions(all_institutions)
    analyze_candidates(brazilian_institutions)
    print("\n" + "="*80)
    print("Analysis complete. Ready to create enrichment script.")
    print("="*80)
# Script entry point: run the analysis only when executed directly, not on import.
if __name__ == '__main__':
    main()