glam/analyze_brazil_batch13_candidates.py
2025-11-19 23:25:22 +01:00

243 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Analyze Brazilian institutions for Batch 13 Wikidata enrichment.
Identifies institutions without Wikidata Q-numbers and prioritizes them
for enrichment based on institutional significance and metadata completeness.
"""
import yaml
from pathlib import Path
from typing import List, Dict, Any
from collections import defaultdict
def load_institutions(yaml_path: str) -> List[Dict[str, Any]]:
    """Parse the YAML dataset at *yaml_path* and return its list of institution records."""
    print(f"Loading institutions from {yaml_path}...")
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        records = yaml.safe_load(handle)
    print(f"Loaded {len(records)} total institutions")
    return records
def extract_brazilian_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the subset of institutions with at least one location in country 'BR'."""
    brazilian = [
        inst
        for inst in institutions
        # `or []` tolerates a missing or explicitly-null `locations` key.
        if any(loc.get('country') == 'BR' for loc in (inst.get('locations') or []))
    ]
    print(f"Found {len(brazilian)} Brazilian institutions")
    return brazilian
def has_wikidata(institution: Dict[str, Any]) -> bool:
    """Return True if the institution carries a usable Wikidata Q-number identifier."""
    for entry in institution.get('identifiers', []):
        if entry.get('identifier_scheme') != 'Wikidata':
            continue
        value = entry.get('identifier_value', '')
        # A usable value is truthy, non-blank, and looks like a Q-number.
        if value and value.strip() and value.startswith('Q'):
            return True
    return False
def get_location_info(institution: Dict[str, Any]) -> Dict[str, str]:
    """Extract city and state from the institution's first location.

    Falls back to 'Unknown' when the locations list is missing or empty, and
    also when a field is absent or explicitly null in the YAML: yaml.safe_load
    turns a bare `city:` into None, which `dict.get`'s default does not catch,
    so we use `or 'Unknown'` instead.
    """
    locations = institution.get('locations') or []
    if not locations:
        return {'city': 'Unknown', 'state': 'Unknown'}
    loc = locations[0]
    return {
        'city': loc.get('city') or 'Unknown',
        'state': loc.get('region') or 'Unknown',
    }
def calculate_priority_score(institution: Dict[str, Any]) -> int:
    """
    Calculate enrichment priority score (higher = better candidate).

    Scoring criteria:
    - State/national institution: +50 points
    - University: +40 points
    - Major municipal museum: +30 points
    - Has description (> 50 chars): +20 points
    - Has complete location (city + state): +10 points
    - Has website: +10 points
    - Generic/common name (<= 2 words): -30 points
    """
    score = 0
    # `or ''` guards against explicit nulls in the YAML: a record with
    # `name: null` would otherwise crash on .lower(), since dict.get's
    # default only applies when the key is absent entirely.
    name = (institution.get('name') or '').lower()
    description = institution.get('description') or ''
    inst_type = institution.get('institution_type') or ''
    # Institutional significance
    if any(term in name for term in ('nacional', 'federal', 'estadual', 'estado')):
        score += 50
    if inst_type == 'UNIVERSITY' or 'universidade' in name:
        score += 40
    if 'museu' in name and ('municipal' in name or 'histórico' in name):
        score += 30
    # Metadata completeness
    if len(description) > 50:
        score += 20
    loc_info = get_location_info(institution)
    if loc_info['city'] != 'Unknown' and loc_info['state'] != 'Unknown':
        score += 10
    identifiers = institution.get('identifiers') or []
    if any(i.get('identifier_scheme') == 'Website' for i in identifiers):
        score += 10
    # Penalize generic names: a short name built from a common term
    # (e.g. just "Museu Municipal") is hard to match on Wikidata.
    generic_terms = ('arquivo', 'biblioteca', 'museu', 'centro')
    if any(term in name for term in generic_terms) and len(name.split()) <= 2:
        score -= 30
    return score
def categorize_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group institutions into a plain dict keyed by their institution_type."""
    buckets: Dict[str, List[Dict[str, Any]]] = {}
    for record in institutions:
        # Records lacking a type are grouped under 'UNKNOWN'.
        bucket_key = record.get('institution_type', 'UNKNOWN')
        buckets.setdefault(bucket_key, []).append(record)
    return buckets
def analyze_candidates(brazilian_institutions: List[Dict[str, Any]]):
    """Analyze Brazilian institutions and report Wikidata enrichment candidates.

    Prints a coverage summary, per-type and per-state breakdowns, and the top
    15 candidates ranked by priority score; writes the full ranked list to
    data/instances/brazil/batch13_candidates.yaml next to this script.

    Returns the list of scored candidate dicts, sorted best-first.
    """
    total = len(brazilian_institutions)
    # Separate institutions with/without Wikidata
    with_wikidata = [inst for inst in brazilian_institutions if has_wikidata(inst)]
    without_wikidata = [inst for inst in brazilian_institutions if not has_wikidata(inst)]
    # Guard the percentage math against an empty dataset (ZeroDivisionError).
    with_pct = len(with_wikidata) / total * 100 if total else 0.0
    without_pct = len(without_wikidata) / total * 100 if total else 0.0
    print("\n" + "="*80)
    print("BATCH 13 CANDIDATE ANALYSIS")
    print("="*80)
    print(f"\nTotal Brazilian institutions: {total}")
    print(f"With Wikidata Q-numbers: {len(with_wikidata)} ({with_pct:.1f}%)")
    print(f"Without Wikidata Q-numbers: {len(without_wikidata)} ({without_pct:.1f}%)")
    # Categorize candidates without Wikidata
    categories = categorize_institutions(without_wikidata)
    print("\n" + "-"*80)
    print("CANDIDATES BY TYPE")
    print("-"*80)
    for inst_type, insts in sorted(categories.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"{inst_type}: {len(insts)}")
    # Calculate priority scores for every candidate lacking a Q-number
    scored_candidates = []
    for inst in without_wikidata:
        score = calculate_priority_score(inst)
        loc_info = get_location_info(inst)
        scored_candidates.append({
            'institution': inst,
            'score': score,
            'name': inst.get('name', 'Unknown'),
            'type': inst.get('institution_type', 'UNKNOWN'),
            'city': loc_info['city'],
            'state': loc_info['state']
        })
    # Sort by priority score, best candidates first
    scored_candidates.sort(key=lambda x: x['score'], reverse=True)
    # Print top 15 candidates
    print("\n" + "-"*80)
    print("TOP 15 ENRICHMENT CANDIDATES (by priority score)")
    print("-"*80)
    print(f"{'Rank':<6}{'Score':<8}{'Type':<12}{'Name':<40}{'Location':<25}")
    print("-"*80)
    for rank, candidate in enumerate(scored_candidates[:15], 1):
        # Truncate long names/locations so the table columns stay aligned
        name = candidate['name'][:38] + '..' if len(candidate['name']) > 40 else candidate['name']
        location = f"{candidate['city']}, {candidate['state']}"
        location = location[:23] + '..' if len(location) > 25 else location
        print(f"{rank:<6}{candidate['score']:<8}{candidate['type']:<12}{name:<40}{location:<25}")
    # State distribution (top 10 states only)
    state_dist = defaultdict(int)
    for candidate in without_wikidata:
        loc_info = get_location_info(candidate)
        state_dist[loc_info['state']] += 1
    print("\n" + "-"*80)
    print("CANDIDATES BY STATE")
    print("-"*80)
    for state, count in sorted(state_dist.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{state}: {count}")
    # Save full candidate list to file
    output_path = Path(__file__).parent / 'data' / 'instances' / 'brazil' / 'batch13_candidates.yaml'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    candidate_data = []
    for candidate in scored_candidates:
        candidate_data.append({
            'name': candidate['name'],
            'type': candidate['type'],
            'city': candidate['city'],
            'state': candidate['state'],
            'priority_score': candidate['score'],
            # .get: don't abort the whole report over one record missing 'id'
            'id': candidate['institution'].get('id', ''),
            'description': (candidate['institution'].get('description') or '')[:200]
        })
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(candidate_data, f, allow_unicode=True, sort_keys=False)
    print(f"\n✓ Full candidate list saved to: {output_path}")
    # Recommendations for Batch 13
    print("\n" + "="*80)
    print("BATCH 13 RECOMMENDATIONS")
    print("="*80)
    print("Target: 10-12 institutions (aiming for ~60-65% coverage)")
    print("\nRecommended focus:")
    print(" 1. State museums (Museu Estadual de...)")
    print(" 2. Federal universities without Q-numbers")
    print(" 3. National institution branches (Biblioteca Nacional, IPHAN)")
    print(" 4. Major municipal museums in capital cities")
    print("\nExpected success rate: 70-80% (7-10 successful enrichments)")
    print("Projected coverage after Batch 13: 61-65%")
    return scored_candidates
def main():
    """Main analysis workflow."""
    # The dataset is expected at a fixed path relative to this script.
    yaml_path = Path(__file__).parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    if not yaml_path.exists():
        print(f"Error: Dataset not found at {yaml_path}")
        return
    # Load everything, narrow to Brazil, then run the candidate analysis.
    all_institutions = load_institutions(str(yaml_path))
    brazilian_institutions = extract_brazilian_institutions(all_institutions)
    analyze_candidates(brazilian_institutions)
    print("\n" + "="*80)
    print("Analysis complete. Ready to create enrichment script.")
    print("="*80)
# Script entry point: run the analysis only when executed directly, not on import.
if __name__ == '__main__':
    main()