#!/usr/bin/env python3
"""
Analyze Brazilian institutions for Batch 13 Wikidata enrichment.

Identifies institutions without Wikidata Q-numbers and prioritizes them
for enrichment based on institutional significance and metadata completeness.
"""
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import yaml
def load_institutions(yaml_path: str) -> List[Dict[str, Any]]:
    """Load all institutions from the YAML file.

    Args:
        yaml_path: Path to a YAML file containing a list of institution records.

    Returns:
        The list of institution dicts, or an empty list when the file holds an
        empty document. (``yaml.safe_load`` returns ``None`` for an empty file,
        which previously made the ``len()`` call below raise TypeError.)
    """
    print(f"Loading institutions from {yaml_path}...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []
    print(f"Loaded {len(institutions)} total institutions")
    return institutions
def extract_brazilian_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return every institution that lists at least one location in Brazil ('BR')."""
    # `or []` keeps the inner scan safe when 'locations' is present but null.
    brazilian = [
        record
        for record in institutions
        if any(loc.get('country') == 'BR' for loc in record.get('locations') or [])
    ]
    print(f"Found {len(brazilian)} Brazilian institutions")
    return brazilian
def has_wikidata(institution: Dict[str, Any]) -> bool:
    """Check if institution has a Wikidata Q-number."""
    # True iff some 'Wikidata' identifier carries a non-blank value that
    # starts with 'Q'; everything else (missing, empty, other schemes) is False.
    wikidata_values = (
        ident.get('identifier_value', '')
        for ident in institution.get('identifiers', [])
        if ident.get('identifier_scheme') == 'Wikidata'
    )
    return any(
        value and value.strip() and value.startswith('Q')
        for value in wikidata_values
    )
def get_location_info(institution: Dict[str, Any]) -> Dict[str, str]:
    """Extract city and state from the institution's first location.

    Falls back to 'Unknown' when the locations list is absent, null, or empty,
    and also when a field is present but null in the YAML — the previous
    ``loc.get(key, 'Unknown')`` leaked ``None`` through in that case, breaking
    the contract that both values are strings.

    Returns:
        A dict with 'city' and 'state' keys; values are never None.
    """
    locations = institution.get('locations') or []
    if not locations:
        return {'city': 'Unknown', 'state': 'Unknown'}

    loc = locations[0]
    return {
        'city': loc.get('city') or 'Unknown',
        'state': loc.get('region') or 'Unknown',
    }
def calculate_priority_score(institution: Dict[str, Any]) -> int:
    """
    Calculate enrichment priority score (higher = better candidate).

    Scoring criteria:
    - State/national institution: +50 points
    - University: +40 points
    - Major municipal museum: +30 points
    - Has description: +20 points
    - Has complete location (city + state): +10 points
    - Has website: +10 points
    - Generic/common name: -30 points
    """
    score = 0
    # `or ''` guards against explicit nulls in the YAML, which previously made
    # .lower() raise on None.
    name = (institution.get('name') or '').lower()
    description = institution.get('description') or ''
    inst_type = institution.get('institution_type') or ''

    # Institutional significance
    if any(term in name for term in ('nacional', 'federal', 'estadual', 'estado')):
        score += 50

    if inst_type == 'UNIVERSITY' or 'universidade' in name:
        score += 40

    if 'museu' in name and ('municipal' in name or 'histórico' in name):
        score += 30

    # Metadata completeness
    if description and len(description) > 50:
        score += 20

    # Complete location: the first listed location names both a city and a
    # region. Inlined (rather than calling get_location_info) so that an
    # explicitly null field counts as missing instead of slipping past a
    # `!= 'Unknown'` comparison as None.
    locations = institution.get('locations') or []
    first_loc = locations[0] if locations else {}
    if first_loc.get('city') and first_loc.get('region'):
        score += 10

    identifiers = institution.get('identifiers') or []
    if any(i.get('identifier_scheme') == 'Website' for i in identifiers):
        score += 10

    # Penalize generic names: a common GLAM term with no distinguishing words.
    generic_terms = ('arquivo', 'biblioteca', 'museu', 'centro')
    if any(term in name for term in generic_terms) and len(name.split()) <= 2:
        score -= 30

    return score
def categorize_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group institutions into a plain dict keyed by their institution type."""
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for record in institutions:
        # Records lacking a type are binned together under 'UNKNOWN'.
        grouped.setdefault(record.get('institution_type', 'UNKNOWN'), []).append(record)
    return grouped
def analyze_candidates(brazilian_institutions: List[Dict[str, Any]]):
    """Analyze Brazilian institutions and report Wikidata enrichment candidates.

    Prints coverage statistics, candidate counts by type, a ranked top-15
    table, and per-state counts, then writes the full scored candidate list
    to ``data/instances/brazil/batch13_candidates.yaml``.

    Args:
        brazilian_institutions: Institution records already filtered to Brazil.

    Returns:
        The full candidate list sorted by descending priority score; an empty
        list when no institutions were supplied.
    """
    # Guard: the percentage lines below divide by the total count, which
    # previously raised ZeroDivisionError on an empty input.
    if not brazilian_institutions:
        print("No Brazilian institutions to analyze.")
        return []

    # Separate institutions with/without Wikidata
    with_wikidata = [inst for inst in brazilian_institutions if has_wikidata(inst)]
    without_wikidata = [inst for inst in brazilian_institutions if not has_wikidata(inst)]

    total = len(brazilian_institutions)
    print("\n" + "="*80)
    print("BATCH 13 CANDIDATE ANALYSIS")
    print("="*80)
    print(f"\nTotal Brazilian institutions: {total}")
    print(f"With Wikidata Q-numbers: {len(with_wikidata)} ({len(with_wikidata)/total*100:.1f}%)")
    print(f"Without Wikidata Q-numbers: {len(without_wikidata)} ({len(without_wikidata)/total*100:.1f}%)")

    # Categorize candidates without Wikidata
    categories = categorize_institutions(without_wikidata)

    print("\n" + "-"*80)
    print("CANDIDATES BY TYPE")
    print("-"*80)
    for inst_type, insts in sorted(categories.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"{inst_type}: {len(insts)}")

    # Calculate priority scores
    scored_candidates = []
    for inst in without_wikidata:
        score = calculate_priority_score(inst)
        loc_info = get_location_info(inst)
        scored_candidates.append({
            'institution': inst,
            'score': score,
            'name': inst.get('name', 'Unknown'),
            'type': inst.get('institution_type', 'UNKNOWN'),
            'city': loc_info['city'],
            'state': loc_info['state']
        })

    # Sort by priority score
    scored_candidates.sort(key=lambda x: x['score'], reverse=True)

    # Print top 15 candidates
    print("\n" + "-"*80)
    print("TOP 15 ENRICHMENT CANDIDATES (by priority score)")
    print("-"*80)
    print(f"{'Rank':<6}{'Score':<8}{'Type':<12}{'Name':<40}{'Location':<25}")
    print("-"*80)

    for rank, candidate in enumerate(scored_candidates[:15], 1):
        # Truncate long cells so the fixed-width columns stay aligned.
        name = candidate['name'][:38] + '..' if len(candidate['name']) > 40 else candidate['name']
        location = f"{candidate['city']}, {candidate['state']}"
        location = location[:23] + '..' if len(location) > 25 else location
        print(f"{rank:<6}{candidate['score']:<8}{candidate['type']:<12}{name:<40}{location:<25}")

    # State distribution
    state_dist = defaultdict(int)
    for candidate in without_wikidata:
        loc_info = get_location_info(candidate)
        state_dist[loc_info['state']] += 1

    print("\n" + "-"*80)
    print("CANDIDATES BY STATE")
    print("-"*80)
    for state, count in sorted(state_dist.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{state}: {count}")

    # Save full candidate list to file
    output_path = Path(__file__).parent / 'data' / 'instances' / 'brazil' / 'batch13_candidates.yaml'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    candidate_data = []
    for candidate in scored_candidates:
        candidate_data.append({
            'name': candidate['name'],
            'type': candidate['type'],
            'city': candidate['city'],
            'state': candidate['state'],
            'priority_score': candidate['score'],
            # .get() so a record without an 'id' no longer aborts the whole run
            # with a KeyError.
            'id': candidate['institution'].get('id'),
            # `or ''` guards against an explicitly null description, which the
            # previous `[:200]` slice would crash on.
            'description': (candidate['institution'].get('description') or '')[:200]
        })

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(candidate_data, f, allow_unicode=True, sort_keys=False)

    print(f"\n✓ Full candidate list saved to: {output_path}")

    # Recommendations for Batch 13
    print("\n" + "="*80)
    print("BATCH 13 RECOMMENDATIONS")
    print("="*80)
    print("Target: 10-12 institutions (aiming for ~60-65% coverage)")
    print("\nRecommended focus:")
    print("  1. State museums (Museu Estadual de...)")
    print("  2. Federal universities without Q-numbers")
    print("  3. National institution branches (Biblioteca Nacional, IPHAN)")
    print("  4. Major municipal museums in capital cities")
    print("\nExpected success rate: 70-80% (7-10 successful enrichments)")
    print("Projected coverage after Batch 13: 61-65%")

    return scored_candidates
def main():
    """Main analysis workflow: load the dataset, filter to Brazil, analyze."""
    # Load dataset
    yaml_path = Path(__file__).parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'

    if not yaml_path.exists():
        print(f"Error: Dataset not found at {yaml_path}")
        return

    institutions = load_institutions(str(yaml_path))
    brazilian_institutions = extract_brazilian_institutions(institutions)

    # Analyze candidates (the returned list is not needed here, so the
    # previously unused local binding was dropped).
    analyze_candidates(brazilian_institutions)

    print("\n" + "="*80)
    print("Analysis complete. Ready to create enrichment script.")
    print("="*80)


if __name__ == '__main__':
    main()
|