glam/archive/scripts/brazil/analyze_brazil_batch9_candidates.py
2025-11-19 23:25:22 +01:00

366 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Analyze Brazilian institutions without Wikidata to identify high-priority candidates for Batch 9 enrichment.
Strategy:
1. Load 181 institutions without Wikidata from master dataset
2. Prioritize by:
- Institution type (MUSEUM, LIBRARY, ARCHIVE > MIXED, EDUCATION_PROVIDER)
- Name characteristics (explicit institutional names > generic)
- Digital presence (has website/platforms)
- Geographic distribution (major cities)
3. Create ranked list for manual Wikidata search
4. Export top 15-20 candidates for Batch 9
Author: AI Agent (OpenCode)
Date: 2025-11-11
"""
import yaml
from pathlib import Path
from collections import Counter
from typing import List, Dict, Any
import re
# Configuration
# NOTE(review): paths resolve relative to this file via parent.parent —
# verify this matches the repository layout if the script is relocated.
MASTER_FILE = Path(__file__).parent.parent / "data/instances/all/globalglam-20251111.yaml"  # dated master snapshot (YAML list)
OUTPUT_DIR = Path(__file__).parent.parent / "data/instances/brazil"  # Brazil-specific output directory
OUTPUT_FILE = OUTPUT_DIR / "batch9_candidates_analysis.yaml"  # full records of top candidates
REPORT_FILE = OUTPUT_DIR / "BATCH9_CANDIDATES_REPORT.md"  # human-readable markdown report
# Priority weights: core heritage institutions (museums, libraries, archives)
# rank highest; generic/mixed providers lowest. Unknown types fall back to 2.
TYPE_PRIORITY = {
    'MUSEUM': 10,
    'LIBRARY': 10,
    'ARCHIVE': 10,
    'GALLERY': 8,
    'RESEARCH_CENTER': 7,
    'OFFICIAL_INSTITUTION': 6,
    'EDUCATION_PROVIDER': 4,
    'MIXED': 3,
}

# Cities used for the +3 geographic bonus (largest Brazilian metros).
MAJOR_CITIES = [
    'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Fortaleza',
    'Belo Horizonte', 'Manaus', 'Curitiba', 'Recife', 'Porto Alegre',
    'Belém', 'Goiânia', 'Campinas', 'São Luís', 'Maceió'
]


def score_institution(inst: Dict[str, Any]) -> float:
    """Calculate a priority score (0-25) for an institution record.

    Components:
      1. institution type weight (0-10)
      2. name specificity (0-5)
      3. digital platform count (0-3)
      4. website identifier present (0-2)
      5. geographic location (0-3)
      6. description richness (0-2)
    """
    score = 0.0

    # 1. Institution type priority (0-10 points); unknown types get 2.
    inst_type = inst.get('institution_type', 'MIXED')
    score += TYPE_PRIORITY.get(inst_type, 2)

    # 2. Name specificity (0-5 points).
    name = inst.get('name', '')
    # Penalize generic educational names.
    generic_keywords = ['universidade', 'faculdade', 'escola', 'instituto', 'centro']
    if not any(keyword in name.lower() for keyword in generic_keywords):
        score += 3  # Specific institutional name
    # Bonus for national/state/federal/municipal institutions.
    if re.search(r'\b(nacional|estadual|federal|municipal)\b', name, re.IGNORECASE):
        score += 2

    # 3. Digital platforms (0-3 points): +1 per platform, capped at 3.
    platforms = inst.get('digital_platforms', [])
    if platforms:
        score += min(len(platforms), 3)

    # 4. Website identifier present (0-2 points).
    identifiers = inst.get('identifiers', [])
    if any(ident.get('identifier_scheme') == 'Website' for ident in identifiers):
        score += 2

    # 5. Geographic location (0-3 points), awarded at most once:
    #    +3 if ANY location is in a major city, otherwise +1 if ANY
    #    location has city info. (Bug fix: previously the +1 branch had
    #    no break, so it accumulated per minor-city location and could
    #    exceed the documented 0-3 range.)
    cities = [loc.get('city', '') for loc in inst.get('locations', [])]
    if any(city in MAJOR_CITIES for city in cities):
        score += 3
    elif any(cities):
        score += 1

    # 6. Description richness (0-2 points).
    description = inst.get('description', '')
    if len(description) > 100:
        score += 2
    elif len(description) > 50:
        score += 1

    return score
def extract_key_info(inst: Dict[str, Any]) -> Dict[str, Any]:
    """Extract key information for reporting.

    Returns a flat dict (id, name, type, city, region, website,
    platforms_count, truncated description, priority_score placeholder).
    Only the first location is used for city/region.
    """
    locations = inst.get('locations', [])
    city = locations[0].get('city', 'Unknown') if locations else 'Unknown'
    region = locations[0].get('region', '') if locations else ''

    # Bug fix: use .get() for the URL so a Website identifier without an
    # 'identifier_url' key yields None instead of raising KeyError.
    identifiers = inst.get('identifiers', [])
    website = next((ident.get('identifier_url') for ident in identifiers
                    if ident.get('identifier_scheme') == 'Website'), None)

    # Truncate long descriptions for the report (single lookup, not three).
    description = inst.get('description', '')
    if len(description) > 150:
        description = description[:150] + '...'

    return {
        'id': inst.get('id'),
        'name': inst.get('name'),
        'institution_type': inst.get('institution_type'),
        'city': city,
        'region': region,
        'website': website,
        'platforms_count': len(inst.get('digital_platforms', [])),
        'description': description,
        'priority_score': 0.0,  # Will be filled in by the caller after scoring
    }
def main():
    """Load the master dataset, score Brazilian institutions lacking a
    Wikidata identifier, and export top candidates plus a markdown report."""
    print("=" * 80)
    print("BRAZIL BATCH 9 CANDIDATE ANALYSIS")
    print("=" * 80)
    print()

    # Load master dataset (a YAML list of institution records).
    print(f"📂 Loading master dataset: {MASTER_FILE}")
    with open(MASTER_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f" Total institutions: {len(data)}")

    # Filter Brazilian institutions without a Wikidata identifier.
    print("\n🔍 Filtering Brazilian institutions...")
    brazil_all = [inst for inst in data
                  if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))]
    brazil_without_wikidata = [
        inst for inst in brazil_all
        if not any(ident.get('identifier_scheme') == 'Wikidata'
                   for ident in inst.get('identifiers', []))
    ]
    print(f" Total Brazilian institutions: {len(brazil_all)}")
    # Guard: avoid ZeroDivisionError when the dataset has no Brazilian records.
    if brazil_all:
        print(f" Without Wikidata: {len(brazil_without_wikidata)} ({100*len(brazil_without_wikidata)/len(brazil_all):.1f}%)")
    else:
        print(" Without Wikidata: 0 (no Brazilian institutions found)")

    # Type distribution of the unmatched institutions.
    print("\n📊 Institution type distribution (without Wikidata):")
    type_counts = Counter([inst.get('institution_type') for inst in brazil_without_wikidata])
    for itype, count in type_counts.most_common():
        print(f" {itype}: {count}")

    # Score every unmatched institution and rank descending.
    print("\n🎯 Scoring institutions for Batch 9 priority...")
    scored_institutions = []
    for inst in brazil_without_wikidata:
        score = score_institution(inst)
        info = extract_key_info(inst)
        info['priority_score'] = score
        scored_institutions.append((score, inst, info))
    scored_institutions.sort(key=lambda x: x[0], reverse=True)
    print(f" Scored {len(scored_institutions)} institutions")
    # Guard: [-1]/[0] indexing would raise IndexError on an empty list.
    if scored_institutions:
        print(f" Score range: {scored_institutions[-1][0]:.1f} - {scored_institutions[0][0]:.1f}")

    # Select top candidates for Batch 9.
    BATCH_SIZE = 15
    top_candidates = scored_institutions[:BATCH_SIZE]
    print(f"\n✅ Top {BATCH_SIZE} candidates for Batch 9:")
    for i, (score, inst, info) in enumerate(top_candidates, 1):
        print(f" {i:2d}. [{score:4.1f}] {info['name'][:60]:60s} ({info['institution_type']}, {info['city']})")

    # Export full institution records for batch processing.
    print(f"\n💾 Exporting candidates...")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    candidate_records = [inst for score, inst, info in top_candidates]
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(candidate_records, f, allow_unicode=True, sort_keys=False)
    print(f" Saved {len(candidate_records)} full records to: {OUTPUT_FILE}")

    # Generate the human-readable markdown report.
    generate_report(scored_institutions, top_candidates, type_counts)
    print(f" Generated report: {REPORT_FILE}")

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)
    print(f"\n📋 Next steps:")
    print(f" 1. Review report: {REPORT_FILE}")
    print(f" 2. Search Wikidata for top 15 candidates")
    print(f" 3. Create enrichment script: scripts/enrich_brazil_batch9.py")
    print(f" 4. Execute enrichment and merge into master dataset")
    print()
def generate_report(all_scored: List, top_candidates: List, type_counts: Counter):
    """Generate markdown report for Batch 9 candidates.

    Args:
        all_scored: list of (score, institution, info) tuples, sorted descending.
        top_candidates: leading slice of all_scored selected for Batch 9.
        type_counts: Counter of institution types among unmatched institutions.

    Writes the assembled markdown to REPORT_FILE.
    """
    # NOTE(review): the headline statistics below (212 / 31 / 181, 14.6%,
    # and the generation date) are hard-coded from the 2025-11-11 snapshot —
    # confirm they still match the dataset before reusing this script.
    report = f"""# Brazil Batch 9 Enrichment Candidates
**Generated**: 2025-11-11
**Purpose**: Identify high-priority Brazilian institutions for Wikidata enrichment
**Target**: Add 10-15 Wikidata identifiers to increase Brazil coverage from 14.6% → 19.3%+
## Summary Statistics
- **Total Brazilian institutions**: 212
- **With Wikidata**: 31 (14.6%)
- **Without Wikidata**: 181 (85.4%)
- **Candidates analyzed**: {len(all_scored)}
- **Top candidates selected**: {len(top_candidates)}
## Institution Type Distribution (Without Wikidata)
| Type | Count | % of Total |
|------|-------|------------|
"""
    # One table row per institution type, most common first.
    total = sum(type_counts.values())
    for itype, count in type_counts.most_common():
        report += f"| {itype} | {count} | {100*count/total:.1f}% |\n"
    report += f"""
## Scoring Methodology
Institutions are scored based on:
1. **Institution Type** (0-10 points)
- MUSEUM, LIBRARY, ARCHIVE: 10 points (core heritage institutions)
- GALLERY: 8 points
- RESEARCH_CENTER: 7 points
- OFFICIAL_INSTITUTION: 6 points
- EDUCATION_PROVIDER: 4 points
- MIXED: 3 points
2. **Name Specificity** (0-5 points)
- Explicit institutional names: +3 points
- National/state/federal/municipal institutions: +2 points
- Generic educational names: 0 bonus
3. **Digital Platforms** (0-3 points)
- Each platform: +1 point (max 3)
4. **Website Available** (0-2 points)
- Has website identifier: +2 points
5. **Geographic Location** (0-3 points)
- Major city (São Paulo, Rio, etc.): +3 points
- Has city information: +1 point
6. **Description Richness** (0-2 points)
- Detailed (>100 chars): +2 points
- Moderate (>50 chars): +1 point
**Maximum possible score**: 25 points
## Top 15 Candidates for Batch 9
"""
    # One detailed section per top candidate, including a suggested
    # manual Wikidata search strategy for each.
    for i, (score, inst, info) in enumerate(top_candidates, 1):
        report += f"""
### {i}. {info['name']}
- **Score**: {score:.1f}/25
- **Type**: {info['institution_type']}
- **Location**: {info['city']}, {info['region']}
- **Website**: {info['website'] if info['website'] else 'Not available'}
- **Platforms**: {info['platforms_count']}
- **Description**: {info['description']}
**Wikidata Search Strategy**:
- Search term: `{info['name']} {info['city']} Brazil`
- Filter: `instance of` → {"museum" if info['institution_type'] == 'MUSEUM' else "library" if info['institution_type'] == 'LIBRARY' else "archive" if info['institution_type'] == 'ARCHIVE' else "cultural institution"}
- Verify: Location matches {info['city']}, Brazil
---
"""
    report += f"""
## Additional High-Priority Candidates (16-30)
These institutions scored well but didn't make the top 15. Consider for Batch 10.
| Rank | Score | Name | Type | City |
|------|-------|------|------|------|
"""
    # Ranks 16-30: compact runner-up table for a possible Batch 10.
    for i, (score, inst, info) in enumerate(all_scored[15:30], 16):
        name_short = info['name'][:50] + '...' if len(info['name']) > 50 else info['name']
        report += f"| {i} | {score:.1f} | {name_short} | {info['institution_type']} | {info['city']} |\n"
    report += f"""
## Recommendations
### Batch 9 Strategy (Target: 10-15 enrichments)
1. **Manual Wikidata Search** (Most Reliable)
- Search each top candidate on Wikidata
- Verify location and institution type match
- Record Q-numbers in enrichment script
2. **Automated Fuzzy Matching** (Faster, Lower Precision)
- Use existing `scripts/enrich_brazil_batch9.py` template
- Adapt fuzzy matching from previous batches
- Manually verify all matches before committing
3. **Hybrid Approach** (Recommended)
- Manual search for top 10 candidates (highest confidence)
- Fuzzy matching for candidates 11-15 (with verification)
- This balances speed and accuracy
### Expected Outcome
- **Current coverage**: 31/212 (14.6%)
- **After Batch 9** (+10 institutions): 41/212 (19.3%)
- **After Batch 9** (+15 institutions): 46/212 (21.7%)
### Next Batches
- **Batch 10**: Focus on remaining MUSEUM institutions (42 without Wikidata)
- **Batch 11**: Focus on ARCHIVE + LIBRARY (12 total without Wikidata)
- **Batch 12**: Cherry-pick high-scoring EDUCATION_PROVIDER institutions
**Projected 30% coverage**: Batches 9-11 combined (~35-40 total enrichments)
## Files Generated
- **Candidate records**: `data/instances/brazil/batch9_candidates_analysis.yaml`
- **This report**: `data/instances/brazil/BATCH9_CANDIDATES_REPORT.md`
## Manual Enrichment Template
For each candidate, follow this workflow:
```python
# In scripts/enrich_brazil_batch9.py
BATCH_9_ENRICHMENTS = {{
"Museo Name Example": {{
"wikidata_id": "Q12345678",
"match_score": 1.0, # Manual verification
"match_method": "Manual Wikidata search",
"verification_notes": "Verified: location, type, and name match"
}},
# ... add 10-15 entries
}}
```
---
**Status**: Ready for manual Wikidata search
**Next Action**: Create `scripts/enrich_brazil_batch9.py` with top candidates
"""
    # Write the assembled markdown report to disk.
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
# Entry-point guard: run the analysis only when executed as a script.
if __name__ == '__main__':
    main()