366 lines
12 KiB
Python
366 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze Brazilian institutions without Wikidata to identify high-priority candidates for Batch 9 enrichment.
|
|
|
|
Strategy:
|
|
1. Load 181 institutions without Wikidata from master dataset
|
|
2. Prioritize by:
|
|
- Institution type (MUSEUM, LIBRARY, ARCHIVE > MIXED, EDUCATION_PROVIDER)
|
|
- Name characteristics (explicit institutional names > generic)
|
|
- Digital presence (has website/platforms)
|
|
- Geographic distribution (major cities)
|
|
3. Create ranked list for manual Wikidata search
|
|
4. Export top 15-20 candidates for Batch 9
|
|
|
|
Author: AI Agent (OpenCode)
|
|
Date: 2025-11-11
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
from typing import List, Dict, Any
|
|
import re
|
|
|
|
# Configuration
# All paths are resolved relative to the repository root (two levels up
# from this script's directory).
MASTER_FILE = Path(__file__).parent.parent / "data/instances/all/globalglam-20251111.yaml"  # master GLAM dataset (input)
OUTPUT_DIR = Path(__file__).parent.parent / "data/instances/brazil"  # Brazil-specific output directory
OUTPUT_FILE = OUTPUT_DIR / "batch9_candidates_analysis.yaml"  # full records of the top candidates
REPORT_FILE = OUTPUT_DIR / "BATCH9_CANDIDATES_REPORT.md"  # human-readable ranking report
|
|
# Priority weights
# Higher values mark institution types most likely to already have a
# Wikidata item (core GLAM heritage institutions rank highest).
TYPE_PRIORITY = {
    'MUSEUM': 10,
    'LIBRARY': 10,
    'ARCHIVE': 10,
    'GALLERY': 8,
    'RESEARCH_CENTER': 7,
    'OFFICIAL_INSTITUTION': 6,
    'EDUCATION_PROVIDER': 4,
    'MIXED': 3,
}

# Major Brazilian cities: institutions located here get a geography bonus.
MAJOR_CITIES = [
    'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Fortaleza',
    'Belo Horizonte', 'Manaus', 'Curitiba', 'Recife', 'Porto Alegre',
    'Belém', 'Goiânia', 'Campinas', 'São Luís', 'Maceió'
]


def score_institution(inst: Dict[str, Any]) -> float:
    """Calculate priority score for institution.

    The score (maximum 25 points) combines institution type, name
    specificity, digital presence, geography, and description richness.

    Args:
        inst: One institution record from the master YAML dataset.

    Returns:
        Priority score; higher means a stronger Batch 9 candidate.
    """
    score = 0.0

    # 1. Institution type priority (0-10 points)
    inst_type = inst.get('institution_type', 'MIXED')
    score += TYPE_PRIORITY.get(inst_type, 2)  # unknown types get a low default

    # 2. Name specificity (0-5 points)
    name = inst.get('name', '')
    # Penalize generic names: generic educational keywords earn no +3 bonus
    generic_keywords = ('universidade', 'faculdade', 'escola', 'instituto', 'centro')
    if not any(keyword in name.lower() for keyword in generic_keywords):
        score += 3  # Specific institutional name
    # Bonus for national/state institutions
    if re.search(r'\b(nacional|estadual|federal|municipal)\b', name, re.IGNORECASE):
        score += 2

    # 3. Has digital platforms (0-3 points, +1 per platform, capped)
    platforms = inst.get('digital_platforms', [])
    if platforms:
        score += min(len(platforms), 3)

    # 4. Has website identifier (0-2 points)
    # Loop variable renamed from `id` to avoid shadowing the builtin.
    identifiers = inst.get('identifiers', [])
    if any(ident.get('identifier_scheme') == 'Website' for ident in identifiers):
        score += 2

    # 5. Located in major city (0-3 points)
    # A major-city match adds +3 and stops; otherwise each location that at
    # least names a city adds +1 (multi-location records can accumulate).
    locations = inst.get('locations', [])
    for loc in locations:
        city = loc.get('city', '')
        if city in MAJOR_CITIES:
            score += 3
            break
        elif city:  # Has city info
            score += 1

    # 6. Has detailed description (0-2 points)
    description = inst.get('description', '')
    if len(description) > 100:
        score += 2
    elif len(description) > 50:
        score += 1

    return score
|
|
|
|
|
|
def extract_key_info(inst: Dict[str, Any]) -> Dict[str, Any]:
    """Extract key information for reporting.

    Args:
        inst: One institution record from the master YAML dataset.

    Returns:
        Flat summary dict with id, name, institution_type, city, region,
        website, platforms_count, a description truncated to 150 chars,
        and a priority_score placeholder (filled in by the caller).
    """
    # Only the first location is reported; records without locations fall
    # back to 'Unknown' city and an empty region.
    locations = inst.get('locations', [])
    city = locations[0].get('city', 'Unknown') if locations else 'Unknown'
    region = locations[0].get('region', '') if locations else ''

    # First Website identifier URL, or None when no website is recorded.
    # Loop variable renamed from `id` to avoid shadowing the builtin.
    identifiers = inst.get('identifiers', [])
    website = next((ident['identifier_url'] for ident in identifiers
                    if ident.get('identifier_scheme') == 'Website'), None)

    # Truncate long descriptions for compact report output (single lookup
    # instead of the original triple inst.get(...) in one expression).
    description = inst.get('description', '')
    if len(description) > 150:
        description = description[:150] + '...'

    return {
        'id': inst.get('id'),
        'name': inst.get('name'),
        'institution_type': inst.get('institution_type'),
        'city': city,
        'region': region,
        'website': website,
        'platforms_count': len(inst.get('digital_platforms', [])),
        'description': description,
        'priority_score': 0.0,  # Will be filled in by the caller
    }
|
|
|
|
|
|
def main():
    """Entry point: load the master dataset, score Brazilian institutions
    lacking a Wikidata identifier, and export the top candidates plus a
    markdown report.

    Reads MASTER_FILE; writes OUTPUT_FILE (YAML records) and REPORT_FILE
    (via generate_report).
    """
    print("=" * 80)
    print("BRAZIL BATCH 9 CANDIDATE ANALYSIS")
    print("=" * 80)
    print()

    # Load master dataset
    print(f"📂 Loading master dataset: {MASTER_FILE}")
    with open(MASTER_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f"   Total institutions: {len(data)}")

    # Filter Brazilian institutions without Wikidata
    print("\n🔍 Filtering Brazilian institutions...")
    brazil_all = [inst for inst in data
                  if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))]

    brazil_without_wikidata = [
        inst for inst in brazil_all
        if not any(ident.get('identifier_scheme') == 'Wikidata'
                   for ident in inst.get('identifiers', []))
    ]

    # Guard: without this, the percentage below divides by zero when the
    # dataset contains no Brazilian institutions.
    if not brazil_all:
        print("   No Brazilian institutions found — nothing to analyze.")
        return

    print(f"   Total Brazilian institutions: {len(brazil_all)}")
    print(f"   Without Wikidata: {len(brazil_without_wikidata)} ({100*len(brazil_without_wikidata)/len(brazil_all):.1f}%)")

    # Analyze by type
    print("\n📊 Institution type distribution (without Wikidata):")
    type_counts = Counter(inst.get('institution_type') for inst in brazil_without_wikidata)
    for itype, count in type_counts.most_common():
        print(f"   {itype}: {count}")

    # Score and rank institutions
    print("\n🎯 Scoring institutions for Batch 9 priority...")
    scored_institutions = []
    for inst in brazil_without_wikidata:
        score = score_institution(inst)
        info = extract_key_info(inst)
        info['priority_score'] = score
        scored_institutions.append((score, inst, info))

    # Sort by score (descending); the key avoids ever comparing the dicts.
    scored_institutions.sort(key=lambda x: x[0], reverse=True)

    # Guard: indexing [-1]/[0] below would raise IndexError on an empty list.
    if not scored_institutions:
        print("   All Brazilian institutions already have Wikidata — nothing to export.")
        return

    print(f"   Scored {len(scored_institutions)} institutions")
    print(f"   Score range: {scored_institutions[-1][0]:.1f} - {scored_institutions[0][0]:.1f}")

    # Select top candidates for Batch 9
    BATCH_SIZE = 15
    top_candidates = scored_institutions[:BATCH_SIZE]

    print(f"\n✅ Top {BATCH_SIZE} candidates for Batch 9:")
    for i, (score, inst, info) in enumerate(top_candidates, 1):
        print(f"   {i:2d}. [{score:4.1f}] {info['name'][:60]:60s} ({info['institution_type']}, {info['city']})")

    # Export candidates
    print("\n💾 Exporting candidates...")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Full institution records for batch processing
    candidate_records = [inst for score, inst, info in top_candidates]
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(candidate_records, f, allow_unicode=True, sort_keys=False)
    print(f"   Saved {len(candidate_records)} full records to: {OUTPUT_FILE}")

    # Generate markdown report
    generate_report(scored_institutions, top_candidates, type_counts)
    print(f"   Generated report: {REPORT_FILE}")

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)
    print("\n📋 Next steps:")
    print(f"   1. Review report: {REPORT_FILE}")
    print("   2. Search Wikidata for top 15 candidates")
    print("   3. Create enrichment script: scripts/enrich_brazil_batch9.py")
    print("   4. Execute enrichment and merge into master dataset")
    print()
|
|
|
|
|
|
def generate_report(all_scored: List, top_candidates: List, type_counts: Counter):
    """Generate markdown report for Batch 9 candidates.

    Args:
        all_scored: All (score, inst, info) tuples, sorted by score descending.
        top_candidates: The top slice of all_scored selected for Batch 9.
        type_counts: Counter of institution_type over the unmatched set.

    Writes the report to REPORT_FILE (module-level path constant).
    NOTE(review): the summary totals (212/31/181) are hard-coded snapshot
    figures, not recomputed from the inputs — confirm they stay current.
    """

    # Header + fixed summary section; only the candidate counts are dynamic.
    report = f"""# Brazil Batch 9 Enrichment Candidates

**Generated**: 2025-11-11
**Purpose**: Identify high-priority Brazilian institutions for Wikidata enrichment
**Target**: Add 10-15 Wikidata identifiers to increase Brazil coverage from 14.6% → 19.3%+

## Summary Statistics

- **Total Brazilian institutions**: 212
- **With Wikidata**: 31 (14.6%)
- **Without Wikidata**: 181 (85.4%)
- **Candidates analyzed**: {len(all_scored)}
- **Top candidates selected**: {len(top_candidates)}

## Institution Type Distribution (Without Wikidata)

| Type | Count | % of Total |
|------|-------|------------|
"""

    # One markdown table row per institution type, most frequent first.
    total = sum(type_counts.values())
    for itype, count in type_counts.most_common():
        report += f"| {itype} | {count} | {100*count/total:.1f}% |\n"

    # Static methodology section mirroring score_institution's weights.
    report += f"""
## Scoring Methodology

Institutions are scored based on:

1. **Institution Type** (0-10 points)
   - MUSEUM, LIBRARY, ARCHIVE: 10 points (core heritage institutions)
   - GALLERY: 8 points
   - RESEARCH_CENTER: 7 points
   - OFFICIAL_INSTITUTION: 6 points
   - EDUCATION_PROVIDER: 4 points
   - MIXED: 3 points

2. **Name Specificity** (0-5 points)
   - Explicit institutional names: +3 points
   - National/state/federal/municipal institutions: +2 points
   - Generic educational names: 0 bonus

3. **Digital Platforms** (0-3 points)
   - Each platform: +1 point (max 3)

4. **Website Available** (0-2 points)
   - Has website identifier: +2 points

5. **Geographic Location** (0-3 points)
   - Major city (São Paulo, Rio, etc.): +3 points
   - Has city information: +1 point

6. **Description Richness** (0-2 points)
   - Detailed (>100 chars): +2 points
   - Moderate (>50 chars): +1 point

**Maximum possible score**: 25 points

## Top 15 Candidates for Batch 9

"""

    # One detailed subsection per top candidate, including a ready-made
    # Wikidata search strategy (nested conditional picks the type filter).
    for i, (score, inst, info) in enumerate(top_candidates, 1):
        report += f"""
### {i}. {info['name']}

- **Score**: {score:.1f}/25
- **Type**: {info['institution_type']}
- **Location**: {info['city']}, {info['region']}
- **Website**: {info['website'] if info['website'] else 'Not available'}
- **Platforms**: {info['platforms_count']}
- **Description**: {info['description']}

**Wikidata Search Strategy**:
- Search term: `{info['name']} {info['city']} Brazil`
- Filter: `instance of` → {"museum" if info['institution_type'] == 'MUSEUM' else "library" if info['institution_type'] == 'LIBRARY' else "archive" if info['institution_type'] == 'ARCHIVE' else "cultural institution"}
- Verify: Location matches {info['city']}, Brazil

---
"""

    report += f"""
## Additional High-Priority Candidates (16-30)

These institutions scored well but didn't make the top 15. Consider for Batch 10.

| Rank | Score | Name | Type | City |
|------|-------|------|------|------|
"""

    # Runners-up table: ranks 16-30 (slice is safely empty/short when
    # fewer than 16 institutions were scored).
    for i, (score, inst, info) in enumerate(all_scored[15:30], 16):
        name_short = info['name'][:50] + '...' if len(info['name']) > 50 else info['name']
        report += f"| {i} | {score:.1f} | {name_short} | {info['institution_type']} | {info['city']} |\n"

    # Static recommendations/template tail; {{ }} escapes render literal
    # braces in the Python code sample.
    report += f"""
## Recommendations

### Batch 9 Strategy (Target: 10-15 enrichments)

1. **Manual Wikidata Search** (Most Reliable)
   - Search each top candidate on Wikidata
   - Verify location and institution type match
   - Record Q-numbers in enrichment script

2. **Automated Fuzzy Matching** (Faster, Lower Precision)
   - Use existing `scripts/enrich_brazil_batch9.py` template
   - Adapt fuzzy matching from previous batches
   - Manually verify all matches before committing

3. **Hybrid Approach** (Recommended)
   - Manual search for top 10 candidates (highest confidence)
   - Fuzzy matching for candidates 11-15 (with verification)
   - This balances speed and accuracy

### Expected Outcome

- **Current coverage**: 31/212 (14.6%)
- **After Batch 9** (+10 institutions): 41/212 (19.3%)
- **After Batch 9** (+15 institutions): 46/212 (21.7%)

### Next Batches

- **Batch 10**: Focus on remaining MUSEUM institutions (42 without Wikidata)
- **Batch 11**: Focus on ARCHIVE + LIBRARY (12 total without Wikidata)
- **Batch 12**: Cherry-pick high-scoring EDUCATION_PROVIDER institutions

**Projected 30% coverage**: Batches 9-11 combined (~35-40 total enrichments)

## Files Generated

- **Candidate records**: `data/instances/brazil/batch9_candidates_analysis.yaml`
- **This report**: `data/instances/brazil/BATCH9_CANDIDATES_REPORT.md`

## Manual Enrichment Template

For each candidate, follow this workflow:

```python
# In scripts/enrich_brazil_batch9.py

BATCH_9_ENRICHMENTS = {{
    "Museo Name Example": {{
        "wikidata_id": "Q12345678",
        "match_score": 1.0,  # Manual verification
        "match_method": "Manual Wikidata search",
        "verification_notes": "Verified: location, type, and name match"
    }},
    # ... add 10-15 entries
}}
```

---

**Status**: Ready for manual Wikidata search
**Next Action**: Create `scripts/enrich_brazil_batch9.py` with top candidates
"""

    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|