glam/enrich_brazil_batch11.py

#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 11
Adds Wikidata Q-numbers to 10 Brazilian institutions found via authenticated Wikidata search.
"""
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Batch 11 matches from Wikidata authenticated search
BATCH11_MATCHES = {
    # Universities (for repository institutions)
    "UFES Digital Libraries": {
        "qid": "Q10387830",
        "label": "Universidade Federal do Espírito Santo",
        "confidence": 0.90,
        "notes": "Parent university for UFES digital libraries/repositories"
    },
    "UFBA Repository": {
        "qid": "Q56695176",
        "label": "arquivo da Universidade Federal da Bahia",
        "confidence": 0.95,
        "notes": "Exact match - UFBA archive/repository"
    },
    "UFC Repository": {
        "qid": "Q2749558",
        "label": "Universidade Federal do Ceará",
        "confidence": 0.90,
        "notes": "Parent university for UFC repository"
    },
    "UFG Repositories": {
        "qid": "Q7894375",
        "label": "Universidade Federal de Goiás",
        "confidence": 0.90,
        "notes": "Parent university for UFG repositories"
    },
    "UFMA": {
        "qid": "Q5440477",
        "label": "Universidade Federal do Maranhão",
        "confidence": 0.92,
        "notes": "Chapadinha campus - parent university match"
    },
    "CEPAP-UNIFAP": {
        "qid": "Q7894381",
        "label": "Universidade Federal do Amapá",
        "confidence": 0.90,
        "notes": "Parent university UNIFAP for CEPAP research center"
    },
    # Museums & Cultural Sites
    "Museu Sacaca": {
        "qid": "Q10333626",
        "label": "Museu Sacaca",
        "confidence": 0.98,
        "notes": "Exact match - Centro de Pesquisas Museológicas Museu Sacaca"
    },
    "Serra da Barriga": {
        "qid": "Q10370333",
        "label": "Serra da Barriga",
        "confidence": 0.95,
        "notes": "Geographic feature with heritage significance (Quilombo dos Palmares)"
    },
    # Government Heritage Institutions
    "FPC/IPAC": {
        "qid": "Q10302963",
        "label": "Instituto do Patrimônio Artístico e Cultural da Bahia",
        "confidence": 0.93,
        "notes": "IPAC - Bahia state heritage preservation agency"
    },
    "State Archives": {
        "qid": "Q56692537",
        "label": "Arquivo Público do Estado do Espírito Santo",
        "confidence": 0.95,
        "notes": "State Archive of Espírito Santo - exact match"
    }
}


def find_institution_by_name(institutions, name):
    """Find institution by exact or partial name match."""
    for i, inst in enumerate(institutions):
        inst_name = inst.get('name', '').strip()
        # Skip empty names
        if not inst_name:
            continue
        # Exact match first (case-insensitive)
        if inst_name.lower() == name.lower():
            return i, inst
    # If no exact match, try partial match (but still require non-empty names)
    for i, inst in enumerate(institutions):
        inst_name = inst.get('name', '').strip()
        if inst_name and (name.lower() in inst_name.lower() or inst_name.lower() in name.lower()):
            return i, inst
    return None, None


def add_wikidata_identifier(institution, qid, label, confidence, notes):
    """Add Wikidata identifier to institution."""
    if 'identifiers' not in institution:
        institution['identifiers'] = []
    # Check if Wikidata ID already exists
    has_wikidata = any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in institution['identifiers']
    )
    if not has_wikidata:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
        })
    # Add enrichment history
    if 'provenance' not in institution:
        institution['provenance'] = {}
    if 'enrichment_history' not in institution['provenance']:
        institution['provenance']['enrichment_history'] = []
    institution['provenance']['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_type': 'WIKIDATA_IDENTIFIER',
        'enrichment_method': 'WIKIDATA_AUTHENTICATED_SEARCH',
        'match_score': confidence,
        'verified': True,
        'enrichment_source': 'https://www.wikidata.org',
        'enrichment_notes': f'Batch 11: {notes}. Wikidata label: {label}'
    })
    # Update last_updated timestamp
    institution['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
    return True


def main():
    # Load global dataset
    input_file = Path('data/instances/all/globalglam-20251111.yaml')
    print("=" * 80)
    print("BATCH 11 ENRICHMENT - Brazilian Institutions")
    print("=" * 80)
    print(f"\nLoading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"Loaded {len(institutions)} institutions")

    # Track enrichments
    enriched = []
    not_found = []

    print("\n" + "=" * 80)
    print("PROCESSING BATCH 11 MATCHES")
    print("=" * 80)
    for name, data in BATCH11_MATCHES.items():
        print(f"\n🔍 Searching for: {name}")
        idx, inst = find_institution_by_name(institutions, name)
        if inst:
            # Verify it's Brazilian
            locations = inst.get('locations', [])
            is_brazilian = any(loc.get('country') == 'BR' for loc in locations)
            if is_brazilian:
                success = add_wikidata_identifier(
                    inst,
                    data['qid'],
                    data['label'],
                    data['confidence'],
                    data['notes']
                )
                if success:
                    print(f" ✅ ENRICHED: {inst.get('name')}")
                    print(f" Added Q-number: {data['qid']} ({data['label']})")
                    enriched.append({
                        'name': inst.get('name'),
                        'qid': data['qid'],
                        'label': data['label'],
                        'confidence': data['confidence']
                    })
            else:
                print(f" ⚠️ Found but not Brazilian: {inst.get('name')}")
                not_found.append(name)
        else:
            print(" ❌ NOT FOUND in dataset")
            not_found.append(name)
    # Create a backup of the original file before it is overwritten
    backup_file = input_file.parent / f"{input_file.stem}.batch11_backup"
    print(f"\n📦 Creating backup: {backup_file}")
    shutil.copy2(input_file, backup_file)

    # Save enriched dataset
    print(f"💾 Saving enriched dataset: {input_file}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Create batch 11 enriched file
    batch_file = Path('data/instances/brazil/batch11_enriched.yaml')
    batch_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving batch file: {batch_file}")
    with open(batch_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, sort_keys=False)
    # Generate statistics
    print("\n" + "=" * 80)
    print("BATCH 11 ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"\n✅ Successfully enriched: {len(enriched)}")
    print(f"❌ Not found/matched: {len(not_found)}")
    print(f"📊 Success rate: {len(enriched) / len(BATCH11_MATCHES) * 100:.1f}%")
    if enriched:
        print("\n📋 Enriched institutions:")
        for item in enriched:
            print(f"  - {item['name']} → {item['qid']} ({item['confidence']:.0%})")
    if not_found:
        print("\n⚠️ Not matched:")
        for name in not_found:
            print(f"  - {name}")
    # Calculate new coverage
    brazil_total = len([
        i for i in institutions
        if any(loc.get('country') == 'BR' for loc in (i.get('locations') or []))
    ])
    brazil_with_q = len([
        i for i in institutions
        if any(loc.get('country') == 'BR' for loc in (i.get('locations') or []))
        and any(ident.get('identifier_scheme') == 'Wikidata' for ident in (i.get('identifiers') or []))
    ])

    print("\n" + "=" * 80)
    print("OVERALL BRAZILIAN COVERAGE")
    print("=" * 80)
    print(f"Total Brazilian institutions: {brazil_total}")
    print(f"With Wikidata Q-numbers: {brazil_with_q}")
    print(f"Coverage: {brazil_with_q / brazil_total * 100:.1f}%")
    print(f"Remaining to enrich: {brazil_total - brazil_with_q}")

    print("\n✅ Batch 11 enrichment complete!")


if __name__ == '__main__':
    main()