#!/usr/bin/env python3
|
|
"""
|
|
Enrich Brazilian institutions - Batch 12
|
|
Adds Wikidata Q-numbers to 10 Brazilian institutions found via authenticated Wikidata search.
|
|
Focus: Federal universities and historical institutes.
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Batch 12 matches from Wikidata authenticated search.
# Keyed by the institution name/acronym as it appears in the dataset; each
# entry carries the Wikidata Q-number, the canonical Wikidata label, a manual
# match-confidence score (0-1), and free-text notes on how the match was made.
# NOTE: insertion order is preserved and determines processing/output order.
BATCH12_MATCHES = {
    # Federal Universities (EDUCATION_PROVIDER type)
    "UFPR": {
        "qid": "Q1232831",
        "label": "Universidade Federal do Paraná",
        "confidence": 0.95,
        "notes": "Federal University of Paraná - exact match"
    },
    "UFPE": {
        "qid": "Q2322256",
        "label": "Universidade Federal de Pernambuco",
        "confidence": 0.95,
        "notes": "Federal University of Pernambuco - exact match"
    },
    "UFPI": {
        "qid": "Q945699",
        "label": "Universidade Federal do Piauí",
        "confidence": 0.95,
        "notes": "Federal University of Piauí - exact match"
    },
    "UFRN": {
        "qid": "Q3847505",
        "label": "Universidade Federal do Rio Grande do Norte",
        "confidence": 0.95,
        "notes": "Federal University of Rio Grande do Norte - verified via SPARQL"
    },
    "UFRR": {
        "qid": "Q7894378",
        "label": "Universidade Federal de Roraima",
        "confidence": 0.95,
        "notes": "Federal University of Roraima - exact match"
    },
    "UFS": {
        "qid": "Q7894380",
        "label": "Universidade Federal de Sergipe",
        "confidence": 0.95,
        "notes": "Federal University of Sergipe - exact match"
    },
    "UFT": {
        "qid": "Q4481798",
        "label": "Fundação Universidade Federal do Tocantins",
        "confidence": 0.95,
        "notes": "Federal University of Tocantins foundation - exact match"
    },
    "UFAM": {
        "qid": "Q5440476",
        "label": "Universidade Federal do Amazonas",
        "confidence": 0.95,
        "notes": "Federal University of Amazonas - exact match"
    },

    # Historical Institutes & Museums
    "Instituto Histórico": {
        "qid": "Q108221092",
        "label": "Instituto Histórico e Geográfico de Mato Grosso",
        "confidence": 0.93,
        "notes": "Historical and Geographic Institute of Mato Grosso"
    },
    "UFMS": {
        "qid": "Q5440478",
        "label": "Universidade Federal de Mato Grosso do Sul",
        "confidence": 0.95,
        "notes": "Federal University of Mato Grosso do Sul - exact match"
    }
}
|
|
|
|
def find_institution_by_name(institutions, name):
    """Find an institution by exact or partial name match.

    Exact (case-insensitive) matches take priority over partial matches;
    entries with an empty/whitespace-only name are skipped in both passes.

    Args:
        institutions: List of institution dicts, each with an optional 'name'.
        name: Name or acronym to search for.

    Returns:
        (index, institution) for the first hit, or (None, None) if no match.
    """
    target = name.lower()

    # Pre-normalize once so both passes share the same cleaned names.
    candidates = [
        (pos, record, record.get('name', '').strip())
        for pos, record in enumerate(institutions)
    ]

    # Pass 1: exact match (case-insensitive).
    for pos, record, clean_name in candidates:
        if clean_name and clean_name.lower() == target:
            return pos, record

    # Pass 2: substring match in either direction, still requiring a name.
    for pos, record, clean_name in candidates:
        if not clean_name:
            continue
        lowered = clean_name.lower()
        if target in lowered or lowered in target:
            return pos, record

    return None, None
|
|
|
|
def add_wikidata_identifier(institution, qid, label, confidence, notes):
    """Add a Wikidata identifier and enrichment-history record to an institution.

    The institution dict is mutated in place. If it already has a Wikidata
    identifier, nothing is modified and False is returned — previously an
    enrichment-history entry was appended (and True returned) even when no
    identifier was added, creating a false provenance record.

    Args:
        institution: Mutable institution dict; 'identifiers' and 'provenance'
            are created if missing.
        qid: Wikidata Q-number (e.g. "Q1232831").
        label: Wikidata entity label, recorded in the enrichment notes.
        confidence: Match score (0-1) stored as 'match_score'.
        notes: Free-text notes describing the match.

    Returns:
        True if the identifier was added, False if one already existed.
    """
    identifiers = institution.setdefault('identifiers', [])

    # Skip (and record nothing) if a Wikidata ID is already present.
    # Loop variable renamed from `id`, which shadowed the builtin.
    if any(ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers):
        return False

    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
    })

    # Single timestamp so enrichment_date and last_updated agree exactly
    # (previously two separate datetime.now() calls could differ).
    now = datetime.now(timezone.utc).isoformat()

    provenance = institution.setdefault('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': now,
        'enrichment_type': 'WIKIDATA_IDENTIFIER',
        'enrichment_method': 'WIKIDATA_AUTHENTICATED_SEARCH',
        'match_score': confidence,
        'verified': True,
        'enrichment_source': 'https://www.wikidata.org',
        'enrichment_notes': f'Batch 12: {notes}. Wikidata label: {label}'
    })

    # Update last_updated timestamp
    provenance['last_updated'] = now

    return True
|
|
|
|
def main():
    """Run batch 12 enrichment: load the dataset, back it up, add Q-numbers, save.

    Side effects: overwrites the global dataset file in place, writes a
    pre-enrichment backup next to it, and writes a batch summary YAML file.
    """
    # Load global dataset
    input_file = Path('data/instances/all/globalglam-20251111.yaml')

    print("=" * 80)
    print("BATCH 12 ENRICHMENT - Brazilian Institutions")
    print("=" * 80)
    print(f"\nLoading: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Create backup BEFORE enrichment. add_wikidata_identifier mutates the
    # loaded dicts in place, so writing the backup after the loop (as before)
    # saved the already-enriched data instead of the original state.
    backup_file = input_file.parent / f"{input_file.stem}.batch12_backup"
    print(f"\n📦 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Track enrichments
    enriched = []
    not_found = []

    print("\n" + "=" * 80)
    print("PROCESSING BATCH 12 MATCHES")
    print("=" * 80)

    for name, data in BATCH12_MATCHES.items():
        print(f"\n🔍 Searching for: {name}")

        idx, inst = find_institution_by_name(institutions, name)

        if inst:
            # Verify it's Brazilian before attaching the Q-number
            locations = inst.get('locations', [])
            is_brazilian = any(loc.get('country') == 'BR' for loc in locations)

            if is_brazilian:
                success = add_wikidata_identifier(
                    inst,
                    data['qid'],
                    data['label'],
                    data['confidence'],
                    data['notes']
                )

                # success is False when the institution already had a Q-number;
                # such entries are intentionally skipped without being counted.
                if success:
                    print(f" ✅ ENRICHED: {inst.get('name')}")
                    print(f" Added Q-number: {data['qid']} ({data['label']})")
                    enriched.append({
                        'name': inst.get('name'),
                        'qid': data['qid'],
                        'label': data['label'],
                        'confidence': data['confidence']
                    })
            else:
                print(f" ⚠️ Found but not Brazilian: {inst.get('name')}")
                not_found.append(name)
        else:
            print(f" ❌ NOT FOUND in dataset")
            not_found.append(name)

    # Save enriched dataset (overwrites the input file in place)
    print(f"💾 Saving enriched dataset: {input_file}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Create batch 12 enriched file
    batch_file = Path('data/instances/brazil/batch12_enriched.yaml')
    batch_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"💾 Saving batch file: {batch_file}")
    with open(batch_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, sort_keys=False)

    # Generate statistics
    print("\n" + "=" * 80)
    print("BATCH 12 ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"\n✅ Successfully enriched: {len(enriched)}")
    print(f"❌ Not found/matched: {len(not_found)}")
    print(f"📊 Success rate: {len(enriched)/len(BATCH12_MATCHES)*100:.1f}%")

    if enriched:
        print("\n📋 Enriched institutions:")
        for item in enriched:
            print(f" • {item['name']} → {item['qid']} ({item['confidence']:.0%})")

    if not_found:
        print("\n⚠️ Not matched:")
        for name in not_found:
            print(f" • {name}")

    # Calculate new coverage over the Brazilian subset of the dataset
    def _is_brazilian(record):
        """True when any location of the record has country code 'BR'."""
        return any(loc.get('country') == 'BR' for loc in (record.get('locations') or []))

    def _has_wikidata(record):
        """True when the record already carries a Wikidata identifier."""
        return any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in (record.get('identifiers') or [])
        )

    brazil_total = sum(1 for i in institutions if _is_brazilian(i))
    brazil_with_q = sum(1 for i in institutions if _is_brazilian(i) and _has_wikidata(i))

    print("\n" + "=" * 80)
    print("OVERALL BRAZILIAN COVERAGE")
    print("=" * 80)
    print(f"Total Brazilian institutions: {brazil_total}")
    print(f"With Wikidata Q-numbers: {brazil_with_q}")
    # Guard against ZeroDivisionError if the dataset has no Brazilian entries.
    if brazil_total:
        print(f"Coverage: {brazil_with_q/brazil_total*100:.1f}%")
    print(f"Remaining to enrich: {brazil_total - brazil_with_q}")

    print("\n✅ Batch 12 enrichment complete!")
|
|
|
|
# Run only when executed as a script; importing the module has no side effects.
if __name__ == '__main__':
    main()
|