glam/merge_brazil_batch13.py
2025-11-19 23:25:22 +01:00

153 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Merge Brazil Batch 13 enriched institutions back into unified global dataset.
This script merges 9 verified Wikidata Q-numbers from Batch 13.
"""
import yaml
from datetime import datetime, timezone
def merge_brazil_batch13():
    """Merge Brazil Batch 13 Wikidata enrichments into the unified dataset.

    Loads the unified global GLAM dataset and the Batch 13 enrichment file,
    adds a Wikidata identifier (plus a provenance record) to every matched
    institution that does not already have one, saves a date-stamped copy of
    the dataset, and prints a Brazil coverage report.

    Side effects: reads two YAML files under ``data/instances/`` and writes
    ``data/instances/all/globalglam-<YYYYMMDD>.yaml``.
    """
    print("=" * 80)
    print("🔀 Merging Brazil Batch 13 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    all_institutions = _load_yaml('data/instances/all/globalglam-20251111.yaml')
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 13 data
    print("\n📂 Loading Brazil Batch 13 enriched data...")
    br_batch13 = _load_yaml('data/instances/brazil/batch13_enriched.yaml')
    successful_matches = br_batch13['successful_matches']
    print(f" ✅ Loaded {len(successful_matches)} enriched institutions")

    enrichment_map = _build_enrichment_map(successful_matches)
    print(f" 📋 Enrichment map created for {len(enrichment_map)} institutions")

    # Capture the Brazil Wikidata coverage BEFORE merging so the success
    # message reports the true gain (the previous version hard-coded 67,
    # which is only correct for the single run it was written after).
    _, br_wikidata_before = _brazil_coverage(all_institutions)

    # Merge enriched data (mutates all_institutions in place)
    print("\n🔄 Merging enriched data...")
    merged_count, updated_count = _apply_enrichments(all_institutions, enrichment_map)

    # Report enrichment ids that never matched any institution in the dataset
    not_found_count = 0
    found_ids = {inst['id'] for inst in all_institutions}
    for inst_id in enrichment_map:
        if inst_id not in found_ids:
            not_found_count += 1
            print(f" ⚠️ Not found in dataset: {inst_id}")

    print(f"\n 📊 New Wikidata enrichments merged: {merged_count}")
    print(f" 📊 Already enriched (skipped): {updated_count}")
    print(f" 📊 Not found in main dataset: {not_found_count}")

    # Save unified dataset under today's (UTC) date stamp
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(" ✅ Saved")

    # Verify Brazil coverage after the merge
    br_institutions, br_with_wikidata = _brazil_coverage(all_institutions)
    total_br = len(br_institutions)
    print("\n" + "=" * 80)
    print("📊 VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {total_br}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    # Guard against an empty Brazil subset (previously a ZeroDivisionError)
    coverage_pct = (br_with_wikidata / total_br * 100) if total_br else 0.0
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Progress: {br_with_wikidata}/{total_br}")
    if merged_count > 0:
        print(f"\n✅ SUCCESS: {merged_count} new Wikidata enrichments merged!")
        print(f"🇧🇷 Brazil coverage improved: {br_wikidata_before} → {br_with_wikidata} institutions")
        gain_pct = ((br_with_wikidata - br_wikidata_before) / total_br * 100) if total_br else 0.0
        print(f" Coverage gain: {gain_pct:.1f}%")
    else:
        print("\n⚠️ No new enrichments to merge (already up to date)")
    print("\n")


def _load_yaml(path):
    """Load and return the parsed contents of the YAML file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _build_enrichment_map(successful_matches):
    """Map institution ids to their Wikidata enrichment details.

    Matches without an ``institution_id`` are skipped.
    """
    enrichment_map = {}
    for match in successful_matches:
        inst_id = match.get('institution_id')
        if inst_id:
            enrichment_map[inst_id] = {
                'qid': match['wikidata_qid'],
                'label': match['wikidata_label'],
                'description': match.get('wikidata_description', ''),
                'confidence': match['confidence'],
            }
    return enrichment_map


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    return any(
        idf.get('identifier_scheme') == 'Wikidata'
        for idf in inst.get('identifiers', [])
    )


def _apply_enrichments(all_institutions, enrichment_map):
    """Add Wikidata identifiers and provenance to matched institutions.

    Mutates *all_institutions* in place. Returns a ``(merged, skipped)``
    tuple: how many institutions gained a new Wikidata identifier, and how
    many already had one.
    """
    merged_count = 0
    updated_count = 0
    for inst in all_institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrichment = enrichment_map[inst_id]
        if _has_wikidata(inst):
            # Already enriched — report the existing Q-number and skip
            updated_count += 1
            existing_q = next(
                (idf.get('identifier_value') for idf in inst.get('identifiers', [])
                 if idf.get('identifier_scheme') == 'Wikidata'),
                'N/A'
            )
            print(f" ⏭️ Already has Wikidata: {inst['name']} ({existing_q})")
            continue
        # Add new Wikidata identifier
        inst.setdefault('identifiers', []).append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': enrichment['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}",
        })
        # Update provenance; a single timestamp keeps enrichment_history and
        # last_updated consistent (previously two separate now() calls)
        now_iso = datetime.now(timezone.utc).isoformat()
        provenance = inst.setdefault('provenance', {})
        provenance.setdefault('enrichment_history', []).append({
            'enrichment_date': now_iso,
            'enrichment_type': 'WIKIDATA_IDENTIFIER',
            'enrichment_method': 'AUTHENTICATED_SEARCH_BATCH13',
            'match_score': enrichment['confidence'],
            'verified': True,
            'enrichment_source': 'https://www.wikidata.org',
            'enrichment_notes': f"Batch 13: {enrichment['label']}",
        })
        provenance['last_updated'] = now_iso
        merged_count += 1
        print(f" ✅ Merged: {inst['name']} -> {enrichment['qid']}")
    return merged_count, updated_count


def _brazil_coverage(all_institutions):
    """Return ``(brazil_institutions, with_wikidata_count)``.

    An institution counts as Brazilian when any of its locations has
    ``country == 'BR'``.
    """
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    with_wikidata = sum(1 for inst in br_institutions if _has_wikidata(inst))
    return br_institutions, with_wikidata
# Script entry point: run the Batch 13 merge when executed directly.
if __name__ == '__main__':
    merge_brazil_batch13()