#!/usr/bin/env python3
"""
Merge Brazil batch 8 enriched institutions back into the unified global dataset.
This script merges the 2 missing Wikidata enrichments from batch 8.
"""
import yaml
from datetime import datetime, timezone


def merge_brazil_batch8():
    print("=" * 80)
    print("🔀 Merging Brazil batch 8 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 8 data
    print("\n📂 Loading Brazil batch 8 enriched data...")
    with open('data/instances/brazil/brazilian_institutions_batch8_enriched.yaml', 'r', encoding='utf-8') as f:
        br_batch8 = yaml.safe_load(f)
    print(f" ✅ Loaded {len(br_batch8)} enriched Brazil institutions")

    # Create lookup by ID URL for quick matching
    br_by_id = {inst['id']: inst for inst in br_batch8}
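
    # Merge policy (as implemented below): when a batch 8 record carries a
    # Wikidata identifier that the master record lacks, the whole master
    # record is replaced by the enriched copy, with only the ghcid* fields
    # carried over from the original.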
    # Merge enriched data
    print("\n🔄 Merging enriched data...")
    merged_count = 0
    updated_count = 0
    for i, inst in enumerate(all_institutions):
        inst_id = inst.get('id')
        if inst_id in br_by_id:
            enriched_inst = br_by_id[inst_id]

            # Check if this institution has Wikidata in batch 8 but not in master
            has_wd_enriched = any(
                idf.get('identifier_scheme') == 'Wikidata'
                for idf in enriched_inst.get('identifiers', [])
            )
            has_wd_original = any(
                idf.get('identifier_scheme') == 'Wikidata'
                for idf in inst.get('identifiers', [])
            )

            if has_wd_enriched and not has_wd_original:
                # This is a new enrichment - merge it
                enriched_copy = enriched_inst.copy()

                # Preserve GHCID fields from original if they exist
                for field in ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric'):
                    if field in inst:
                        enriched_copy[field] = inst[field]

                # Replace with enriched version
                all_institutions[i] = enriched_copy
                merged_count += 1

                # Extract Q-number for display
                q_num = next(
                    (idf.get('identifier_value') for idf in enriched_copy.get('identifiers', [])
                     if idf.get('identifier_scheme') == 'Wikidata'),
                    'N/A'
                )
                print(f" ✅ Merged: {inst['name']} -> {q_num}")
            elif has_wd_enriched and has_wd_original:
                # Already has Wikidata, might have other updates
                updated_count += 1

    print(f"\n 📊 New Wikidata enrichments merged: {merged_count}")
    print(f" 📊 Already enriched (skipped): {updated_count}")

    # Save unified dataset with timestamp
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved")

    # Verify Brazil coverage
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    br_with_wikidata = sum(
        1 for inst in br_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("📊 VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {len(br_institutions)}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    print(f"Coverage: {br_with_wikidata/len(br_institutions)*100:.1f}%")
    print(f"Progress: {br_with_wikidata}/{len(br_institutions)}")

    if merged_count > 0:
        print(f"\n✅ SUCCESS: {merged_count} new Wikidata enrichments merged!")
        print(f"🇧🇷 Brazil coverage improved: 7 → {br_with_wikidata} institutions")
    else:
        print("\n⚠️ No new enrichments to merge (already up to date)")
    print("\n")


if __name__ == '__main__':
    merge_brazil_batch8()
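
# Usage (assumed invocation; run from the repository root so the relative
# data/ paths resolve, with PyYAML installed):
#   ./merge_brazil_batch8.py
# or:
#   python3 merge_brazil_batch8.py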