120 lines
4.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Merge Brazil batch 8 enriched institutions back into unified global dataset.
|
|
This script merges the 2 missing Wikidata enrichments from batch 8.
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
|
|
def merge_brazil_batch8(
    unified_path='data/instances/all/globalglam-20251111.yaml',
    batch_path='data/instances/brazil/brazilian_institutions_batch8_enriched.yaml',
    output_dir='data/instances/all',
):
    """Merge Brazil batch 8 Wikidata enrichments into the unified dataset.

    Loads the unified global dataset and the batch-8 enriched Brazil records,
    replaces each unified record that gained a Wikidata identifier in the
    batch (preserving the master record's GHCID fields), writes the updated
    dataset to a date-stamped YAML file, and prints Brazil coverage stats.

    Args:
        unified_path: YAML file holding the unified dataset (list of dicts).
        batch_path: YAML file holding the enriched Brazil batch 8 records.
        output_dir: Directory the date-stamped output YAML is written to.
    """
    # GHCID fields are assigned in the master dataset and must survive a merge
    # (the batch copy may lack them).
    ghcid_fields = ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric')

    def _has_wikidata(inst):
        """Return True if any identifier entry uses the Wikidata scheme."""
        return any(
            idf.get('identifier_scheme') == 'Wikidata'
            for idf in inst.get('identifiers', [])
        )

    def _wikidata_value(inst):
        """Return the first Wikidata identifier value, or 'N/A' if absent."""
        return next(
            (idf.get('identifier_value')
             for idf in inst.get('identifiers', [])
             if idf.get('identifier_scheme') == 'Wikidata'),
            'N/A',
        )

    print("=" * 80)
    print("🔀 Merging Brazil batch 8 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open(unified_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 8 data
    print("\n📂 Loading Brazil batch 8 enriched data...")
    with open(batch_path, 'r', encoding='utf-8') as f:
        br_batch8 = yaml.safe_load(f)
    print(f" ✅ Loaded {len(br_batch8)} enriched Brazil institutions")

    # Create lookup by ID URL for quick matching
    br_by_id = {inst['id']: inst for inst in br_batch8}

    # Merge enriched data
    print("\n🔄 Merging enriched data...")
    merged_count = 0
    updated_count = 0

    for i, inst in enumerate(all_institutions):
        enriched_inst = br_by_id.get(inst.get('id'))
        if enriched_inst is None:
            continue

        # Merge only when batch 8 adds a Wikidata identifier the master lacks.
        has_wd_enriched = _has_wikidata(enriched_inst)
        has_wd_original = _has_wikidata(inst)

        if has_wd_enriched and not has_wd_original:
            # This is a new enrichment - merge it, preserving the master
            # record's GHCID fields if they exist.
            enriched_copy = enriched_inst.copy()
            for field in ghcid_fields:
                if field in inst:
                    enriched_copy[field] = inst[field]

            # Replace with enriched version
            all_institutions[i] = enriched_copy
            merged_count += 1
            print(f" ✅ Merged: {inst['name']} -> {_wikidata_value(enriched_copy)}")
        elif has_wd_enriched and has_wd_original:
            # Already has Wikidata, might have other updates
            updated_count += 1

    print(f"\n 📊 New Wikidata enrichments merged: {merged_count}")
    print(f" 📊 Already enriched (skipped): {updated_count}")

    # Save unified dataset with timestamp (UTC date keeps filenames stable
    # regardless of local timezone).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'{output_dir}/globalglam-{timestamp}.yaml'

    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved")

    # Verify Brazil coverage
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    br_with_wikidata = sum(1 for inst in br_institutions if _has_wikidata(inst))

    print("\n" + "=" * 80)
    print("📊 VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {len(br_institutions)}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    if br_institutions:
        # Guard: avoids ZeroDivisionError when the dataset has no BR records.
        print(f"Coverage: {br_with_wikidata/len(br_institutions)*100:.1f}%")
    print(f"Progress: {br_with_wikidata}/{len(br_institutions)}")

    if merged_count > 0:
        print(f"\n✅ SUCCESS: {merged_count} new Wikidata enrichments merged!")
        # Baseline is the post-merge count minus what this run added
        # (replaces a hardcoded "7" that went stale between runs).
        print(f"🇧🇷 Brazil coverage improved: {br_with_wikidata - merged_count} → {br_with_wikidata} institutions")
    else:
        print("\n⚠️ No new enrichments to merge (already up to date)")

    print("\n")
|
|
|
|
# Script entry point: run the merge when executed directly (file is
# executable with a python3 shebang); importing the module has no side effects.
if __name__ == '__main__':
    merge_brazil_batch8()
|