glam/merge_brazil_batch13.py
2025-11-19 23:25:22 +01:00

153 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Merge Brazil Batch 13 enriched institutions back into unified global dataset.
This script merges 9 verified Wikidata Q-numbers from Batch 13.
"""
import yaml
from datetime import datetime, timezone
def merge_brazil_batch13():
    """Merge Brazil Batch 13 Wikidata enrichments into the unified dataset.

    Loads the unified global GLAM dataset and the Batch 13 enrichment file,
    adds a Wikidata identifier (plus a provenance record) to every matched
    institution that does not already have one, saves a date-stamped copy of
    the dataset, and prints a Brazil coverage report.

    Side effects: reads two YAML files under ``data/instances/`` and writes
    ``data/instances/all/globalglam-<YYYYMMDD>.yaml``.
    """
    print("=" * 80)
    print("🔀 Merging Brazil Batch 13 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    all_institutions = _load_yaml('data/instances/all/globalglam-20251111.yaml')
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 13 data
    print("\n📂 Loading Brazil Batch 13 enriched data...")
    br_batch13 = _load_yaml('data/instances/brazil/batch13_enriched.yaml')
    successful_matches = br_batch13['successful_matches']
    print(f" ✅ Loaded {len(successful_matches)} enriched institutions")

    enrichment_map = _build_enrichment_map(successful_matches)
    print(f" 📋 Enrichment map created for {len(enrichment_map)} institutions")

    # Capture the Brazil Wikidata coverage BEFORE merging so the success
    # message reports the true gain (the previous version hard-coded 67,
    # which is only correct for the single run it was written after).
    _, br_wikidata_before = _brazil_coverage(all_institutions)

    # Merge enriched data (mutates all_institutions in place)
    print("\n🔄 Merging enriched data...")
    merged_count, updated_count = _apply_enrichments(all_institutions, enrichment_map)

    # Report enrichment ids that never matched any institution in the dataset
    not_found_count = 0
    found_ids = {inst['id'] for inst in all_institutions}
    for inst_id in enrichment_map:
        if inst_id not in found_ids:
            not_found_count += 1
            print(f" ⚠️ Not found in dataset: {inst_id}")

    print(f"\n 📊 New Wikidata enrichments merged: {merged_count}")
    print(f" 📊 Already enriched (skipped): {updated_count}")
    print(f" 📊 Not found in main dataset: {not_found_count}")

    # Save unified dataset under today's (UTC) date stamp
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(" ✅ Saved")

    # Verify Brazil coverage after the merge
    br_institutions, br_with_wikidata = _brazil_coverage(all_institutions)
    total_br = len(br_institutions)
    print("\n" + "=" * 80)
    print("📊 VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {total_br}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    # Guard against an empty Brazil subset (previously a ZeroDivisionError)
    coverage_pct = (br_with_wikidata / total_br * 100) if total_br else 0.0
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Progress: {br_with_wikidata}/{total_br}")
    if merged_count > 0:
        print(f"\n✅ SUCCESS: {merged_count} new Wikidata enrichments merged!")
        print(f"🇧🇷 Brazil coverage improved: {br_wikidata_before} → {br_with_wikidata} institutions")
        gain_pct = ((br_with_wikidata - br_wikidata_before) / total_br * 100) if total_br else 0.0
        print(f" Coverage gain: {gain_pct:.1f}%")
    else:
        print("\n⚠️ No new enrichments to merge (already up to date)")
    print("\n")


def _load_yaml(path):
    """Load and return the parsed contents of the YAML file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _build_enrichment_map(successful_matches):
    """Map institution ids to their Wikidata enrichment details.

    Matches without an ``institution_id`` are skipped.
    """
    enrichment_map = {}
    for match in successful_matches:
        inst_id = match.get('institution_id')
        if inst_id:
            enrichment_map[inst_id] = {
                'qid': match['wikidata_qid'],
                'label': match['wikidata_label'],
                'description': match.get('wikidata_description', ''),
                'confidence': match['confidence'],
            }
    return enrichment_map


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    return any(
        idf.get('identifier_scheme') == 'Wikidata'
        for idf in inst.get('identifiers', [])
    )


def _apply_enrichments(all_institutions, enrichment_map):
    """Add Wikidata identifiers and provenance to matched institutions.

    Mutates *all_institutions* in place. Returns a ``(merged, skipped)``
    tuple: how many institutions gained a new Wikidata identifier, and how
    many already had one.
    """
    merged_count = 0
    updated_count = 0
    for inst in all_institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrichment = enrichment_map[inst_id]
        if _has_wikidata(inst):
            # Already enriched — report the existing Q-number and skip
            updated_count += 1
            existing_q = next(
                (idf.get('identifier_value') for idf in inst.get('identifiers', [])
                 if idf.get('identifier_scheme') == 'Wikidata'),
                'N/A'
            )
            print(f" ⏭️ Already has Wikidata: {inst['name']} ({existing_q})")
            continue
        # Add new Wikidata identifier
        inst.setdefault('identifiers', []).append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': enrichment['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}",
        })
        # Update provenance; a single timestamp keeps enrichment_history and
        # last_updated consistent (previously two separate now() calls)
        now_iso = datetime.now(timezone.utc).isoformat()
        provenance = inst.setdefault('provenance', {})
        provenance.setdefault('enrichment_history', []).append({
            'enrichment_date': now_iso,
            'enrichment_type': 'WIKIDATA_IDENTIFIER',
            'enrichment_method': 'AUTHENTICATED_SEARCH_BATCH13',
            'match_score': enrichment['confidence'],
            'verified': True,
            'enrichment_source': 'https://www.wikidata.org',
            'enrichment_notes': f"Batch 13: {enrichment['label']}",
        })
        provenance['last_updated'] = now_iso
        merged_count += 1
        print(f" ✅ Merged: {inst['name']} -> {enrichment['qid']}")
    return merged_count, updated_count


def _brazil_coverage(all_institutions):
    """Return ``(brazil_institutions, with_wikidata_count)``.

    An institution counts as Brazilian when any of its locations has
    ``country == 'BR'``.
    """
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    with_wikidata = sum(1 for inst in br_institutions if _has_wikidata(inst))
    return br_institutions, with_wikidata
# Script entry point: run the Batch 13 merge when executed directly.
if __name__ == '__main__':
    merge_brazil_batch13()