# glam/merge_batch13_corrected.py
# 2025-11-19 23:25:22 +01:00
# 181 lines, 6.1 KiB, Python
#!/usr/bin/env python3
"""
Merge Batch 13 Wikidata Enrichments - Corrected Version
Adds verified Q-numbers to 3 Brazilian institutions.
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Verified enrichments (IDs corrected).
# Maps a dataset institution id (string) to its verified Wikidata match:
#   qid         - the Q-number to attach as an identifier
#   name        - the institution's name, used for progress/error reporting
#   label/description - the matched entity's English label and description,
#                 recorded in the provenance history for auditing.
ENRICHMENTS = {
    "3008281717687280329": {
        "name": "UNIR",
        "qid": "Q7894377",
        "label": "Federal University of Rondônia",
        "description": "Brazilian public university",
    },
    "709508309148680086": {
        "name": "Secult Tocantins",
        "qid": "Q108397863",
        "label": "Secretary of Culture of the State of Tocantins",
        "description": "state secretariat responsible for cultural related affairs in the state of Tocantins, Brazil",
    },
    "2519599505258789521": {
        "name": "Instituto Histórico e Geográfico de Alagoas",
        "qid": "Q10302531",
        "label": "Instituto Histórico e Geográfico de Alagoas",
        "description": "research institute and museum in Maceió, Brazil",
    },
}
def main():
    """Merge Batch 13 Wikidata enrichments into the GLAM dataset.

    Loads the YAML instance file, attaches a verified Wikidata Q-number
    (with a provenance record) to each institution listed in ENRICHMENTS
    that does not already have one, backs up and rewrites the file in
    place, then prints a Brazil-wide Wikidata coverage summary.
    """
    dataset_path = Path("data/instances/all/globalglam-20251111.yaml")
    backup_path = Path("data/instances/all/globalglam-20251111.yaml.bak.batch13")

    print("=" * 80)
    print("BRAZIL BATCH 13 WIKIDATA ENRICHMENT MERGE")
    print("=" * 80)

    # Load dataset. The file may be a multi-document YAML stream or a
    # single document holding one list; normalize to a flat list.
    print(f"\nLoading dataset: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    print(f"Loaded {len(institutions)} institutions")

    # Track changes
    enriched_count = 0
    skipped_count = 0
    matched_ids = set()  # ENRICHMENTS keys actually found in the dataset
    errors = []

    print("\n" + "=" * 80)
    print("PROCESSING ENRICHMENTS")
    print("=" * 80)

    for inst in institutions:
        if not isinstance(inst, dict):
            continue
        inst_id = str(inst.get('id', ''))
        if inst_id not in ENRICHMENTS:
            continue
        matched_ids.add(inst_id)
        enrichment = ENRICHMENTS[inst_id]
        if _has_wikidata(inst):
            print(f"\n⚠ SKIP: {inst.get('name')} (ID: {inst_id})")
            print(" Already has Wikidata Q-number")
            skipped_count += 1
            continue
        _apply_enrichment(inst, enrichment)
        print(f"\n✓ ENRICHED: {inst.get('name')}")
        print(f" ID: {inst_id}")
        print(f" Q-number: {enrichment['qid']}")
        print(f" Label: {enrichment['label']}")
        enriched_count += 1

    # BUG FIX: `errors` was declared and reported but never populated.
    # Record every enrichment ID that matched no institution so that a
    # typo in ENRICHMENTS cannot silently produce a no-op merge.
    for missing_id in sorted(set(ENRICHMENTS) - matched_ids):
        errors.append(
            f"ID {missing_id} ({ENRICHMENTS[missing_id]['name']}) not found in dataset"
        )

    # Summary
    print("\n" + "=" * 80)
    print("MERGE SUMMARY")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"Institutions skipped: {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")

    if enriched_count > 0:
        # Back up the original file before rewriting it in place.
        print(f"\nCreating backup: {backup_path}")
        import shutil  # local import kept: only needed on the write path
        shutil.copy2(dataset_path, backup_path)

        print(f"Writing updated dataset: {dataset_path}")
        with open(dataset_path, 'w', encoding='utf-8') as f:
            yaml.dump_all(
                [institutions],
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=1000
            )
        print("\n✓ Merge completed successfully!")

        # Recompute Brazil-wide Wikidata coverage after the merge.
        brazil_institutions, brazil_with_wikidata = _brazil_coverage(institutions)
        total = len(brazil_institutions)
        # BUG FIX: guard against ZeroDivisionError when the dataset
        # contains no Brazilian institutions at all.
        coverage = (len(brazil_with_wikidata) / total * 100) if total else 0.0
        print("\n" + "=" * 80)
        print("WIKIDATA COVERAGE (BRAZIL)")
        print("=" * 80)
        print(f"Total Brazilian institutions: {total}")
        print(f"With Wikidata Q-numbers: {len(brazil_with_wikidata)}")
        print(f"Coverage: {coverage:.1f}%")
        # NOTE: the baseline figures below are hard-coded from the
        # batch-12 report; update them if this script is reused.
        print("Previous coverage: 57.0% (69/121)")
        print(f"Improvement: +{len(brazil_with_wikidata) - 69} institutions (+{coverage - 57.0:.1f}%)")
    else:
        print("\nNo changes made - no merge performed.")


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier.

    BUG FIX: a missing or None 'identifiers' field is treated as empty;
    the original iterated inst.get('identifiers', []) directly and would
    have raised TypeError on an explicit null in the YAML.
    """
    return any(
        i.get('identifier_scheme') == 'Wikidata'
        for i in (inst.get('identifiers') or [])
        if isinstance(i, dict)
    )


def _apply_enrichment(inst, enrichment):
    """Append a Wikidata identifier and a provenance history entry to *inst*."""
    qid = enrichment['qid']
    if inst.get('identifiers') is None:  # covers both missing key and explicit null
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f"https://www.wikidata.org/wiki/{qid}",
    })
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata authenticated entity search (Batch 13)',
        'enrichment_source': 'batch13_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
    })


def _brazil_coverage(institutions):
    """Return (brazilian, brazilian_with_wikidata) institution lists.

    An institution counts as Brazilian when any of its location dicts
    has country == 'BR'; a missing/None 'locations' field is treated as
    empty.
    """
    brazilian = [
        inst for inst in institutions
        if isinstance(inst, dict) and any(
            loc.get('country') == 'BR'
            for loc in (inst.get('locations') or [])
            if isinstance(loc, dict)
        )
    ]
    return brazilian, [inst for inst in brazilian if _has_wikidata(inst)]
# Script entry point: run the merge only when executed directly, not on import.
if __name__ == "__main__":
    main()