glam/merge_batch14.py
2025-11-19 23:25:22 +01:00

156 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Merge Batch 14 Wikidata enrichments into main GlobalGLAM dataset.
Date: 2025-11-11
Institutions: 3 Brazilian heritage custodians
"""
import yaml
from datetime import datetime, timezone
import shutil
# File paths
# Main GlobalGLAM dataset: a YAML list of institution records (mutated in place).
MAIN_FILE = 'data/instances/all/globalglam-20251111.yaml'
# Batch 14 enrichment source file (loaded for validation; the mapping itself
# is hard-coded inside merge_enrichments()).
ENRICHMENT_FILE = 'data/instances/brazil/batch14_enriched.yaml'
# Appended to MAIN_FILE to name the pre-merge backup copy.
BACKUP_SUFFIX = '.bak.batch14'
def load_yaml(filepath):
    """Parse *filepath* as UTF-8 YAML and return the resulting object."""
    with open(filepath, encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as UTF-8 YAML, preserving key order."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            default_flow_style=False,  # block style: one key per line
            allow_unicode=True,        # keep accented names readable
            sort_keys=False,           # preserve the dataset's key order
        )
def _existing_wikidata_qid(institution):
    """Return the institution's existing Wikidata QID, or None if absent."""
    for ident in institution.get('identifiers') or []:
        if ident.get('identifier_scheme') == 'Wikidata':
            return ident.get('identifier_value')
    return None


def _apply_enrichment(institution, enrichment, timestamp):
    """Attach the Wikidata identifier and a provenance history entry in place.

    Args:
        institution: one record from the main dataset (mutated).
        enrichment: dict with 'qid', 'label', 'description', 'confidence'.
        timestamp: ISO-8601 UTC timestamp recorded in the provenance entry.
    """
    # .get() guards both a missing key and an explicit None/empty value.
    if not institution.get('identifiers'):
        institution['identifiers'] = []
    institution['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['qid'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}"
    })
    if not institution.get('provenance'):
        institution['provenance'] = {}
    if not institution['provenance'].get('enrichment_history'):
        institution['provenance']['enrichment_history'] = []
    institution['provenance']['enrichment_history'].append({
        'enrichment_date': timestamp,
        'enrichment_method': 'Wikidata authenticated entity search (Batch 14)',
        'enrichment_source': 'batch14_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
        'confidence_score': enrichment['confidence']
    })
    institution['provenance']['last_updated'] = timestamp


def merge_enrichments():
    """Merge Batch 14 Wikidata enrichments into the main dataset.

    Backs up MAIN_FILE, adds a Wikidata identifier plus an enrichment-history
    entry to each matched institution (skipping any that already carry one),
    reports merge statistics, saves the dataset, and prints updated coverage
    for Brazilian institutions.
    """
    # Create a backup before mutating the dataset in place.
    print(f"Creating backup: {MAIN_FILE}{BACKUP_SUFFIX}")
    shutil.copy2(MAIN_FILE, f"{MAIN_FILE}{BACKUP_SUFFIX}")

    print("Loading main dataset...")
    main_data = load_yaml(MAIN_FILE)
    # NOTE(review): the enrichment file is loaded (so a missing file fails
    # fast) but its contents are never read — the mapping below is inlined.
    print("Loading enrichment data...")
    enrichment_data = load_yaml(ENRICHMENT_FILE)

    # Hard-coded Batch 14 enrichment mapping, keyed by institution id.
    enrichments = {
        'https://w3id.org/heritage/custodian/br/mg-ufmg-tainacan-lab': {
            'qid': 'Q132140',
            'label': 'Federal University of Minas Gerais',
            'description': 'public, federal university in Belo Horizonte, state of Minas Gerais, Brazil',
            'confidence': 0.90
        },
        'https://w3id.org/heritage/custodian/br/mg-mm-gerdau': {
            'qid': 'Q10333730',
            'label': 'MM Gerdau - Mines and Metal Museum',
            'description': 'museum in Belo Horizonte, Brazil',
            'confidence': 0.95
        },
        'https://w3id.org/heritage/custodian/br/pb-pedra-do-ing': {
            'qid': 'Q3076249',
            'label': 'Ingá Stone',
            'description': 'archaeological site in Ingá, Brazil',
            'confidence': 0.95
        }
    }

    # Track merge statistics.
    merged_count = 0
    found_ids = set()  # enrichment ids matched in main_data (merged or skipped)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Process each institution; guard clauses keep the happy path flat.
    for institution in main_data:
        inst_id = institution.get('id')
        if inst_id not in enrichments:
            continue
        found_ids.add(inst_id)
        enrichment = enrichments[inst_id]

        existing_qid = _existing_wikidata_qid(institution)
        if existing_qid is not None:
            print(f"⚠️ SKIPPING {institution['name']} - already has Wikidata: {existing_qid}")
            continue

        _apply_enrichment(institution, enrichment, timestamp)
        # Fix: the original log fused name and QID with no separator.
        print(f"✅ MERGED: {institution['name']} → {enrichment['qid']}")
        merged_count += 1

    # Ids requested in `enrichments` but absent from the dataset, in
    # mapping order (single pass instead of rescanning main_data per id).
    not_found = [i for i in enrichments if i not in found_ids]

    # Report results.
    print(f"\n{'='*60}")
    print(f"BATCH 14 MERGE COMPLETE")
    print(f"{'='*60}")
    print(f"Total enrichments: {len(enrichments)}")
    print(f"Successfully merged: {merged_count}")
    print(f"Not found in dataset: {len(not_found)}")
    if not_found:
        print(f"\n⚠️ Institutions not found:")
        for inst_id in not_found:
            print(f" - {inst_id}")

    # Save updated dataset.
    print(f"\nSaving updated dataset to {MAIN_FILE}...")
    save_yaml(main_data, MAIN_FILE)
    print(f"\n✅ Merge complete! Backup saved to {MAIN_FILE}{BACKUP_SUFFIX}")

    # Calculate new coverage for Brazilian institutions.
    print(f"\nCalculating new coverage...")
    br_institutions = [
        inst for inst in main_data
        if inst.get('locations')
        and any(loc.get('country') == 'BR' for loc in inst['locations'])
    ]
    with_wikidata = sum(
        1 for inst in br_institutions
        if inst.get('identifiers')
        and any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers'])
    )
    # Fix: guard against ZeroDivisionError when the dataset has no BR records.
    if br_institutions:
        pct = with_wikidata / len(br_institutions) * 100
        print(f"Brazilian institutions with Wikidata: {with_wikidata}/{len(br_institutions)} ({pct:.1f}%)")
    else:
        print("Brazilian institutions with Wikidata: 0/0 (no Brazilian institutions found)")
# Script entry point: run the merge only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    merge_enrichments()