glam/scripts/deduplicate_mexican_institutions.py
2025-11-19 23:25:22 +01:00

200 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Deduplicate Mexican Heritage Institutions
Problem: Mexican institutions appear twice in the dataset:
- First entry: Missing GHCID (value is None or 'N/A')
- Second entry: Has proper GHCID assigned
This script:
1. Identifies duplicate Mexican institutions by name
2. Merges records, preferring the one with GHCID
3. Removes GHCID=N/A duplicates
4. Preserves 26 unique institutions that only have GHCID entries
5. Exports deduplicated dataset
Author: GLAM Data Extraction Project
Date: 2025-11-13
"""
import yaml
from typing import List, Dict, Any
from collections import defaultdict
def load_institutions(filepath: str) -> List[Dict[str, Any]]:
    """Load the institution list from a YAML file.

    Args:
        filepath: Path to the YAML file to read (UTF-8).

    Returns:
        The parsed list of institution records. Returns an empty list for
        an empty file: ``yaml.safe_load`` yields ``None`` there, which
        would otherwise violate the declared return type and break
        iteration in the caller.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []
def is_mexican_institution(inst: Dict[str, Any]) -> bool:
    """Return True if any of the institution's locations is in Mexico (country 'MX')."""
    for location in inst.get('locations', []):
        if location.get('country') == 'MX':
            return True
    return False
def _merge_keyed_list(base, extra, key):
    """Return a copy of ``base`` extended with items of ``extra`` whose
    ``key(item)`` is not already present. Neither input is mutated."""
    result = base.copy()
    seen = {key(item) for item in result}
    for item in extra:
        item_key = key(item)
        if item_key not in seen:
            result.append(item)
            seen.add(item_key)
    return result


def merge_institution_records(without_ghcid: Dict[str, Any],
                              with_ghcid: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two institution records, preferring with_ghcid version.

    Strategy:
    - Use with_ghcid as base (has proper GHCID)
    - Add any missing fields from without_ghcid version
    - Merge lists (identifiers, collections, digital platforms,
      alternative names) without duplicates

    Args:
        without_ghcid: Duplicate record lacking a proper GHCID.
        with_ghcid: Record carrying the proper GHCID; wins on conflicts.

    Returns:
        A new merged record (shallow copy); the inputs are not mutated.
    """
    merged = with_ghcid.copy()

    # Identifiers are unique by (scheme, value) pair.
    identifiers = _merge_keyed_list(
        with_ghcid.get('identifiers', []),
        without_ghcid.get('identifiers', []),
        lambda i: (i.get('identifier_scheme'), i.get('identifier_value')),
    )
    if identifiers:
        merged['identifiers'] = identifiers

    # Collections are unique by collection name.
    collections = _merge_keyed_list(
        with_ghcid.get('collections', []),
        without_ghcid.get('collections', []),
        lambda c: c.get('collection_name'),
    )
    if collections:
        merged['collections'] = collections

    # Digital platforms are unique by platform name.
    platforms = _merge_keyed_list(
        with_ghcid.get('digital_platforms', []),
        without_ghcid.get('digital_platforms', []),
        lambda p: p.get('platform_name'),
    )
    if platforms:
        merged['digital_platforms'] = platforms

    # Alternative names: set union, sorted for deterministic output.
    alt_names = set(with_ghcid.get('alternative_names', []))
    alt_names.update(without_ghcid.get('alternative_names', []))
    if alt_names:
        merged['alternative_names'] = sorted(alt_names)

    # Use description from with_ghcid, fallback to without_ghcid if missing.
    if not merged.get('description') and without_ghcid.get('description'):
        merged['description'] = without_ghcid['description']
    return merged
def deduplicate_mexican_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate Mexican institutions.

    Mexican records sharing a name are merged (the GHCID-bearing copy
    wins); non-Mexican records pass through untouched.

    Returns:
        List of deduplicated institutions (Mexican + all non-Mexican)
    """
    mexican: List[Dict[str, Any]] = []
    others: List[Dict[str, Any]] = []
    # Partition the dataset by country.
    for record in institutions:
        (mexican if is_mexican_institution(record) else others).append(record)
    print(f"Total institutions: {len(institutions)}")
    print(f"Mexican institutions: {len(mexican)}")
    print(f"Non-Mexican institutions: {len(others)}")

    # Bucket Mexican records by institution name.
    groups = defaultdict(list)
    for record in mexican:
        groups[record.get('name', 'Unknown')].append(record)
    print(f"Unique Mexican institution names: {len(groups)}")

    result_mexican: List[Dict[str, Any]] = []
    merged_count = 0
    kept_count = 0
    for name, group in groups.items():
        if len(group) == 1:
            # No duplicate under this name — keep as-is.
            result_mexican.append(group[0])
            kept_count += 1
            continue
        # Split the group into GHCID-bearing and GHCID-less copies.
        has_id = [r for r in group if r.get('ghcid') and r.get('ghcid') != 'N/A']
        no_id = [r for r in group if not r.get('ghcid') or r.get('ghcid') == 'N/A']
        if len(has_id) == 1 and len(no_id) == 1:
            # Standard duplicate pair: merge, preferring the GHCID record.
            result_mexican.append(merge_institution_records(no_id[0], has_id[0]))
            merged_count += 1
        elif len(has_id) == 1 and not no_id:
            # Only the GHCID version exists (unexpected inside a multi-entry group).
            result_mexican.append(has_id[0])
            kept_count += 1
        else:
            # Ambiguous grouping — keep everything for manual review.
            print(f"WARNING: Complex duplicate for '{name}': {len(has_id)} with GHCID, {len(no_id)} without")
            result_mexican.extend(group)
    print(f"\nDeduplication results:")
    print(f" Duplicates merged: {merged_count}")
    print(f" Unique institutions kept: {kept_count}")
    print(f" Total deduplicated Mexican institutions: {len(result_mexican)}")

    # Recombine with the untouched non-Mexican records.
    final = others + result_mexican
    print(f"\nFinal dataset size: {len(final)} (was {len(institutions)})")
    print(f"Reduction: {len(institutions) - len(final)} duplicates removed")
    return final
def main():
    """Main execution."""
    input_file = 'data/instances/all/globalglam-20251111-brazil-campaign-final.yaml'
    output_file = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    banner = "=" * 80
    print(banner)
    print("Mexican Heritage Institutions Deduplication")
    print(banner)
    print()

    # Load the input dataset.
    print(f"Loading: {input_file}")
    institutions = load_institutions(input_file)

    # Run the deduplication pass.
    deduplicated = deduplicate_mexican_institutions(institutions)

    # Write the deduplicated dataset back out as YAML.
    print(f"\nSaving deduplicated dataset: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(deduplicated, f, allow_unicode=True, sort_keys=False, width=120)
    print("\n✓ Deduplication complete!")
    print(f" Output: {output_file}")


if __name__ == '__main__':
    main()