glam/scripts/merge_global_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

384 lines
14 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Global Dataset Merge Script
Merges all regional ISIL datasets into a unified global heritage custodian database:
- Japan ISIL institutions (12,065 records)
- Netherlands ISIL institutions (369 records)
- EU institutions (10 records)
- Latin America institutions (304 records)
Output: Comprehensive global dataset with ~12,748 institutions
Author: GLAM Data Extraction Project
Date: 2025-11-07
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import Counter
def load_yaml_dataset(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML dataset from *file_path*, always returning a list.

    A top-level YAML sequence is returned as-is; a lone mapping is wrapped
    as a single-institution list. Anything else raises ValueError.
    """
    print(f"Loading {file_path.name}...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        # A single institution record: normalize to a one-element list.
        return [parsed]
    raise ValueError(f"Unexpected data format in {file_path}")
def analyze_dataset(institutions: List[Dict[str, Any]], name: str) -> Dict[str, Any]:
    """Analyze dataset structure and contents, printing a summary report.

    Args:
        institutions: YAML-derived institution dicts.
        name: Human-readable dataset label used in the printed header.

    Returns:
        Stats dict containing 'name', 'total_count', Counter fields
        ('countries', 'institution_types', 'data_sources', 'data_tiers'),
        and coverage counts ('has_ghcid', 'has_coordinates', 'has_website',
        'has_identifiers').
    """
    print(f"\n{'='*60}")
    print(f"Analyzing {name}")
    print(f"{'='*60}")
    stats = {
        'name': name,
        'total_count': len(institutions),
        'countries': Counter(),
        'institution_types': Counter(),
        'data_sources': Counter(),
        'data_tiers': Counter(),
        'has_ghcid': 0,
        'has_coordinates': 0,
        'has_website': 0,
        'has_identifiers': 0,
    }
    for inst in institutions:
        # Country distribution (first location is treated as primary).
        if 'locations' in inst and inst['locations']:
            loc = inst['locations'][0]
            stats['countries'][loc.get('country', 'UNKNOWN')] += 1
            # Compare against None (not truthiness) so legitimate 0.0
            # coordinates on the equator/prime meridian count as geocoded.
            if loc.get('latitude') is not None or loc.get('longitude') is not None:
                stats['has_coordinates'] += 1
        # Institution type
        stats['institution_types'][inst.get('institution_type', 'UNKNOWN')] += 1
        # Provenance
        if 'provenance' in inst:
            prov = inst['provenance']
            stats['data_sources'][prov.get('data_source', 'UNKNOWN')] += 1
            stats['data_tiers'][prov.get('data_tier', 'UNKNOWN')] += 1
        # GHCID
        if inst.get('ghcid'):
            stats['has_ghcid'] += 1
        # Identifiers / website presence
        if inst.get('identifiers'):
            stats['has_identifiers'] += 1
            if any(ident.get('identifier_scheme') == 'Website'
                   for ident in inst['identifiers']):
                stats['has_website'] += 1
    # Guard denominator so an empty dataset prints 0.0% instead of raising
    # ZeroDivisionError.
    denom = max(stats['total_count'], 1)
    # Print summary
    print(f"Total Records: {stats['total_count']:,}")
    print(f"\nCountries ({len(stats['countries'])}):")
    for country, count in stats['countries'].most_common():
        print(f" {country}: {count:,} ({count/denom*100:.1f}%)")
    print(f"\nInstitution Types:")
    for inst_type, count in stats['institution_types'].most_common():
        print(f" {inst_type}: {count:,} ({count/denom*100:.1f}%)")
    print(f"\nData Quality:")
    print(f" GHCID Coverage: {stats['has_ghcid']:,} ({stats['has_ghcid']/denom*100:.1f}%)")
    print(f" Has Coordinates: {stats['has_coordinates']:,} ({stats['has_coordinates']/denom*100:.1f}%)")
    print(f" Has Website: {stats['has_website']:,} ({stats['has_website']/denom*100:.1f}%)")
    print(f" Has Identifiers: {stats['has_identifiers']:,} ({stats['has_identifiers']/denom*100:.1f}%)")
    return stats
def deduplicate_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate institutions by GHCID (primary) or ISIL code (fallback).

    For duplicates, prefer records with:
    1. Higher data tier (TIER_1 > TIER_2 > TIER_3 > TIER_4)
    2. More complete data (more fields filled)
    3. Most recent extraction date (ISO date strings compare lexicographically)

    Records carrying a GHCID are keyed on the GHCID alone; GHCID-less records
    fall back to their first ISIL identifier; records with neither key are
    always kept.

    Args:
        institutions: YAML-derived institution dicts.

    Returns:
        New list with duplicates resolved. First-seen order is preserved,
        except that a later, better record replaces the earlier one and is
        appended at the end.
    """
    seen_ghcids: Dict[Any, Dict[str, Any]] = {}
    seen_isil: Dict[Any, Dict[str, Any]] = {}
    duplicates: List[Dict[str, Any]] = []
    # Higher number wins when comparing provenance tiers; unknown tiers rank 0.
    tier_priority = {
        'TIER_1_AUTHORITATIVE': 4,
        'TIER_2_VERIFIED': 3,
        'TIER_3_CROWD_SOURCED': 2,
        'TIER_4_INFERRED': 1,
    }

    def completeness_score(inst: Dict[str, Any]) -> int:
        """Rough count of filled fields; used as a duplicate tie-breaker."""
        score = 0
        score += 1 if inst.get('name') else 0
        score += 1 if inst.get('description') else 0
        score += len(inst.get('identifiers', []))
        score += len(inst.get('locations', []))
        score += len(inst.get('digital_platforms', []))
        score += 1 if inst.get('ghcid') else 0
        if inst.get('locations'):
            loc = inst['locations'][0]
            score += 1 if loc.get('street_address') else 0
            score += 1 if loc.get('postal_code') else 0
            score += 1 if loc.get('latitude') else 0
        return score

    def is_better_record(new_inst: Dict[str, Any], existing_inst: Dict[str, Any]) -> bool:
        """Return True if new_inst should replace existing_inst."""
        new_prov = new_inst.get('provenance', {})
        existing_prov = existing_inst.get('provenance', {})
        # 1. Compare data tiers.
        new_tier = tier_priority.get(new_prov.get('data_tier', ''), 0)
        existing_tier = tier_priority.get(existing_prov.get('data_tier', ''), 0)
        if new_tier != existing_tier:
            return new_tier > existing_tier
        # 2. Compare completeness.
        new_score = completeness_score(new_inst)
        existing_score = completeness_score(existing_inst)
        if new_score != existing_score:
            return new_score > existing_score
        # 3. Compare extraction dates; ties keep the existing record.
        return new_prov.get('extraction_date', '') > existing_prov.get('extraction_date', '')

    def extract_isil(inst: Dict[str, Any]) -> Any:
        """First ISIL identifier value of a record, or None."""
        for ident in inst.get('identifiers') or []:
            if ident.get('identifier_scheme') == 'ISIL':
                return ident.get('identifier_value')
        return None

    deduped: List[Dict[str, Any]] = []
    for inst in institutions:
        ghcid = inst.get('ghcid')
        isil_code = extract_isil(inst)
        # Check for GHCID duplicates
        if ghcid:
            if ghcid in seen_ghcids:
                duplicates.append({
                    'ghcid': ghcid,
                    'name1': seen_ghcids[ghcid].get('name'),
                    'name2': inst.get('name'),
                })
                if is_better_record(inst, seen_ghcids[ghcid]):
                    # Remove the previously kept record, then keep the new one.
                    deduped = [rec for rec in deduped if rec.get('ghcid') != ghcid]
                    seen_ghcids[ghcid] = inst
                    deduped.append(inst)
                # else: keep existing record; either way the duplicate is consumed.
                continue
            seen_ghcids[ghcid] = inst
        # Check for ISIL duplicates (only when the record has no GHCID)
        elif isil_code:
            if isil_code in seen_isil:
                duplicates.append({
                    'isil': isil_code,
                    'name1': seen_isil[isil_code].get('name'),
                    'name2': inst.get('name'),
                })
                if is_better_record(inst, seen_isil[isil_code]):
                    # Remove any kept record carrying this ISIL, then keep the new one.
                    deduped = [rec for rec in deduped
                               if not any(ident.get('identifier_value') == isil_code
                                          for ident in rec.get('identifiers', [])
                                          if ident.get('identifier_scheme') == 'ISIL')]
                    seen_isil[isil_code] = inst
                    deduped.append(inst)
                # else: keep existing record; either way the duplicate is consumed.
                continue
            seen_isil[isil_code] = inst
        deduped.append(inst)
    if duplicates:
        print(f"\n⚠️ Found {len(duplicates)} duplicates (resolved by keeping best record):")
        for dup in duplicates[:10]:  # Show first 10
            if 'ghcid' in dup:
                print(f" GHCID {dup['ghcid']}: '{dup['name1']}' vs '{dup['name2']}'")
            else:
                print(f" ISIL {dup['isil']}: '{dup['name1']}' vs '{dup['name2']}'")
        if len(duplicates) > 10:
            print(f" ... and {len(duplicates) - 10} more")
    return deduped
def merge_datasets(datasets: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Concatenate all regional datasets, then deduplicate the combined list."""
    print(f"\n{'='*60}")
    print("Merging Datasets")
    print(f"{'='*60}")
    # Flatten every regional list into one combined list.
    combined: List[Dict[str, Any]] = []
    for label, records in datasets.items():
        print(f"Adding {len(records):,} records from {label}")
        combined.extend(records)
    print(f"\nTotal before deduplication: {len(combined):,}")
    merged = deduplicate_institutions(combined)
    removed = len(combined) - len(merged)
    print(f"Total after deduplication: {len(merged):,}")
    print(f"Removed: {removed:,} duplicates")
    return merged
def main(base_path: Path = Path('/Users/kempersc/apps/glam')) -> None:
    """Main execution: load, analyze, merge, and export all regional datasets.

    Writes three artifacts under ``<base_path>/data/instances/global``:
    the merged YAML dataset, a YAML statistics file, and a markdown report.

    Args:
        base_path: Project root containing the ``data/`` tree. Defaults to
            the original hard-coded location for backward compatibility.
    """
    def _plain(obj: Any) -> Any:
        """Recursively convert Counters to plain dicts so yaml.dump emits
        portable mappings instead of Python-specific object tags."""
        if isinstance(obj, Counter):
            return dict(obj)
        if isinstance(obj, dict):
            return {key: _plain(value) for key, value in obj.items()}
        return obj
    # Define source datasets
    datasets = {
        'Japan ISIL': base_path / 'data/instances/japan/jp_institutions_resolved.yaml',  # Using collision-resolved dataset
        'Netherlands ISIL': base_path / 'data/dutch_institutions_with_ghcids.yaml',
        'EU Institutions': base_path / 'data/instances/eu_institutions.yaml',
        'Latin America': base_path / 'data/instances/latin_american_institutions_AUTHORITATIVE.yaml',
    }
    # Load all datasets; a missing file is a warning, not a fatal error.
    loaded_datasets = {}
    for name, path in datasets.items():
        if path.exists():
            loaded_datasets[name] = load_yaml_dataset(path)
        else:
            print(f"⚠️ Warning: {name} not found at {path}")
    # Analyze each dataset
    stats = {}
    for name, institutions in loaded_datasets.items():
        stats[name] = analyze_dataset(institutions, name)
    # Merge datasets
    merged_institutions = merge_datasets(loaded_datasets)
    # Analyze merged dataset
    merged_stats = analyze_dataset(merged_institutions, "GLOBAL MERGED DATASET")
    # Single timestamp so the YAML statistics and markdown report agree.
    merge_time = datetime.now(timezone.utc)
    # Generate output files
    output_dir = base_path / 'data/instances/global'
    output_dir.mkdir(parents=True, exist_ok=True)
    # 1. Main YAML file
    output_yaml = output_dir / 'global_heritage_institutions.yaml'
    print(f"\n{'='*60}")
    print(f"Writing merged dataset to {output_yaml.name}")
    print(f"{'='*60}")
    with open(output_yaml, 'w', encoding='utf-8') as f:
        yaml.dump(merged_institutions, f,
                  allow_unicode=True,
                  default_flow_style=False,
                  sort_keys=False,
                  width=120)
    print(f"✅ Wrote {len(merged_institutions):,} institutions to {output_yaml}")
    # 2. Statistics report
    stats_file = output_dir / 'merge_statistics.yaml'
    merge_metadata = {
        'merge_date': merge_time.isoformat(),
        'total_institutions': len(merged_institutions),
        'source_datasets': {
            name: {
                'count': len(institutions),
                'file': str(datasets[name].relative_to(base_path))
            }
            for name, institutions in loaded_datasets.items()
        },
        'regional_statistics': stats,
        'merged_statistics': merged_stats,
    }
    with open(stats_file, 'w', encoding='utf-8') as f:
        # Counters are flattened to plain dicts so the YAML stays portable.
        yaml.dump(_plain(merge_metadata), f,
                  allow_unicode=True,
                  default_flow_style=False,
                  sort_keys=False)
    print(f"✅ Wrote statistics to {stats_file}")
    # 3. Summary report (markdown)
    report_file = output_dir / 'merge_report.md'
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Global Heritage Institutions Dataset - Merge Report\n\n")
        f.write(f"**Merge Date**: {merge_time.strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n")
        f.write(f"**Total Institutions**: {len(merged_institutions):,}\n\n")
        f.write("## Source Datasets\n\n")
        for name, institutions in loaded_datasets.items():
            f.write(f"- **{name}**: {len(institutions):,} records\n")
        f.write(f"\n## Country Distribution\n\n")
        for country, count in merged_stats['countries'].most_common():
            pct = count / merged_stats['total_count'] * 100
            f.write(f"- **{country}**: {count:,} ({pct:.1f}%)\n")
        f.write(f"\n## Institution Types\n\n")
        for inst_type, count in merged_stats['institution_types'].most_common():
            pct = count / merged_stats['total_count'] * 100
            f.write(f"- **{inst_type}**: {count:,} ({pct:.1f}%)\n")
        f.write(f"\n## Data Quality Metrics\n\n")
        total = merged_stats['total_count']
        f.write(f"- **GHCID Coverage**: {merged_stats['has_ghcid']:,} ({merged_stats['has_ghcid']/total*100:.1f}%)\n")
        f.write(f"- **Geocoded (has coordinates)**: {merged_stats['has_coordinates']:,} ({merged_stats['has_coordinates']/total*100:.1f}%)\n")
        f.write(f"- **Has Website**: {merged_stats['has_website']:,} ({merged_stats['has_website']/total*100:.1f}%)\n")
        f.write(f"- **Has Identifiers**: {merged_stats['has_identifiers']:,} ({merged_stats['has_identifiers']/total*100:.1f}%)\n")
        f.write(f"\n## Next Steps\n\n")
        f.write("1. **Geocoding**: Add coordinates to remaining institutions\n")
        f.write("2. **Enrichment**: Add Wikidata/VIAF identifiers\n")
        f.write("3. **Validation**: Schema compliance check\n")
        f.write("4. **Export**: Generate JSON-LD, GeoJSON, CSV formats\n")
    print(f"✅ Wrote report to {report_file}")
    print(f"\n{'='*60}")
    print("Global Dataset Merge Complete! 🎉")
    print(f"{'='*60}")
    print(f"Output directory: {output_dir}")
    print(f"Total institutions: {len(merged_institutions):,}")
# Script entry point: run the full global merge pipeline.
if __name__ == '__main__':
    main()