glam/scripts/apply_collision_resolution_dutch_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

485 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Apply Collision Resolution to Dutch Datasets
This script:
1. Parses Dutch ISIL registry (364 institutions)
2. Parses Dutch organizations CSV (1,351 institutions)
3. Deduplicates combined dataset to remove true duplicates
4. Generates GHCIDs for all institutions
5. Applies collision detection and resolution
6. Exports merged dataset with collision statistics
7. Generates detailed collision analysis report
Usage:
python scripts/apply_collision_resolution_dutch_datasets.py
Output:
- data/dutch_institutions_with_ghcids.yaml - Full dataset with resolved GHCIDs
- data/dutch_collision_report.txt - Detailed collision analysis
- data/dutch_collision_stats.json - Machine-readable statistics
- data/dutch_deduplication_report.txt - Deduplication details
"""
import json
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Set, Tuple
import yaml
from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
from glam_extractor.identifiers.ghcid import GHCIDGenerator, InstitutionType
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
from glam_extractor.models import HeritageCustodian
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.parsers.isil_registry import ISILRegistryParser
def generate_ghcid_for_institution(
    institution: HeritageCustodian, generator: GHCIDGenerator
) -> None:
    """
    Generate a GHCID for a HeritageCustodian instance.

    Updates the institution in-place with ``ghcid``, ``ghcid_uuid``,
    ``ghcid_uuid_sha256`` and ``ghcid_numeric``. Institutions without a
    usable city (required to derive the location components) are skipped
    with a warning and left unmodified.

    Args:
        institution: HeritageCustodian to update in-place.
        generator: GHCIDGenerator used to build the identifier components.
    """
    if not institution.locations or not institution.locations[0].city:
        print(f"Warning: No city for {institution.name}, skipping GHCID generation")
        return
    city = institution.locations[0].city
    # Get GHCID components for this city. Resolve the InstitutionType enum
    # once, inside the guarded block: the original code repeated the lookup
    # later outside any try/except, where a KeyError would be uncaught.
    try:
        inst_type = InstitutionType[institution.institution_type]
        component_dict = get_ghcid_components_for_dutch_city(
            city=city,
            institution_name=institution.name,
            institution_type=inst_type,
        )
    except (ValueError, KeyError) as e:
        print(
            f"Warning: Could not generate GHCID components for {institution.name} "
            f"in {city}: {e}"
        )
        return
    if not component_dict:
        print(f"Warning: No GHCID components for {institution.name} in {city}")
        return
    # Generate GHCID components
    components = generator.generate(
        institution_name=component_dict["institution_name"],
        english_name=component_dict["english_name"],
        institution_type=inst_type,
        country_code=component_dict["country_code"],
        region_code=component_dict["region_code"],
        city_locode=component_dict["city_locode"],
    )
    # Update institution with all identifier formats
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()
def analyze_collisions(
    institutions: List[HeritageCustodian],
) -> Dict[str, any]:
    """
    Compute collision statistics over a list of resolved institutions.

    Institutions sharing the same base GHCID (the identifier with any
    ``-Q`` disambiguation suffix removed) form a collision group.

    Args:
        institutions: Resolved HeritageCustodian objects.

    Returns:
        Dictionary of counters; ``collisions_by_city`` and
        ``collisions_by_type`` are defaultdict(int) breakdowns.
    """
    stats = {
        "total_institutions": len(institutions),
        "institutions_with_collisions": 0,
        "collision_groups": 0,
        "first_batch_groups": 0,
        "historical_addition_groups": 0,
        "q_numbers_added": 0,
        "wikidata_q_numbers": 0,
        "synthetic_q_numbers": 0,
        "ghcid_changes": 0,
        "collisions_by_city": defaultdict(int),
        "collisions_by_type": defaultdict(int),
        "largest_collision_group": 0,
    }

    # Bucket institutions by base GHCID (Q-number suffix stripped).
    buckets = defaultdict(list)
    for institution in institutions:
        if institution.ghcid:
            buckets[institution.ghcid.split("-Q")[0]].append(institution)

    # Walk each bucket; only buckets with 2+ members are collisions.
    for members in buckets.values():
        if len(members) < 2:
            continue
        stats["collision_groups"] += 1
        if len(members) > stats["largest_collision_group"]:
            stats["largest_collision_group"] = len(members)

        # One distinct extraction date => all members arrived in the same
        # batch; otherwise a later addition collided with existing records.
        batch_days = {m.provenance.extraction_date.date() for m in members}
        if len(batch_days) == 1:
            stats["first_batch_groups"] += 1
        else:
            stats["historical_addition_groups"] += 1

        for member in members:
            stats["institutions_with_collisions"] += 1
            # City/type breakdowns for the report.
            if member.locations and member.locations[0].city:
                stats["collisions_by_city"][member.locations[0].city] += 1
            stats["collisions_by_type"][member.institution_type] += 1
            # A "-Q" in the resolved GHCID means a Q-number was appended.
            if member.ghcid and "-Q" in member.ghcid:
                stats["q_numbers_added"] += 1
                # Wikidata-sourced Q-number vs. synthetically generated one.
                has_wikidata = any(
                    ident.identifier_scheme == "Wikidata"
                    for ident in (member.identifiers or [])
                )
                if has_wikidata:
                    stats["wikidata_q_numbers"] += 1
                else:
                    stats["synthetic_q_numbers"] += 1
            # More than one history entry => the GHCID changed at some point.
            if member.ghcid_history and len(member.ghcid_history) > 1:
                stats["ghcid_changes"] += 1

    return stats
def generate_collision_report(
    institutions: List[HeritageCustodian], stats: Dict[str, any]
) -> str:
    """
    Generate human-readable collision analysis report.

    Args:
        institutions: List of resolved HeritageCustodian objects.
        stats: Statistics dictionary from analyze_collisions().

    Returns:
        Multi-line string report.
    """
    total = stats["total_institutions"]
    colliding = stats["institutions_with_collisions"]
    # Guard the percentage against an empty dataset: the original divided
    # unconditionally and raised ZeroDivisionError when total == 0.
    collision_pct = (colliding / total * 100) if total else 0.0
    lines = [
        "=" * 80,
        "GHCID Collision Analysis Report - Dutch Heritage Institutions",
        "=" * 80,
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "OVERVIEW",
        "-" * 80,
        f"Total institutions processed: {total:,}",
        f"Institutions with collisions: {colliding:,} "
        f"({collision_pct:.1f}%)",
        f"Collision groups detected: {stats['collision_groups']:,}",
        f" - First batch collisions: {stats['first_batch_groups']:,}",
        f" - Historical additions: {stats['historical_addition_groups']:,}",
        f"Largest collision group: {stats['largest_collision_group']} institutions",
        "",
        "Q-NUMBER ASSIGNMENT",
        "-" * 80,
        f"Q-numbers added: {stats['q_numbers_added']:,}",
        f" - From Wikidata: {stats['wikidata_q_numbers']:,}",
        f" - Synthetic (generated): {stats['synthetic_q_numbers']:,}",
        f"GHCID changes tracked: {stats['ghcid_changes']:,}",
        "",
        "COLLISIONS BY CITY",
        "-" * 80,
    ]
    # Sort cities by collision count
    cities_sorted = sorted(
        stats["collisions_by_city"].items(), key=lambda x: x[1], reverse=True
    )
    for city, count in cities_sorted[:20]:  # Top 20 cities
        lines.append(f" {city:<30} {count:>4} institutions")
    lines.extend([
        "",
        "COLLISIONS BY INSTITUTION TYPE",
        "-" * 80,
    ])
    # Sort types by collision count
    types_sorted = sorted(
        stats["collisions_by_type"].items(), key=lambda x: x[1], reverse=True
    )
    for inst_type, count in types_sorted:
        lines.append(f" {inst_type:<30} {count:>4} institutions")
    lines.extend([
        "",
        "DETAILED COLLISION GROUPS",
        "-" * 80,
        "",
    ])
    # Group by base GHCID (Q-number suffix stripped) for detailed listing
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid = inst.ghcid.split("-Q")[0]
            base_ghcid_groups[base_ghcid].append(inst)
    # Show collision groups (2+ institutions per base GHCID)
    collision_groups = [
        (base_ghcid, group)
        for base_ghcid, group in base_ghcid_groups.items()
        if len(group) > 1
    ]
    collision_groups.sort(key=lambda x: len(x[1]), reverse=True)
    for i, (base_ghcid, group) in enumerate(collision_groups[:50], 1):  # Top 50 groups
        lines.append(f"{i}. Base GHCID: {base_ghcid}")
        lines.append(f" {len(group)} institutions:")
        for inst in group:
            # Show the fully-resolved GHCID when a Q-number was appended.
            # Fixed formatting: the original glued the GHCID directly onto
            # the "(city)" text with no separating space.
            q_suffix = ""
            if inst.ghcid and "-Q" in inst.ghcid:
                q_suffix = f" {inst.ghcid}"
            city = inst.locations[0].city if inst.locations else "Unknown"
            lines.append(f" - {inst.name} ({city}){q_suffix}")
        lines.append("")
    lines.append("=" * 80)
    return "\n".join(lines)
def main():
    """
    Main execution function.

    Pipeline: parse both Dutch source datasets, combine them, deduplicate,
    generate GHCIDs, resolve GHCID collisions, then export the merged
    dataset plus the collision and deduplication reports under data/.
    Exits early (with an ERROR message) if either input CSV is missing.
    """
    print("GHCID Collision Resolution - Dutch Datasets")
    print("=" * 80)
    # Initialize parsers and detector
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    detector = GHCIDCollisionDetector()
    generator = GHCIDGenerator()
    # Parse ISIL registry
    print("\n1. Parsing ISIL registry...")
    isil_csv = Path("data/ISIL-codes_2025-08-01.csv")
    if not isil_csv.exists():
        print(f"ERROR: ISIL registry not found at {isil_csv}")
        return
    isil_institutions = isil_parser.parse_and_convert(isil_csv)
    print(f" Loaded {len(isil_institutions):,} institutions from ISIL registry")
    # Parse Dutch organizations
    print("\n2. Parsing Dutch organizations CSV...")
    dutch_csv = Path(
        "data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv"
    )
    if not dutch_csv.exists():
        print(f"ERROR: Dutch organizations CSV not found at {dutch_csv}")
        return
    dutch_institutions = dutch_parser.parse_and_convert(dutch_csv)
    print(f" Loaded {len(dutch_institutions):,} institutions from Dutch orgs CSV")
    # Combine datasets
    all_institutions = isil_institutions + dutch_institutions
    print(f"\n3. Combined dataset: {len(all_institutions):,} institutions")
    # Deduplicate before GHCID generation so true duplicates never
    # register as collisions.
    print("\n4. Deduplicating institutions...")
    deduplicator = InstitutionDeduplicator()
    deduplicated_institutions = deduplicator.deduplicate(
        all_institutions,
        merge_metadata=True
    )
    duplicates_removed = len(all_institutions) - len(deduplicated_institutions)
    print(f" Removed {duplicates_removed:,} duplicates")
    print(f" {len(deduplicated_institutions):,} unique institutions remain")
    # Generate deduplication report
    dedup_report_lines = [
        "Dutch Dataset Deduplication Report",
        "=" * 80,
        f"\nGenerated: {datetime.now(timezone.utc).isoformat()}",
        f"\nTotal institutions (before deduplication): {len(all_institutions):,}",
        f"Unique institutions (after deduplication): {len(deduplicated_institutions):,}",
        f"Duplicates removed: {duplicates_removed:,}",
        f"\nDuplicate groups detected: {len(deduplicator.duplicate_groups)}",
        "\n" + "=" * 80,
        "\nDuplicate Groups:\n"
    ]
    for i, group in enumerate(deduplicator.duplicate_groups, 1):
        dedup_report_lines.append(f"\nGroup {i} ({len(group)} duplicates):")
        for inst in group:
            city = inst.locations[0].city if inst.locations else "Unknown"
            tier = inst.provenance.data_tier if inst.provenance else "Unknown"
            dedup_report_lines.append(f" - {inst.name} ({city}) [{tier}]")
    dedup_report = "\n".join(dedup_report_lines)
    # Use deduplicated set for GHCID generation
    all_institutions = deduplicated_institutions
    # Generate GHCIDs for all institutions
    print("\n5. Generating GHCIDs...")
    successful_ghcids = 0
    for inst in all_institutions:
        if not inst.ghcid:  # Only generate if not already present
            generate_ghcid_for_institution(inst, generator)
        if inst.ghcid:
            successful_ghcids += 1
    print(f" Generated GHCIDs for {successful_ghcids:,} institutions")
    # Filter institutions with GHCIDs for collision detection
    institutions_with_ghcids = [inst for inst in all_institutions if inst.ghcid]
    print(
        f" {len(institutions_with_ghcids):,} institutions have GHCIDs "
        f"({len(all_institutions) - len(institutions_with_ghcids):,} without)"
    )
    # Detect and resolve collisions
    print("\n6. Resolving collisions...")
    resolved_institutions = detector.resolve_collisions(institutions_with_ghcids)
    print(f" Resolved {len(resolved_institutions):,} institutions")
    # Analyze collisions
    print("\n7. Analyzing collision patterns...")
    stats = analyze_collisions(resolved_institutions)
    # Generate report
    print("\n8. Generating collision report...")
    report = generate_collision_report(resolved_institutions, stats)
    # Export results
    output_dir = Path("data")
    output_dir.mkdir(exist_ok=True)
    # Export merged dataset
    print("\n9. Exporting results...")
    # Convert to plain dicts for YAML serialization (pydantic/dataclass
    # objects are flattened field by field).
    institutions_dict = [
        {
            "id": inst.id,
            "name": inst.name,
            "institution_type": inst.institution_type,
            "ghcid": inst.ghcid,
            "ghcid_uuid": inst.ghcid_uuid,
            "ghcid_uuid_sha256": inst.ghcid_uuid_sha256,
            "ghcid_numeric": inst.ghcid_numeric,
            "identifiers": [
                {
                    "identifier_scheme": i.identifier_scheme,
                    "identifier_value": i.identifier_value,
                    "identifier_url": str(i.identifier_url) if i.identifier_url else None,
                }
                for i in (inst.identifiers or [])
            ],
            "locations": [
                {
                    "city": loc.city,
                    "country": loc.country,
                    "street_address": loc.street_address,
                }
                for loc in (inst.locations or [])
            ],
            "provenance": {
                "data_source": inst.provenance.data_source,
                "data_tier": inst.provenance.data_tier,
                "extraction_date": inst.provenance.extraction_date.isoformat(),
                "confidence_score": inst.provenance.confidence_score,
            },
            "ghcid_history": [
                {
                    "ghcid": entry.ghcid,
                    "ghcid_numeric": entry.ghcid_numeric,
                    "valid_from": entry.valid_from.isoformat(),
                    "valid_to": entry.valid_to.isoformat() if entry.valid_to else None,
                    "reason": entry.reason,
                }
                for entry in (inst.ghcid_history or [])
            ] if inst.ghcid_history else [],
        }
        for inst in resolved_institutions
    ]
    # All output files are written as UTF-8 explicitly: institution names
    # contain Dutch diacritics, and the platform default encoding (e.g.
    # cp1252 on Windows) could raise UnicodeEncodeError.
    yaml_path = output_dir / "dutch_institutions_with_ghcids.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(institutions_dict, f, default_flow_style=False, sort_keys=False)
    print(f" ✓ Exported dataset to {yaml_path}")
    # Export collision report
    report_path = output_dir / "dutch_collision_report.txt"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f" ✓ Exported collision report to {report_path}")
    # Export statistics
    stats_path = output_dir / "dutch_collision_stats.json"
    # Convert defaultdict to regular dict for JSON serialization
    stats_serializable = {
        k: dict(v) if isinstance(v, defaultdict) else v
        for k, v in stats.items()
    }
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats_serializable, f, indent=2, default=str)
    print(f" ✓ Exported statistics to {stats_path}")
    # Export deduplication report
    dedup_report_path = output_dir / "dutch_deduplication_report.txt"
    with open(dedup_report_path, "w", encoding="utf-8") as f:
        f.write(dedup_report)
    print(f" ✓ Exported deduplication report to {dedup_report_path}")
    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Initial institutions: {len(isil_institutions) + len(dutch_institutions):,}")
    print(f"Duplicates removed: {duplicates_removed:,}")
    print(f"Unique institutions: {stats['total_institutions']:,}")
    print(f"Collision groups: {stats['collision_groups']:,}")
    print(f"Q-numbers added: {stats['q_numbers_added']:,}")
    print(f" - Wikidata: {stats['wikidata_q_numbers']:,}")
    print(f" - Synthetic: {stats['synthetic_q_numbers']:,}")
    print("\nOutput files:")
    print(f" - {yaml_path}")
    print(f" - {report_path}")
    print(f" - {stats_path}")
    print(f" - {dedup_report_path}")
    print("=" * 80)
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()