# glam/crosslink_dutch_datasets.py

#!/usr/bin/env python3
"""
Cross-link and merge ISIL registry with Dutch organizations dataset.
Demonstrates TIER_1 data source merging using ISIL codes as primary key.
"""

from pathlib import Path
from typing import Dict, List, Optional
from glam_extractor.parsers.isil_registry import ISILRegistryParser
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.models import HeritageCustodian, DigitalPlatform, Identifier
from dataclasses import dataclass

@dataclass
class MergedRecord:
    """
    Represents a merged institution record from multiple sources.

    One instance is produced per ISIL code by ``merge_custodians``; the two
    boolean flags record which of the input datasets contributed a custodian.
    """
    # Base record of the merge: the ISIL registry custodian when one was
    # supplied, otherwise the organizations-dataset custodian.
    custodian: HeritageCustodian
    # ISIL code under which the two sources were matched.
    isil_code: str
    # True when a custodian from the ISIL registry was supplied.
    in_registry: bool
    # True when a custodian from the organizations dataset was supplied.
    in_orgs: bool
    # Digital platforms taken from the organizations-dataset custodian;
    # empty when only the registry record exists.
    platforms: List[DigitalPlatform]
    # Human-readable notes on how the record was assembled, including any
    # name or city differences detected between the two sources.
    enrichment_notes: List[str]


def merge_custodians(
    isil_custodian: Optional[HeritageCustodian],
    orgs_custodian: Optional[HeritageCustodian],
    isil_code: str
) -> MergedRecord:
    """
    Merge the registry and organizations views of a single institution.

    The ISIL registry custodian is preferred as the base record because it is
    the authoritative source; the organizations custodian only contributes its
    digital platforms. When both are present, differences in name and in the
    city of the first location are recorded as enrichment notes.

    Args:
        isil_custodian: Custodian parsed from the ISIL registry, or ``None``
            if the code does not occur there.
        orgs_custodian: Custodian parsed from the organizations dataset, or
            ``None`` if the code does not occur there.
        isil_code: ISIL code the two records were matched on.

    Returns:
        A ``MergedRecord`` holding the base custodian, the source flags, the
        platform list and the enrichment notes.

    Raises:
        ValueError: If both ``isil_custodian`` and ``orgs_custodian`` are
            ``None``.
    """
    # Presence is decided with explicit None checks: a custodian object that
    # happens to evaluate as falsy must still count as "provided".
    in_registry = isil_custodian is not None
    in_orgs = orgs_custodian is not None

    if not (in_registry or in_orgs):
        raise ValueError("At least one custodian must be provided")

    enrichment_notes: List[str] = []

    # Platforms only ever come from the organizations dataset. Copy the list
    # so that later edits to the merged record cannot mutate the source
    # custodian's own platform list.
    platforms: List[DigitalPlatform] = (
        list(orgs_custodian.digital_platforms or []) if in_orgs else []
    )

    if in_registry and in_orgs:
        base = isil_custodian
        enrichment_notes.append("Merged ISIL registry + organizations data")
        enrichment_notes.append(f"Added {len(platforms)} digital platforms from orgs dataset")

        # Check for name differences
        if isil_custodian.name != orgs_custodian.name:
            enrichment_notes.append(
                f"Name variation: Registry='{isil_custodian.name}' vs "
                f"Orgs='{orgs_custodian.name}'"
            )

        # Check for location differences (only the first location of each
        # record is compared, and only when both records have one)
        if isil_custodian.locations and orgs_custodian.locations:
            registry_city = isil_custodian.locations[0].city
            orgs_city = orgs_custodian.locations[0].city
            if registry_city != orgs_city:
                enrichment_notes.append(
                    f"Location difference: Registry={registry_city} vs "
                    f"Orgs={orgs_city}"
                )
    elif in_registry:
        base = isil_custodian
        enrichment_notes.append("Only in ISIL registry (no org data available)")
    else:
        base = orgs_custodian
        enrichment_notes.append("Only in organizations dataset (not in ISIL registry)")

    return MergedRecord(
        custodian=base,
        isil_code=isil_code,
        in_registry=in_registry,
        in_orgs=in_orgs,
        platforms=platforms,
        enrichment_notes=enrichment_notes
    )


def _index_by_isil(custodians: List["HeritageCustodian"]) -> Dict[str, "HeritageCustodian"]:
    """Map the value of each custodian's first ISIL identifier to that custodian."""
    index: Dict[str, "HeritageCustodian"] = {}
    for custodian in custodians:
        for identifier in custodian.identifiers:
            if identifier.identifier_scheme == "ISIL":
                # Only the first ISIL identifier is used; if two custodians
                # share a code, the later one replaces the earlier one.
                index[identifier.identifier_value] = custodian
                break
    return index


def _percentage(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


def main():
    """
    Cross-link the ISIL registry with the Dutch organizations dataset and
    print a report to stdout.

    Steps: parse both CSV files, index each dataset by ISIL code, merge the
    records for every code found in either index, then print merge
    statistics, up to ten enrichment examples, up to five name conflicts,
    organizations lacking an ISIL code, and a closing summary.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    print("=" * 70)
    print("DUTCH HERITAGE DATASETS CROSS-LINKING")
    print("=" * 70)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()

    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)

    print(f"✅ Loaded {len(isil_custodians)} ISIL records")
    print(f"✅ Loaded {len(dutch_custodians)} organization records")
    print()

    # Build lookup dictionaries by ISIL code
    print("🔗 Building ISIL code indexes...")

    isil_by_code = _index_by_isil(isil_custodians)
    orgs_by_code = _index_by_isil(dutch_custodians)

    print(f"✅ Indexed {len(isil_by_code)} ISIL codes from registry")
    print(f"✅ Indexed {len(orgs_by_code)} ISIL codes from organizations")
    print()

    # Merge records
    print("🔄 Cross-linking records by ISIL code...")
    all_isil_codes = set(isil_by_code) | set(orgs_by_code)

    # Sorted so the report is deterministic from run to run.
    merged_records: List[MergedRecord] = [
        merge_custodians(isil_by_code.get(code), orgs_by_code.get(code), code)
        for code in sorted(all_isil_codes)
    ]

    print(f"✅ Created {len(merged_records)} merged records")
    print()

    # Statistics
    print("📊 Merge Statistics:")
    print("-" * 70)

    total = len(merged_records)
    both = sum(1 for r in merged_records if r.in_registry and r.in_orgs)
    only_registry = sum(1 for r in merged_records if r.in_registry and not r.in_orgs)
    only_orgs = sum(1 for r in merged_records if not r.in_registry and r.in_orgs)

    # Percentages go through _percentage so that an empty merge result
    # (no ISIL identifiers in either dataset) reports 0.0% instead of
    # raising ZeroDivisionError.
    print(f"In both datasets:         {both:4d} ({_percentage(both, total):.1f}%)")
    print(f"Only in ISIL registry:    {only_registry:4d}")
    print(f"Only in organizations:    {only_orgs:4d}")
    print()

    records_with_platforms = [r for r in merged_records if r.platforms]
    with_platforms = len(records_with_platforms)
    print(f"Records with platforms:   {with_platforms:4d} ({_percentage(with_platforms, total):.1f}%)")
    print()

    # Show enrichment examples
    print("✨ Enrichment Examples (First 10 with platforms):")
    print("-" * 70)

    for position, record in enumerate(records_with_platforms[:10], 1):
        print(f"{position}. {record.custodian.name}")
        print(f"   ISIL: {record.isil_code}")
        # A record with platforms always has org data, so the only two
        # possible states here are "both sources" and "orgs only".
        print(f"   Status: {'Registry+Orgs' if record.in_registry and record.in_orgs else 'Orgs only'}")

        if record.custodian.locations:
            print(f"   Location: {record.custodian.locations[0].city}")

        print(f"   Platforms: {len(record.platforms)}")
        for platform in record.platforms[:3]:  # Show up to 3 platforms
            print(f"     - {platform.platform_type}: {platform.platform_name}")

        if record.enrichment_notes:
            print(f"   Notes: {record.enrichment_notes[0]}")
        print()

    # Show conflict examples (name mismatches)
    print("⚠️  Name Conflict Examples:")
    print("-" * 70)

    conflicts = [r for r in merged_records
                 if any("Name variation" in note for note in r.enrichment_notes)]

    for i, record in enumerate(conflicts[:5], 1):
        print(f"{i}. {record.isil_code}")
        # Every record in `conflicts` has at least one such note.
        note = next(n for n in record.enrichment_notes if "Name variation" in n)
        print(f"   {note}")
        print()

    if len(conflicts) > 5:
        print(f"   ... and {len(conflicts) - 5} more name conflicts")
        print()

    # Organizations without ISIL codes (candidates for assignment)
    orgs_without_isil = [
        c for c in dutch_custodians
        if not any(i.identifier_scheme == "ISIL" for i in c.identifiers)
    ]

    print("💡 ISIL Assignment Candidates:")
    print("-" * 70)
    print(f"Organizations without ISIL codes: {len(orgs_without_isil)}")
    print()
    print("Sample candidates (first 10):")
    for i, custodian in enumerate(orgs_without_isil[:10], 1):
        location = custodian.locations[0].city if custodian.locations else "Unknown"
        print(f"{i:2d}. {custodian.name}")
        print(f"    Type: {custodian.institution_type}, Location: {location}")
    print()

    # Summary recommendations
    print("🎯 Cross-linking Summary:")
    print("-" * 70)
    print(f"✅ Successfully linked {both} institutions via ISIL codes")
    print(f"✅ Enriched {with_platforms} records with digital platform data")
    print(f"⚠️  Found {len(conflicts)} name conflicts requiring review")
    print(f"💡 Identified {len(orgs_without_isil)} candidates for ISIL code assignment")
    print()

    print("Next steps:")
    print("1. Export merged records to JSON-LD/RDF for SPARQL queries")
    print("2. Review and resolve name conflicts")
    print("3. Geocode all locations for spatial analysis")
    print("4. Create provenance graph showing data lineage")
    print("5. Submit ISIL code applications for qualified organizations")
    print()

    print("✅ Cross-linking complete!")


if __name__ == "__main__":
    main()