# glam/scripts/create_custodians_from_linkedin.py
# Last modified: 2025-12-17 10:11:56 +01:00
# 759 lines, 26 KiB, Python
#!/usr/bin/env python3
"""
Create new NL-*.yaml custodian files from unmatched LinkedIn profiles.
This script:
1. Loads Dutch candidates from data/custodian/linkedin/_unmatched_analysis.json
2. Resolves city/province using GeoNames database
3. Generates GHCID identifiers following project rules
4. Creates skeleton custodian files with linkedin_enrichment
Usage:
python scripts/create_custodians_from_linkedin.py --dry-run --limit 10
python scripts/create_custodians_from_linkedin.py --limit 50
python scripts/create_custodians_from_linkedin.py # Process all 452
Key Rules Applied:
- Rule 8: Filter legal forms (Stichting, B.V., etc.) from abbreviations
- GeoNames is authoritative for settlement resolution
- admin1_code mapping to ISO 3166-2 province codes
"""
import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Project paths (repo root is one level above scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
LINKEDIN_DIR = PROJECT_ROOT / "data" / "custodian" / "linkedin"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
UNMATCHED_FILE = LINKEDIN_DIR / "_unmatched_analysis.json"
# GHCID namespace UUID for v5 generation.
# NOTE(review): this value is RFC 4122's DNS namespace (uuid.NAMESPACE_DNS),
# not the URL namespace (NAMESPACE_URL is 6ba7b811-...). Value left unchanged
# because all previously generated UUIDs must stay stable.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
# Dutch province mapping: GeoNames admin1_code -> ISO 3166-2 code.
# Codes 08 and 12-14 are absent — presumably unused/retired NL admin1 codes
# in GeoNames; TODO confirm against the GeoNames admin1 table.
ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland (note: also used for other provinces in some GeoNames data)
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}
# Province name to ISO code (for when LinkedIn gives a province name as city).
# Keys must stay lowercase: lookups are done against city.lower().
PROVINCE_NAME_TO_ISO = {
    "drenthe": "DR",
    "friesland": "FR",
    "fryslân": "FR",  # Frisian-language name of the province
    "gelderland": "GE",
    "groningen": "GR",
    "limburg": "LI",
    "noord-brabant": "NB",
    "brabant": "NB",  # common informal short form
    "noord-holland": "NH",
    "utrecht": "UT",
    "zeeland": "ZE",
    "zuid-holland": "ZH",
    "overijssel": "OV",
    "flevoland": "FL",
}
# Dutch legal form words to skip in abbreviation (Rule 8).
# Both dotted and undotted spellings are listed; the dotted forms can only
# match tokens whose punctuation has not yet been stripped.
LEGAL_FORM_WORDS = {
    # Dutch
    "stichting", "coöperatie", "cooperatie", "maatschap",
    "bv", "b.v.", "nv", "n.v.", "vof", "v.o.f.", "cv", "c.v.",
    # English
    "foundation", "trust", "inc", "incorporated", "ltd", "limited",
    "llc", "corp", "corporation",
}
# Dutch (plus a few English) prepositions/articles to skip in abbreviations.
# NOTE: "of" appears twice (Dutch "of" = or, English "of"); the set literal
# silently dedupes the repeat.
SKIP_WORDS = {
    "de", "het", "een", "van", "voor", "in", "op", "te", "den", "der",
    "des", "'s", "aan", "bij", "met", "naar", "om", "tot", "uit",
    "over", "onder", "door", "en", "of", "the", "a", "an", "of", "and",
}
# Institution type inference patterns (regexes applied to lowercased text).
# Patterns are matched against NAME (high priority) and INDUSTRY (lower
# priority) by infer_institution_type().
TYPE_PATTERNS = {
    "M": [  # Museum
        r"\bmuseum\b", r"\bmusea\b", r"\bkunsthal\b", r"\bkunsthuis\b", r"\bgalerie\b",
        r"\btentoonstelling\b", r"\bexpositie\b", r"\bcollectie\b",
    ],
    "A": [  # Archive
        r"\barchief\b", r"\barchieven\b", r"\barchive\b", r"\bdocumentatie\b",
        r"\berfgoedcentrum\b", r"historisch\s+centrum",
    ],
    "L": [  # Library
        r"\bbibliotheek\b", r"\bbibliotheken\b", r"\blibrary\b", r"\bmediatheek\b",
    ],
    "S": [  # Society/Kring
        r"\bvereniging\b", r"\bgenootschap\b", r"\bkring\b", r"\bbond\b", r"stichting.*erfgoed",
        r"\bheemkunde\b", r"\boudheidkunde\b", r"historische.*vereniging",
    ],
    "R": [  # Research
        r"\bonderzoek\b", r"\bresearch\b", r"\binstituut\b", r"\bkenniscentrum\b",
    ],
    "E": [  # Education
        r"\buniversiteit\b", r"\bhogeschool\b", r"\bacademie\b", r"\bschool\b",
    ],
    "B": [  # Botanical/Zoo
        r"\bdierentuin\b", r"\bzoo\b", r"\bbotanische\b", r"\barboretum\b", r"\bhortus\b",
    ],
}
def normalize_text(text: str) -> str:
"""Normalize unicode text, remove diacritics."""
normalized = unicodedata.normalize("NFD", text)
ascii_text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
return ascii_text.lower()
def generate_city_code(city_name: str) -> str:
"""Generate 3-letter city code from city name.
Rules:
- Single word: first 3 letters → Amsterdam = AMS
- Dutch article (de, het, den, 's): article initial + 2 from main → Den Haag = DHA
- Multi-word: initials (up to 3) → Nieuw Amsterdam = NAM
"""
if not city_name:
return "XXX"
# Normalize
clean = normalize_text(city_name)
words = clean.split()
if not words:
return "XXX"
# Single word
if len(words) == 1:
return words[0][:3].upper()
# Check for Dutch articles at start
dutch_articles = {"de", "het", "den", "'s", "s"}
if words[0] in dutch_articles:
# Article initial + 2 from next word
if len(words) > 1:
article_initial = words[0][0] if words[0] != "'s" else "S"
return (article_initial + words[1][:2]).upper()
# Multi-word: take initials
initials = "".join(w[0] for w in words if w not in dutch_articles)
return initials[:3].upper()
def extract_abbreviation_from_name(name: str) -> str:
    """Extract an abbreviation from an institution name.

    Rules (per AGENTS.md):
    - Use first letter of each significant word
    - Skip prepositions, articles, conjunctions (SKIP_WORDS)
    - Skip legal form words such as Stichting, B.V. (Rule 8)
    - Remove diacritics, uppercase, max 10 chars

    Fix over the original: dotted legal forms ("b.v.", "n.v.", ...) are now
    filtered *before* punctuation is stripped. Previously the global
    punctuation strip turned "b.v." into the tokens "b" and "v", which never
    matched LEGAL_FORM_WORDS and leaked into the abbreviation.
    """
    if not name:
        return "UNK"
    # Normalize (lowercased, diacritics removed).
    clean = normalize_text(name)
    # Drop legal-form tokens while their punctuation is intact so dotted
    # entries in LEGAL_FORM_WORDS can actually match.
    tokens = [t for t in clean.split() if t not in LEGAL_FORM_WORDS]
    # Now strip remaining punctuation; hyphens etc. become word breaks.
    words = re.sub(r"[^\w\s]", " ", " ".join(tokens)).split()
    # Keep significant words: not a skip word, not an (undotted) legal form,
    # not a pure number. normalize_text already lowercased everything.
    significant_words = [
        w for w in words
        if w not in SKIP_WORDS and w not in LEGAL_FORM_WORDS and not w.isdigit()
    ]
    if not significant_words:
        # Fallback: first 3 characters of the raw name.
        return name[:3].upper()
    # First letter of each significant word, capped at 10 characters.
    return "".join(w[0] for w in significant_words)[:10].upper()
def infer_institution_type(name: str, industry: str) -> list[str]:
    """Infer institution type codes from name and industry.

    Name-based pattern hits take absolute priority: when the name matches any
    TYPE_PATTERNS entry, the industry text is ignored entirely. Otherwise the
    industry is matched against the same patterns plus a few keyword
    heuristics. Returns a sorted list of type codes, or ["U"] (unknown).
    """
    name_text = (name or "").lower()
    industry_text = (industry or "").lower()

    def _matching_types(text: str) -> set[str]:
        # Every type whose pattern list has at least one hit in *text*.
        return {
            code
            for code, patterns in TYPE_PATTERNS.items()
            if any(re.search(pattern, text) for pattern in patterns)
        }

    # High priority: the institution name itself.
    from_name = _matching_types(name_text)
    if from_name:
        return sorted(from_name)

    # Lower priority: the industry field.
    from_industry = _matching_types(industry_text)
    # Plain keyword heuristics on the industry string.
    if "museum" in industry_text or "historical site" in industry_text:
        from_industry.add("M")
    if "librar" in industry_text:
        from_industry.add("L")
    if "archiv" in industry_text:
        from_industry.add("A")
    if from_industry:
        return sorted(from_industry)

    # Nothing matched: default to Unknown.
    return ["U"]
def lookup_city_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
"""Look up city in GeoNames database.
Returns dict with geonames_id, name, admin1_code, admin1_name, etc.
or None if not found.
"""
if not city_name:
return None
# First try exact match
cursor = conn.execute(
"""
SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
latitude, longitude, feature_code, population
FROM cities
WHERE country_code = 'NL'
AND (name = ? OR ascii_name = ?)
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1
""",
(city_name, city_name),
)
row = cursor.fetchone()
if row:
return {
"geonames_id": row[0],
"name": row[1],
"ascii_name": row[2],
"admin1_code": row[3],
"admin1_name": row[4],
"latitude": row[5],
"longitude": row[6],
"feature_code": row[7],
"population": row[8],
}
# Try case-insensitive match
cursor = conn.execute(
"""
SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
latitude, longitude, feature_code, population
FROM cities
WHERE country_code = 'NL'
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1
""",
(city_name, city_name),
)
row = cursor.fetchone()
if row:
return {
"geonames_id": row[0],
"name": row[1],
"ascii_name": row[2],
"admin1_code": row[3],
"admin1_name": row[4],
"latitude": row[5],
"longitude": row[6],
"feature_code": row[7],
"population": row[8],
}
return None
def infer_city_from_name(institution_name: str, conn: sqlite3.Connection) -> dict | None:
    """Try to infer a city from an institution name (e.g. 'Museum Spakenburg' -> Spakenburg).

    Strips common institution-type words and skip words from the name, then
    tries each remaining token against GeoNames in order, returning the
    first hit or None.

    Fixes over the original: removes an unused local, avoids repeated
    .lower() calls, and skips tokens that are pure punctuation instead of
    issuing empty-string lookups.
    """
    # Words that describe the institution rather than its place.
    type_words = {"museum", "archief", "bibliotheek", "galerie", "kunsthal", "stichting"}
    potential_cities = []
    for token in institution_name.split():
        token_clean = re.sub(r"[^\w]", "", token)
        lowered = token_clean.lower()
        # Skip empty residue and non-place words.
        if not token_clean or lowered in type_words or lowered in SKIP_WORDS:
            continue
        potential_cities.append(token_clean)
    # First GeoNames hit wins.
    for city_candidate in potential_cities:
        result = lookup_city_geonames(city_candidate, conn)
        if result:
            return result
    return None
def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Derive the three GHCID identifiers from the canonical GHCID string.

    Returns a dict with:
    - ghcid_uuid:        UUID v5 (SHA-1, GHCID_NAMESPACE) — primary
    - ghcid_uuid_sha256: custom UUID v8 built from the SHA-256 digest — secondary
    - ghcid_numeric:     64-bit int from the first 8 digest bytes (big-endian)
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    # Build the v8 UUID from the first 16 digest bytes, then stamp the
    # version and variant fields per RFC 4122.
    raw = bytearray(digest[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8 (custom)
    raw[8] = (raw[8] & 0x3F) | 0x80  # variant bits -> RFC 4122
    return {
        "ghcid_uuid": str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string)),
        "ghcid_uuid_sha256": str(uuid.UUID(bytes=bytes(raw))),
        "ghcid_numeric": int.from_bytes(digest[:8], byteorder="big"),
    }
def load_yaml(filepath: Path) -> dict:
    """Read *filepath* as YAML; an empty document yields {}."""
    with open(filepath, "r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    return document or {}
def save_yaml(filepath: Path, data: dict) -> None:
    """Write *data* to *filepath* as readable YAML (insertion order kept)."""
    dump_options = {
        "default_flow_style": False,
        "allow_unicode": True,
        "sort_keys": False,
        "width": 120,
    }
    with open(filepath, "w", encoding="utf-8") as handle:
        yaml.dump(data, handle, **dump_options)
def check_ghcid_collision(ghcid: str) -> bool:
    """Return True when a custodian file for *ghcid* already exists."""
    candidate_path = CUSTODIAN_DIR / f"{ghcid}.yaml"
    return candidate_path.exists()
def resolve_location(candidate: dict, conn: sqlite3.Connection) -> dict:
    """Resolve location for a candidate.

    Resolution order:
    1. If the LinkedIn "city" field is actually a province name, record that
       province and try to infer the real city from the institution name.
       (No early return on failure: names like "Utrecht" or "Groningen" are
       both a province and a city, so the direct lookup below may still hit.)
    2. Direct GeoNames lookup on the city field.
    3. Last resort: infer the city from the institution name.

    Returns dict with:
    - province_code: ISO 3166-2 code (e.g., "NH"), "XX" when unknown
    - city_code: 3-letter city code (e.g., "AMS"), "XXX" when unknown
    - city_name: Full city name or None
    - geonames_info: GeoNames lookup result or None
    - resolution_method: How the location was resolved

    Fixes over the original: a literal None in the "city" field no longer
    crashes (.get's default is not used when the key holds None), and an
    unmapped admin1 code no longer clobbers an already-resolved province
    with "XX".
    """
    city = (candidate.get("city") or "").strip()
    name = candidate.get("name", "")
    result = {
        "province_code": "XX",
        "city_code": "XXX",
        "city_name": None,
        "geonames_info": None,
        "resolution_method": "UNRESOLVED",
    }

    def _apply_geonames(geo: dict, method: str) -> None:
        # Fill result from a GeoNames hit; an unmapped admin1 code keeps the
        # current province value (province-derived or the "XX" default).
        admin1 = geo.get("admin1_code", "")
        result["province_code"] = ADMIN1_TO_ISO.get(admin1, result["province_code"])
        result["city_name"] = geo["name"]
        result["city_code"] = generate_city_code(geo["name"])
        result["geonames_info"] = geo
        result["resolution_method"] = method

    # Step 1: the "city" field may actually hold a province name.
    city_lower = city.lower()
    if city_lower in PROVINCE_NAME_TO_ISO:
        result["province_code"] = PROVINCE_NAME_TO_ISO[city_lower]
        result["resolution_method"] = "PROVINCE_FROM_CITY_FIELD"
        geonames = infer_city_from_name(name, conn)
        if geonames:
            # The inferred city's actual province beats LinkedIn's.
            _apply_geonames(geonames, "CITY_INFERRED_FROM_NAME")
            return result

    # Step 2: direct GeoNames lookup on the city field.
    if city:
        geonames = lookup_city_geonames(city, conn)
        if geonames:
            _apply_geonames(geonames, "GEONAMES_LOOKUP")
            return result

    # Step 3: infer the city from the institution name.
    geonames = infer_city_from_name(name, conn)
    if geonames:
        _apply_geonames(geonames, "CITY_INFERRED_FROM_NAME")
    return result
def create_custodian_from_linkedin(
    candidate: dict,
    linkedin_data: dict,
    location_info: dict,
    institution_types: list[str],
) -> tuple[str, dict]:
    """Create a custodian YAML structure from LinkedIn data.

    Args:
        candidate: Entry from the unmatched-analysis list (name, slug, city, ...).
        linkedin_data: Full profile loaded from data/custodian/linkedin/<slug>.yaml.
        location_info: Output of resolve_location() for this candidate.
        institution_types: Inferred type codes; the first one becomes the
            primary type used in the GHCID string.

    Returns tuple of (ghcid, data_dict).
    """
    name = candidate.get("name", "Unknown")
    slug = candidate.get("slug", "")
    # Generate GHCID components
    province = location_info["province_code"]
    city = location_info["city_code"]
    primary_type = institution_types[0] if institution_types else "U"
    abbrev = extract_abbreviation_from_name(name)
    # Build GHCID string: NL-<province>-<city>-<type>-<abbreviation>
    ghcid_string = f"NL-{province}-{city}-{primary_type}-{abbrev}"
    # Handle collisions by adding name suffix
    if check_ghcid_collision(ghcid_string):
        # Add snake_case name suffix derived from the full institution name.
        # NOTE(review): the suffixed GHCID is not re-checked for collisions;
        # main() skips writing when the resulting file already exists, so a
        # second collision is skipped rather than overwritten.
        name_suffix = normalize_text(name).replace(" ", "_")
        name_suffix = re.sub(r"[^a-z0-9_]", "", name_suffix)
        name_suffix = re.sub(r"_+", "_", name_suffix).strip("_")
        ghcid_string = f"{ghcid_string}-{name_suffix}"
    # Generate UUIDs
    uuids = generate_ghcid_uuids(ghcid_string)
    # One timestamp reused everywhere so all provenance fields agree.
    timestamp = datetime.now(timezone.utc).isoformat()
    # Build custodian data structure
    data = {
        "custodian_name": {
            "emic_name": name,
            "emic_name_source": "linkedin",
        },
        "institution_type": institution_types,
        "linkedin_enrichment": {
            "linkedin_url": linkedin_data.get("linkedin_url"),
            "linkedin_slug": slug,
            "industry": linkedin_data.get("industry"),
            "website": linkedin_data.get("website"),
            "follower_count": linkedin_data.get("follower_count"),
            "staff_count": linkedin_data.get("staff_count"),
            "heritage_staff_count": linkedin_data.get("heritage_staff_count"),
            "heritage_staff": linkedin_data.get("heritage_staff", []),
            "enrichment_timestamp": timestamp,
            "provenance": {
                "source": "linkedin_company_scrape",
                "original_file": f"data/custodian/linkedin/{slug}.yaml",
                "schema_version": linkedin_data.get("provenance", {}).get("schema_version", "1.0.0"),
            },
        },
        "location": {
            # Prefer the GeoNames-resolved name; fall back to LinkedIn's raw city.
            "city": location_info.get("city_name") or candidate.get("city"),
            "region": location_info["province_code"],
            "country": "NL",
        },
        "ghcid": {
            "ghcid_current": ghcid_string,
            "ghcid_original": ghcid_string,
            "ghcid_uuid": uuids["ghcid_uuid"],
            "ghcid_uuid_sha256": uuids["ghcid_uuid_sha256"],
            "ghcid_numeric": uuids["ghcid_numeric"],
            "record_id": str(uuid.uuid4()),  # UUID v4 for database record ID
            "generation_timestamp": timestamp,
            # History starts with a single open-ended entry (valid_to=None).
            "ghcid_history": [
                {
                    "ghcid": ghcid_string,
                    "ghcid_numeric": uuids["ghcid_numeric"],
                    "valid_from": timestamp,
                    "valid_to": None,
                    "reason": "Initial GHCID assignment from LinkedIn batch import",
                }
            ],
            "location_resolution": {
                "method": location_info["resolution_method"],
                "city_code": location_info["city_code"],
                "region_code": location_info["province_code"],
                "country_code": "NL",
            },
        },
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": timestamp,
            "sources": {
                "linkedin": [
                    {
                        "source_type": "linkedin_company_profile",
                        "data_tier": "TIER_4_INFERRED",
                        "source_file": f"data/custodian/linkedin/{slug}.yaml",
                        "extraction_timestamp": timestamp,
                        "claims_extracted": [
                            "name",
                            "industry",
                            "location",
                            "website",
                            "staff_count",
                            "heritage_staff",
                        ],
                    }
                ],
            },
            "data_tier_summary": {
                "TIER_4_INFERRED": ["linkedin_company_profile"],
            },
            "notes": [
                "Created from unmatched LinkedIn company profile",
                f"Location resolution method: {location_info['resolution_method']}",
            ],
        },
    }
    # Add GeoNames info if available
    if location_info.get("geonames_info"):
        geo = location_info["geonames_info"]
        data["ghcid"]["location_resolution"]["geonames_id"] = geo.get("geonames_id")
        data["ghcid"]["location_resolution"]["geonames_name"] = geo.get("name")
        data["ghcid"]["location_resolution"]["feature_code"] = geo.get("feature_code")
        data["ghcid"]["location_resolution"]["admin1_code"] = geo.get("admin1_code")
        # Truthiness check drops coordinates at exactly 0.0 — harmless here,
        # since 0,0 is far outside the Netherlands.
        if geo.get("latitude") and geo.get("longitude"):
            data["location"]["coordinates"] = {
                "latitude": geo["latitude"],
                "longitude": geo["longitude"],
                "source": "geonames",
            }
    return ghcid_string, data
def main():
    """CLI entry point: create custodian files for unmatched LinkedIn profiles.

    Returns 0 on success; exits with status 1 when the GeoNames database is
    missing. (Previously only the dry-run path returned 0 explicitly; the
    normal path fell off the end and returned None.)
    """
    parser = argparse.ArgumentParser(
        description="Create NL-*.yaml custodian files from unmatched LinkedIn profiles"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be created without writing files",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of candidates to process",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=0,
        help="Start from this index in the candidate list",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed output for each candidate",
    )
    args = parser.parse_args()

    # Load unmatched analysis produced by the matching pipeline.
    print(f"Loading unmatched analysis from {UNMATCHED_FILE}...")
    with open(UNMATCHED_FILE, "r") as f:
        analysis = json.load(f)
    candidates = analysis.get("dutch_list", [])
    print(f" Found {len(candidates)} Dutch candidates")

    # Apply offset and limit.
    if args.offset:
        candidates = candidates[args.offset:]
        print(f" Starting from index {args.offset}")
    # `is not None` so an explicit `--limit 0` is honored instead of being
    # silently ignored (0 is falsy).
    if args.limit is not None:
        candidates = candidates[: args.limit]
        print(f" Processing {len(candidates)} candidates (limit={args.limit})")

    # Connect to the GeoNames database (authoritative for settlements).
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)
    conn = sqlite3.connect(GEONAMES_DB)
    print("Connected to GeoNames database")

    # Statistics accumulated over the run.
    stats = {
        "processed": 0,
        "created": 0,
        "skipped_no_linkedin": 0,
        "skipped_collision": 0,
        "location_resolved": 0,
        "location_unresolved": 0,
        "resolution_methods": {},
    }
    created_files = []
    try:
        for candidate in candidates:
            slug = candidate.get("slug", "")
            name = candidate.get("name", "Unknown")
            stats["processed"] += 1

            # Load full LinkedIn data for the slug.
            linkedin_file = LINKEDIN_DIR / f"{slug}.yaml"
            if not linkedin_file.exists():
                if args.verbose:
                    print(f" SKIP: No LinkedIn file for {slug}")
                stats["skipped_no_linkedin"] += 1
                continue
            linkedin_data = load_yaml(linkedin_file)

            # Resolve location and track how it was resolved.
            location_info = resolve_location(candidate, conn)
            method = location_info["resolution_method"]
            stats["resolution_methods"][method] = stats["resolution_methods"].get(method, 0) + 1
            if method != "UNRESOLVED":
                stats["location_resolved"] += 1
            else:
                stats["location_unresolved"] += 1

            # Name-based inference beats LinkedIn's pre-assigned types, which
            # are often wrong (e.g. "Libraries" industry -> L type for museums).
            industry = candidate.get("industry", "") or linkedin_data.get("industry", "")
            institution_types = infer_institution_type(name, industry)
            # Only fall back to LinkedIn's types when inference returned Unknown.
            if institution_types == ["U"] and linkedin_data.get("institution_type"):
                institution_types = linkedin_data["institution_type"]

            # Create custodian data.
            ghcid, data = create_custodian_from_linkedin(
                candidate, linkedin_data, location_info, institution_types
            )

            # Collision already handled in the create function; double-check
            # the actual output path before writing.
            output_file = CUSTODIAN_DIR / f"{ghcid}.yaml"
            if output_file.exists():
                if args.verbose:
                    print(f" COLLISION: {ghcid} already exists")
                stats["skipped_collision"] += 1
                continue

            if args.dry_run:
                print(f" [DRY-RUN] Would create: {output_file.name}")
                print(f" Name: {name}")
                print(f" Type: {institution_types}")
                print(f" Location: {location_info['city_name']} ({location_info['province_code']})")
                print(f" Resolution: {method}")
                if args.verbose:
                    print(f" GHCID: {ghcid}")
                    print(f" UUID: {data['ghcid']['ghcid_uuid']}")
            else:
                save_yaml(output_file, data)
                print(f" Created: {output_file.name} ({name})")
                stats["created"] += 1
                created_files.append({"ghcid": ghcid, "name": name, "file": str(output_file.name)})
    finally:
        # Close the DB connection even when a candidate raises mid-loop
        # (the original leaked the connection on any exception).
        conn.close()

    # Print summary.
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Created: {stats['created']}")
    print(f"Skipped (no file): {stats['skipped_no_linkedin']}")
    print(f"Skipped (collision): {stats['skipped_collision']}")
    print(f"Location resolved: {stats['location_resolved']}")
    print(f"Location unresolved: {stats['location_unresolved']}")
    print("\nResolution methods:")
    for method, count in sorted(stats["resolution_methods"].items()):
        print(f" {method}: {count}")
    if args.dry_run:
        print("\n[DRY-RUN] No files were created.")
    return 0
# Script entry point — propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())