836 lines
29 KiB
Python
836 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract timeline events from timeline event sources data and update custodian YAML files.
|
|
|
|
This script extracts historical events with FULL PROVENANCE from source archives files
|
|
and stores them in custodian YAML files following the CustodianTimelineEvent schema.
|
|
|
|
Schema: schemas/20251121/linkml/modules/classes/CustodianTimelineEvent.yaml
|
|
|
|
Event types extracted (OrganizationalChangeEventTypeEnum):
|
|
- FOUNDING: opgericht, gesticht, ontstaan
|
|
- MERGER: fusie, fuseerde, samengevoegd, ging op in
|
|
- DISSOLUTION: opgeheven, gesloten
|
|
- RENAMING: hernoemd, nieuwe naam
|
|
- TRANSFER: verhuisd, verplaatst, gevestigd (physical move)
|
|
- EXPANSION: uitgebreid, verbouwd, nieuwbouw, gemoderniseerd
|
|
- SPLIT: opgesplitst
|
|
- SPIN_OFF: afgesplitst
|
|
- REDUCTION: ingekrompen
|
|
- REORGANIZATION: herstructurering
|
|
|
|
EXCLUDED (not in enum):
|
|
- reopening: not a recognized event type
|
|
- predecessor: relationship, not event
|
|
- friends_org: separate organization
|
|
|
|
Output structure in custodian YAML:
|
|
timeline_enrichment:
|
|
timeline_events:
|
|
- event_type: FOUNDING
|
|
event_date: "2005-04-30"
|
|
date_precision: day
|
|
approximate: false
|
|
description: "..."
|
|
source_urls: [...]
|
|
linkup_query: "..."
|
|
linkup_answer: "..."
|
|
fetch_timestamp: "2025-12-15T16:04:38Z"
|
|
archive_path: web/0002/linkup/linkup_founding_20251215T160438Z.json
|
|
extraction_method: linkup_answer_regex
|
|
extraction_timestamp: "2025-12-16T10:00:00Z"
|
|
data_tier: TIER_4_INFERRED
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# =============================================================================
# CONFIGURATION
# =============================================================================

# Minimum year for institutional events (filters out historical references)
# Dates whose leading 4-digit year is earlier than this are rejected by
# is_valid_year() during extraction.
MIN_YEAR = 1800

# Dutch month names to numbers (zero-padded strings, ready for ISO-8601 dates)
DUTCH_MONTHS: dict[str, str] = {
    'januari': '01', 'februari': '02', 'maart': '03', 'april': '04',
    'mei': '05', 'juni': '06', 'juli': '07', 'augustus': '08',
    'september': '09', 'oktober': '10', 'november': '11', 'december': '12'
}

# =============================================================================
# EVENT TYPE MAPPING (script patterns → OrganizationalChangeEventTypeEnum)
# =============================================================================

# Maps internal event type names to enum values
# (internal names are the keys of EVENT_KEYWORDS below)
EVENT_TYPE_MAP: dict[str, str] = {
    'founding': 'FOUNDING',
    'merger': 'MERGER',
    'dissolution': 'DISSOLUTION',
    'name_change': 'RENAMING',
    'relocation': 'TRANSFER',  # Physical move maps to TRANSFER
    'expansion': 'EXPANSION',
    'split': 'SPLIT',
    'spin_off': 'SPIN_OFF',
    'reduction': 'REDUCTION',
    'reorganization': 'REORGANIZATION',
}

# Event type keywords (Dutch) - only types that map to enum
# Each value is a list of regex patterns matched against LOWERCASED text,
# so the patterns themselves are written in lowercase.
EVENT_KEYWORDS: dict[str, list[str]] = {
    'founding': [
        r'opgericht\s+(?:op|in)',
        r'gesticht\s+(?:op|in)',
        r'werd\s+opgericht',
        r'is\s+opgericht',
        r'ontstaan\s+in',
        r'opgericht\s+op\s+\d',
    ],
    'merger': [
        r'fusie\s+(?:van|tussen|met)',
        r'fuseerde\s+met',
        r'samengevoegd\s+met',
        r'ging\s+(?:ook\s+)?(?:hier\s+)?in\s+op',
        r'ging\s+op\s+in',
        r'voortgekomen\s+uit\s+een\s+fusie',
        r'ontstaan\s+uit\s+een\s+fusie',
    ],
    'relocation': [
        r'verhuisd\s+naar',
        r'verplaatst\s+naar',
        r'nieuwe\s+locatie',
        r'betrok\s+(?:een\s+)?nieuw\s+pand',
        r'gevestigd\s+(?:aan|in|op)',
    ],
    'expansion': [
        r'uitgebreid\s+(?:en|in)',
        r'verbouwd\s+in',
        r'nieuwbouw\s+in',
        r'gemoderniseerd',
        r'werd\s+(?:in\s+\d{4}\s+)?uitgebreid',
    ],
    'name_change': [
        r'hernoemd\s+(?:naar|tot)',
        r'nieuwe\s+naam',
        r'naam\s+gewijzigd',
        r'naam\s+veranderd',
    ],
    'dissolution': [
        r'opgeheven\s+in',
        r'gesloten\s+in',
        r'opgegaan\s+in',
        r'beëindigd\s+in',
    ],
    'split': [
        r'opgesplitst\s+in',
        r'verdeeld\s+in',
    ],
    'spin_off': [
        r'afgesplitst\s+(?:van|uit)',
        r'verzelfstandigd',
    ],
    'reduction': [
        r'ingekrompen',
        r'afgebouwd',
    ],
    'reorganization': [
        r'herstructurering',
        r'gereorganiseerd',
        r'reorganisatie',
    ],
}

# Date extraction patterns with precision detection
# Each tuple is (regex, date_type consumed by parse_dutch_date, precision label).
DATE_PATTERNS: list[tuple[str, str, str]] = [
    # Full date: "30 april 2005" or "op 30 april 2005" → day precision
    (r'(?:op\s+)?(\d{1,2})\s+(' + '|'.join(DUTCH_MONTHS.keys()) + r')\s+(\d{4})', 'full', 'day'),
    # Full date: "30-4-2005" or "30/4/2005" → day precision
    (r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})', 'numeric', 'day'),
    # Year with context: "in 1854", "sinds 2005", "vanaf 2006" → year precision
    (r'(?:in|sinds|vanaf|anno|per)\s+(\d{4})', 'year', 'year'),
    # Year in parentheses: "(2000)" → year precision
    (r'\((\d{4})\)', 'year', 'year'),
    # Approximate: "circa 1900", "rond 2000", "ongeveer 1980" → year precision, approximate=True
    (r'(?:circa|rond|ongeveer)\s+(\d{4})', 'circa', 'year'),
    # Year only after "werd" or "is": "werd in 1980" → year precision
    (r'werd\s+in\s+(\d{4})', 'year', 'year'),
    # Decade reference: "begin jaren '90", "eind jaren '80" → decade precision
    (r"(?:begin|eind|midden)\s+jaren\s+'?(\d{2})", 'decade', 'decade'),
]
|
|
|
|
# =============================================================================
|
|
# DATE PARSING
|
|
# =============================================================================
|
|
|
|
|
|
def parse_dutch_date(match: tuple, date_type: str) -> tuple[str, bool]:
    """
    Parse a Dutch date regex match into an ISO-8601 string.

    Args:
        match: The regex capture groups, shape depending on date_type:
            'full'    -> (day, month_name, year)
            'numeric' -> (day, month, year)
            'year', 'circa', 'decade' -> (value,)
        date_type: One of 'full', 'numeric', 'year', 'circa', 'decade'.

    Returns:
        Tuple of (iso_date_string, is_approximate). The date string is
        empty when the type is unknown or the calendar values are out of
        range (e.g. month 99), so callers can discard the match.
    """
    if date_type == 'full':
        day, month_name, year = match
        month = DUTCH_MONTHS.get(month_name.lower(), '01')
        # Reject impossible day-of-month values (regex allows 0-99).
        if not 1 <= int(day) <= 31:
            return "", False
        return f"{year}-{month}-{int(day):02d}", False
    elif date_type == 'numeric':
        day, month, year = match
        # Validate calendar ranges: "99-99-2005" would otherwise produce
        # "2005-99-99", which passes the year-only is_valid_year() check
        # and would be stored as a bogus event date.
        if not (1 <= int(month) <= 12 and 1 <= int(day) <= 31):
            return "", False
        return f"{year}-{int(month):02d}-{int(day):02d}", False
    elif date_type == 'year':
        year = match[0] if isinstance(match, tuple) else match
        return f"{year}", False
    elif date_type == 'circa':
        year = match[0] if isinstance(match, tuple) else match
        return f"{year}", True
    elif date_type == 'decade':
        # "jaren '90" → "1990"; two-digit decades above 20 are assumed to
        # be 20th century, otherwise 21st (so '20 → 2020).
        decade = match[0] if isinstance(match, tuple) else match
        century = '19' if int(decade) > 20 else '20'
        return f"{century}{decade}", True  # Decades are approximate
    return "", False
|
|
|
|
|
|
def is_valid_year(date_str: str) -> bool:
    """Return True when the leading 4-digit year is at or after MIN_YEAR.

    Accepts any ISO-style string whose first four characters are the year;
    strings that do not start with a parseable integer count as invalid.
    """
    try:
        return int(date_str[:4]) >= MIN_YEAR
    except (ValueError, IndexError):
        return False
|
|
|
|
|
|
def detect_date_precision(iso_date: str, date_type: str) -> str:
    """
    Classify how precise an extracted date is.

    Args:
        iso_date: The ISO-formatted date string produced by parse_dutch_date.
        date_type: The extraction type ('full', 'numeric', 'year', 'circa',
            'decade') that produced it.

    Returns:
        One of 'day', 'month', 'year', or 'decade'.
    """
    if date_type == 'decade':
        return 'decade'

    if date_type in ('full', 'numeric'):
        hyphens = iso_date.count('-')
        if len(iso_date) == 10 and hyphens == 2:
            return 'day'     # YYYY-MM-DD
        if len(iso_date) == 7 and hyphens == 1:
            return 'month'   # YYYY-MM

    # Everything else (including 'year' and 'circa') is year precision.
    return 'year'
|
|
|
|
|
|
# =============================================================================
|
|
# EVENT EXTRACTION
|
|
# =============================================================================
|
|
|
|
|
|
def find_closest_date(
    text: str,
    event_match_pos: int,
    search_window: int = 150
) -> tuple[str, str, bool, int] | None:
    """
    Locate the date mention nearest to an event keyword.

    Scans a window of ``search_window`` characters on either side of the
    keyword position and, across all DATE_PATTERNS, keeps the valid date
    whose match starts closest to the keyword.

    Args:
        text: Full text to search.
        event_match_pos: Character offset of the event keyword match.
        search_window: Characters to scan before/after the keyword.

    Returns:
        (iso_date, date_type, is_approximate, distance) for the nearest
        valid date, or None if no parseable date >= MIN_YEAR was found.
    """
    lo = max(0, event_match_pos - search_window)
    hi = min(len(text), event_match_pos + search_window)
    window = text.lower()[lo:hi]
    anchor = event_match_pos - lo  # keyword position relative to the window

    winner: tuple[str, str, bool, int] | None = None
    closest = float('inf')

    for pattern, dtype, _precision in DATE_PATTERNS:
        for hit in re.finditer(pattern, window):
            gap = abs(hit.start() - anchor)
            if gap >= closest:
                continue
            iso, approx = parse_dutch_date(hit.groups(), dtype)
            # Discard unparseable dates and pre-MIN_YEAR historical references;
            # an invalid closer date does not block a valid farther one.
            if iso and is_valid_year(iso):
                winner = (iso, dtype, approx, gap)
                closest = gap

    return winner
|
|
|
|
|
|
def extract_events_from_structured_text(text: str) -> list[dict]:
    """
    Extract events from structured text (bullet points, numbered lists).

    Handles patterns like:
    - Stadsarchief Deventer opgericht in 1838
    - Samengevoegd met Athenaeumbibliotheek in 1999

    Each line is matched independently, so the date found on a line is
    tightly coupled to the event keyword on that same line.

    Returns:
        List of event dicts with date correctly associated per line
    """
    events = []

    # Split on newlines and bullet markers
    # NOTE(review): without re.MULTILINE the '^' alternative only matches the
    # start of the whole string; mid-text bullets are caught by the '\s' branch.
    lines = re.split(r'\n|(?:^|\s)[-•*]\s+', text)

    for line in lines:
        line = line.strip()
        if not line or len(line) < 10:
            # Skip blanks and fragments too short to describe an event.
            continue

        line_lower = line.lower()

        # For each event type, check if this line mentions it
        for event_type, patterns in EVENT_KEYWORDS.items():
            for pattern in patterns:
                match = re.search(pattern, line_lower)
                if match:
                    # Store match position for use in lambda (avoids type checker issue)
                    match_pos = match.start()

                    # Find date within THIS line only (tight coupling)
                    for date_pattern, date_type, _ in DATE_PATTERNS:
                        date_matches = list(re.finditer(date_pattern, line_lower))
                        if date_matches:
                            # Prefer date closest to the event keyword
                            best_match = min(
                                date_matches,
                                key=lambda m: abs(m.start() - match_pos)
                            )
                            iso_date, is_approx = parse_dutch_date(
                                best_match.groups(), date_type
                            )

                            if iso_date and is_valid_year(iso_date):
                                date_precision = detect_date_precision(iso_date, date_type)

                                # Clean description from original line
                                # (original casing, whitespace collapsed, capped at 200 chars)
                                description = line.strip()
                                description = re.sub(r'\s+', ' ', description)
                                if len(description) > 200:
                                    description = description[:200] + '...'

                                events.append({
                                    'internal_type': event_type,
                                    'date': iso_date,
                                    'date_precision': date_precision,
                                    'description': description,
                                    'approximate': is_approx,
                                })
                                break  # Found date for this event
                    # Only the pattern loop is broken here: a single line may
                    # still yield one event per matching event TYPE.
                    break  # Found event type for this line

    return events
|
|
|
|
|
|
def extract_events_from_text(
    text: str,
    source_url: str | None = None
) -> list[dict]:
    """
    Extract historical events with dates from free text.

    Two-phase strategy:
      1. Structured text (bullet points / lists) is handled first, where the
         date-event association is unambiguous per line.
      2. Prose is then scanned, pairing each event keyword with its nearest
         date via proximity matching.

    Args:
        text: Text to search for events.
        source_url: Optional URL recorded on each prose-derived event.

    Returns:
        List of event dictionaries keyed by internal event type names.
    """
    events: list[dict] = []

    # Phase 1: structured text — clear per-line date/event coupling.
    if '\n' in text or re.search(r'[-•*]\s+', text):
        events.extend(extract_events_from_structured_text(text))

    # Phase 2: prose, with proximity-based date matching.
    lowered = text.lower()

    for kind, patterns in EVENT_KEYWORDS.items():
        for pattern in patterns:
            for hit in re.finditer(pattern, lowered):
                matched_text = lowered[hit.start():hit.end()]

                # Skip keyword hits already captured (same type, keyword text
                # contained in an existing event's description).
                duplicate = any(
                    prior['internal_type'] == kind
                    and matched_text in prior.get('description', '').lower()
                    for prior in events
                )
                if duplicate:
                    continue

                found = find_closest_date(text, hit.start(), search_window=150)
                if found is None:
                    continue
                iso_date, dtype, approx, _gap = found

                # Build the description from the ORIGINAL-case text around
                # the keyword: up to 150 chars starting at the keyword.
                ctx_lo = max(0, hit.start() - 50)
                ctx_hi = min(len(text), hit.end() + 150)
                snippet = text[ctx_lo:ctx_hi]
                rel_start = hit.start() - ctx_lo
                rel_end = min(rel_start + 150, len(snippet))
                desc = snippet[rel_start:rel_end].strip()
                desc = re.sub(r'\s+', ' ', desc)
                desc = desc.split('.')[0]  # First sentence
                if len(desc) > 200:
                    desc = desc[:200] + '...'

                record = {
                    'internal_type': kind,
                    'date': iso_date,
                    'date_precision': detect_date_precision(iso_date, dtype),
                    'description': desc,
                    'approximate': approx,
                }
                if source_url:
                    record['source_url'] = source_url

                events.append(record)

    return events
|
|
|
|
|
|
def deduplicate_events(events: list[dict]) -> list[dict]:
    """
    Drop events repeating an already-seen (date, type, approximate) triple.

    Keeps the first occurrence of each triple; input order is otherwise
    preserved.
    """
    kept: list[dict] = []
    seen: set[tuple] = set()

    for candidate in events:
        fingerprint = (
            candidate.get('date'),
            candidate.get('internal_type'),
            candidate.get('approximate', False),
        )
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        kept.append(candidate)

    return kept
|
|
|
|
|
|
# =============================================================================
|
|
# LINKUP JSON PARSING
|
|
# =============================================================================
|
|
|
|
|
|
def extract_institution_name_from_query(query: str) -> str | None:
|
|
"""
|
|
Extract institution name from linkup query string.
|
|
|
|
The query format is typically: "Institution Name" city opgericht OR gesticht...
|
|
"""
|
|
# Try to extract quoted name first
|
|
match = re.search(r'"([^"]+)"', query)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def is_source_relevant(source_name: str, source_url: str, institution_name: str | None) -> bool:
|
|
"""
|
|
Check if a source is relevant to the target institution.
|
|
"""
|
|
if not institution_name:
|
|
return True
|
|
|
|
inst_lower = institution_name.lower()
|
|
key_words = [w for w in inst_lower.split() if len(w) > 3]
|
|
|
|
source_lower = source_name.lower()
|
|
url_lower = source_url.lower()
|
|
|
|
for word in key_words:
|
|
if word in source_lower or word in url_lower:
|
|
return True
|
|
|
|
if 'wikipedia' in url_lower:
|
|
for word in key_words:
|
|
if word in source_lower:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def parse_linkup_json(json_path: Path, include_sources: bool = False) -> dict:
    """
    Parse one timeline source archive file into events plus provenance.

    Args:
        json_path: Path to the linkup JSON file.
        include_sources: When True, also mine the per-source snippets
            (noisier than the main answer text).

    Returns:
        Dict with an 'events' list and a 'provenance' metadata dict; both
        are empty when the file cannot be read or parsed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except (json.JSONDecodeError, FileNotFoundError) as exc:
        print(f" Warning: Could not parse {json_path}: {exc}")
        return {'events': [], 'provenance': {}}

    response = payload.get('api_response', {})
    query = payload.get('query', '')
    fetched_at = payload.get('fetch_timestamp', '')
    institution = extract_institution_name_from_query(query)

    # The answer text is the most reliable extraction target.
    answer = response.get('answer', '')
    events: list[dict] = extract_events_from_text(answer) if answer else []

    # Collect relevant source URLs for provenance.
    sources = response.get('sources', [])
    relevant_urls = []
    for src in sources:
        candidate_url = src.get('url', '')
        if candidate_url and is_source_relevant(src.get('name', ''), candidate_url, institution):
            relevant_urls.append(candidate_url)

    # Optionally mine the snippets as well (higher noise than the answer).
    if include_sources:
        for src in sources:
            snippet = src.get('snippet', '')
            snippet_url = src.get('url', '')
            if snippet and is_source_relevant(src.get('name', ''), snippet_url, institution):
                events.extend(extract_events_from_text(snippet, snippet_url))

    events = deduplicate_events(events)

    # Archive path relative to the 'web' root:
    #   data/custodian/web/NNNN/linkup/x.json → web/NNNN/linkup/x.json
    parts = json_path.parts
    if 'web' in parts:
        archive_path = '/'.join(parts[parts.index('web'):])
    else:
        archive_path = str(json_path)

    return {
        'events': events,
        'provenance': {
            'linkup_query': query,
            'linkup_answer': answer,
            'fetch_timestamp': fetched_at,
            'archive_path': archive_path,
            'source_urls': relevant_urls[:5],  # Limit to top 5 sources
        },
    }
|
|
|
|
|
|
# =============================================================================
|
|
# YAML UPDATE
|
|
# =============================================================================
|
|
|
|
|
|
def load_mapping(mapping_path: Path) -> dict[int, str]:
    """
    Load the entry-number → GHCID mapping file.

    Each non-empty line is expected to be '<entry_number> <ghcid>'.
    Malformed lines (missing field, non-numeric entry) are skipped, and
    only the FIRST occurrence of an entry number is kept.

    Args:
        mapping_path: Path to the whitespace-separated mapping file.

    Returns:
        Dict mapping entry number to GHCID string.
    """
    mapping: dict[int, str] = {}
    with open(mapping_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            fields = raw.split(' ', 1)
            if len(fields) != 2:
                continue
            try:
                number = int(fields[0])
            except ValueError:
                continue  # non-numeric entry column: skip the line
            # setdefault keeps the first occurrence, like the original check.
            mapping.setdefault(number, fields[1])
    return mapping
|
|
|
|
|
|
def convert_to_linkup_timeline_event(
    event: dict,
    provenance: dict,
    extraction_timestamp: str
) -> dict | None:
    """
    Map an internal event dict onto the CustodianTimelineEvent schema.

    Args:
        event: Internal event dict ('internal_type', 'date', etc.).
        provenance: Provenance metadata from parse_linkup_json.
        extraction_timestamp: ISO timestamp of this extraction run.

    Returns:
        A CustodianTimelineEvent-shaped dict, or None when the internal
        type has no OrganizationalChangeEventTypeEnum mapping (such
        events are deliberately dropped).
    """
    enum_value = EVENT_TYPE_MAP.get(event.get('internal_type', ''))
    if not enum_value:
        return None  # Skip events that don't map to enum

    record = {
        'event_type': enum_value,
        'event_date': event.get('date'),
        'date_precision': event.get('date_precision', 'year'),
        'approximate': event.get('approximate', False),
        'description': event.get('description', ''),
    }
    # Provenance and extraction metadata (TIER_4: regex-inferred data).
    record.update({
        'source_urls': provenance.get('source_urls', []),
        'linkup_query': provenance.get('linkup_query', ''),
        'linkup_answer': provenance.get('linkup_answer', ''),
        'fetch_timestamp': provenance.get('fetch_timestamp', ''),
        'archive_path': provenance.get('archive_path', ''),
        'extraction_method': 'linkup_answer_regex',
        'extraction_timestamp': extraction_timestamp,
        'data_tier': 'TIER_4_INFERRED',
    })
    return record
|
|
|
|
|
|
def update_yaml_timeline_enrichment(
    yaml_path: Path,
    events: list[dict],
    provenance: dict,
    extraction_timestamp: str,
    dry_run: bool = False
) -> bool:
    """
    Update a custodian YAML file with CustodianTimelineEvent records.

    Writes to: timeline_enrichment.timeline_events (NOT timespan.events)

    New events are deduplicated against existing ones by their
    (event_date, event_type) pair; the list is kept sorted by date.

    Args:
        yaml_path: Path to custodian YAML file
        events: List of internal event dicts
        provenance: Provenance dict from parse_linkup_json
        extraction_timestamp: ISO timestamp for extraction
        dry_run: If True, don't write changes

    Returns:
        True if file was updated (or would be, under dry_run)
    """
    if not yaml_path.exists():
        print(f" Warning: YAML file not found: {yaml_path}")
        return False

    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f" Warning: Could not parse YAML {yaml_path}: {e}")
        return False

    # An empty YAML file parses to None; treat it as an empty document.
    if data is None:
        data = {}

    # Initialize timeline_enrichment if not exists
    if 'timeline_enrichment' not in data:
        data['timeline_enrichment'] = {}

    timeline_enrichment = data['timeline_enrichment']

    # Initialize timeline_events array if not exists
    if 'timeline_events' not in timeline_enrichment:
        timeline_enrichment['timeline_events'] = []

    timeline_events = timeline_enrichment['timeline_events']

    # Get existing event keys to avoid duplicates
    # (an event already present if its date+type pair matches)
    existing_keys = {
        (e.get('event_date'), e.get('event_type'))
        for e in timeline_events
    }

    # Convert and add new events
    new_count = 0
    for event in events:
        linkup_event = convert_to_linkup_timeline_event(event, provenance, extraction_timestamp)

        if linkup_event is None:
            continue  # Skip events that don't map to enum

        key = (linkup_event.get('event_date'), linkup_event.get('event_type'))
        if key not in existing_keys:
            timeline_events.append(linkup_event)
            existing_keys.add(key)
            new_count += 1

    # Sort events by date (missing/None dates sort first via '' fallback)
    timeline_events.sort(key=lambda e: e.get('event_date') or '')

    # Only rewrite the file when something changed and this is not a dry
    # run; a dry run still returns True so callers can count would-be updates.
    if new_count > 0 and not dry_run:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return new_count > 0
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
|
|
def main():
    """Main entry point.

    Loads the entry→GHCID mapping, then for each selected entry parses its
    linkup JSON archives, extracts and deduplicates events, and writes them
    into the matching custodian YAML file.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Extract timeline events from source archives with full provenance'
    )
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser.add_argument('--entry', type=int, help='Process specific entry number')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--include-sources', action='store_true',
                        help='Also extract from source snippets (higher noise)')
    args = parser.parse_args()

    # Paths
    # NOTE(review): hardcoded machine-specific base path — consider a
    # --base-path argument or environment variable for portability.
    base_path = Path('/Users/kempersc/apps/glam')
    mapping_path = base_path / 'data/custodian/web/_entry_to_ghcid.txt'
    web_path = base_path / 'data/custodian/web'
    custodian_path = base_path / 'data/custodian'

    # Extraction timestamp (when this script runs); shared by every event
    # written during this invocation.
    extraction_timestamp = datetime.now(timezone.utc).isoformat()

    # Load mapping
    print("Loading entry-to-GHCID mapping...")
    mapping = load_mapping(mapping_path)
    print(f" Loaded {len(mapping)} mappings")

    # Process entries
    processed = 0      # entries examined (that had linkup data)
    updated = 0        # YAML files actually changed
    total_events = 0   # events added across all updated files

    # --entry restricts to one entry number; otherwise all, in sorted order.
    entries_to_process = [args.entry] if args.entry else sorted(mapping.keys())
    if args.limit:
        entries_to_process = entries_to_process[:args.limit]

    print(f"\nProcessing {len(entries_to_process)} entries...")
    print(f"Extraction timestamp: {extraction_timestamp}")

    for entry_num in entries_to_process:
        ghcid = mapping.get(entry_num)
        if not ghcid:
            continue

        # Find linkup JSON files
        entry_dir = web_path / f"{entry_num:04d}" / 'linkup'
        if not entry_dir.exists():
            if args.verbose:
                print(f" Skipping entry {entry_num}: no linkup directory")
            continue

        # Find all linkup JSON files (founding, merger, etc.)
        json_files = list(entry_dir.glob('linkup_*.json'))
        if not json_files:
            if args.verbose:
                print(f" Skipping entry {entry_num}: no linkup JSON files")
            continue

        # Process all JSON files for this entry
        all_events = []
        combined_provenance = {
            'linkup_query': '',
            'linkup_answer': '',
            'fetch_timestamp': '',
            'archive_path': '',
            'source_urls': [],
        }

        for json_file in json_files:
            result = parse_linkup_json(json_file, include_sources=args.include_sources)
            events = result['events']
            provenance = result['provenance']

            # Use provenance from first file with data
            if provenance.get('linkup_answer') and not combined_provenance['linkup_answer']:
                combined_provenance = provenance

            # For multiple JSON files, keep the events but note they may have different provenance
            for event in events:
                event['_archive_path'] = provenance.get('archive_path', '')

            all_events.extend(events)

        all_events = deduplicate_events(all_events)

        if not all_events:
            if args.verbose:
                print(f" Entry {entry_num} ({ghcid}): no events extracted")
            processed += 1
            continue

        # Update YAML file
        yaml_file = custodian_path / f"{ghcid}.yaml"

        if args.verbose or args.dry_run:
            print(f"\n Entry {entry_num} ({ghcid}):")
            for event in all_events:
                internal_type = event.get('internal_type', '')
                enum_type = EVENT_TYPE_MAP.get(internal_type, 'UNKNOWN')
                approx = " ~" if event.get('approximate') else ""
                prec = event.get('date_precision', 'year')
                print(f" - {event['date']}{approx} [{prec}] {enum_type}: {event['description'][:50]}...")

        if update_yaml_timeline_enrichment(
            yaml_file, all_events, combined_provenance, extraction_timestamp, dry_run=args.dry_run
        ):
            updated += 1
            total_events += len(all_events)
            if not args.verbose:
                print(f" Updated {ghcid}: +{len(all_events)} events")

        processed += 1

    # Summary
    print(f"\n{'=' * 60}")
    print(f"Summary:")
    print(f" Entries processed: {processed}")
    print(f" YAML files updated: {updated}")
    print(f" Total events added: {total_events}")
    print(f" Output location: timeline_enrichment.timeline_events")
    print(f" Schema: CustodianTimelineEvent (TIER_4_INFERRED)")
    if args.dry_run:
        print(" (DRY RUN - no files modified)")


if __name__ == '__main__':
    main()
|