glam/scripts/reprocess_linkup_archives.py
2025-12-16 09:02:52 +01:00

411 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Reprocess existing Linkup archives to extract founding dates that were missed
due to incomplete date parsing patterns.
This script:
1. Finds NL custodian files without TimeSpan AND without Wikidata inception
2. Checks if they have existing Linkup search results archived
3. Re-parses those archives with improved date extraction patterns
4. Updates the custodian files with newly extracted TimeSpan data
Usage:
python scripts/reprocess_linkup_archives.py --dry-run
python scripts/reprocess_linkup_archives.py --verbose
"""
import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import yaml
# Project paths
PROJECT_ROOT = Path(__file__).parent.parent  # repo root (this script lives in scripts/)
DATA_DIR = PROJECT_ROOT / "data" / "custodian"  # custodian YAML records (NL-*.yaml)
WEB_ARCHIVE_DIR = DATA_DIR / "web"  # archived web-search results, one subdir per entry index
class PreserveQuotesDumper(yaml.SafeDumper):
    """Custom YAML dumper that preserves string formatting.

    The subclass adds no behaviour of its own; its string handling comes
    from the module-level ``str`` representer registered right below.
    """
    pass
def str_representer(dumper, data):
    """Represent multi-line strings as YAML literal blocks ('|'), plain otherwise."""
    tag = 'tag:yaml.org,2002:str'
    if '\n' not in data:
        return dumper.represent_scalar(tag, data)
    return dumper.represent_scalar(tag, data, style='|')


PreserveQuotesDumper.add_representer(str, str_representer)
# ============================================================================
# Date Extraction (improved patterns)
# ============================================================================
def parse_year_from_text(text: str) -> Optional[Tuple[int, str]]:
    """
    Extract a founding year from free text using Dutch/English founding phrases.

    Scans a fixed list of patterns such as "opgericht in 1985" or
    "founded 1985", case-insensitively, and returns the first plausible hit.

    Args:
        text: Arbitrary text, e.g. a web-search snippet.

    Returns:
        (year, context) where ``context`` names the pattern that matched,
        or None when no plausible year (1500..current year) is found.
    """
    founding_patterns = [
        (r'opgericht\s+(?:in\s+)?(\d{4})', 'opgericht'),
        (r'gesticht\s+(?:in\s+)?(\d{4})', 'gesticht'),
        (r'sinds\s+(\d{4})', 'sinds'),
        (r'founded\s+(?:in\s+)?(\d{4})', 'founded'),
        (r'established\s+(?:in\s+)?(\d{4})', 'established'),
        (r'gestart\s+(?:in\s+)?(\d{4})', 'gestart'),
        (r'begonnen\s+(?:in\s+)?(\d{4})', 'begonnen'),
        (r'ontstaan\s+(?:in\s+)?(\d{4})', 'ontstaan'),
        (r'geopend\s+(?:in\s+)?(\d{4})', 'geopend'),
        (r'opened\s+(?:in\s+)?(\d{4})', 'opened'),
        (r'opening\s+(?:in\s+)?(\d{4})', 'opening'),
        # "in YYYY opgericht" pattern
        (r'in\s+(\d{4})\s+opgericht', 'in_year_opgericht'),
        (r'in\s+(\d{4})\s+gesticht', 'in_year_gesticht'),
        (r'in\s+(\d{4})\s+geopend', 'in_year_geopend'),
    ]
    text_lower = text.lower()
    # Loop-invariant: compute the plausibility upper bound once.
    current_year = datetime.now().year
    for pattern, context in founding_patterns:
        # Fix: iterate over ALL occurrences of each pattern. With re.search,
        # one implausible year early in the text (e.g. "opgericht in 1200")
        # hid a valid match of the same pattern later in the text.
        for match in re.finditer(pattern, text_lower):
            year = int(match.group(1))
            if 1500 <= year <= current_year:
                return (year, context)
    return None
def parse_full_date_from_text(text: str) -> Optional[Tuple[str, str]]:
    """
    Extract a full calendar date (day precision) from free text.

    Tries, in order:
      1. numeric "23-11-2005" / "23/11/2005" (DD-MM-YYYY, European day-first)
      2. Dutch   "15 maart 1985"
      3. English "March 15, 1985"

    Args:
        text: Arbitrary text; month names are matched case-insensitively.

    Returns:
        (iso_timestamp, context) with iso_timestamp "YYYY-MM-DDT00:00:00Z"
        and context one of "full_date_numeric" / "full_date_nl" /
        "full_date_en"; None when no valid date is found.
    """
    dutch_months = {
        'januari': 1, 'februari': 2, 'maart': 3, 'april': 4,
        'mei': 5, 'juni': 6, 'juli': 7, 'augustus': 8,
        'september': 9, 'oktober': 10, 'november': 11, 'december': 12
    }
    english_months = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12
    }
    text_lower = text.lower()
    current_year = datetime.now().year

    def _is_valid(day: int, month: int, year: int) -> bool:
        # Fix: reject impossible calendar dates. The old "1 <= day <= 31"
        # check let e.g. "31-02-2005" through as the invalid ISO date
        # "2005-02-31T00:00:00Z"; datetime() raises ValueError for those.
        if not (1500 <= year <= current_year):
            return False
        try:
            datetime(year, month, day)
        except ValueError:
            return False
        return True

    # Numeric format: "23-11-2005" or "23/11/2005" (DD-MM-YYYY, European).
    # NOTE(review): day-first order is assumed; a US-style MM-DD-YYYY source
    # would be misread whenever both fields are <= 12 — confirm sources are NL.
    for match in re.finditer(r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})', text):
        day, month, year = int(match.group(1)), int(match.group(2)), int(match.group(3))
        if _is_valid(day, month, year):
            return (f"{year}-{month:02d}-{day:02d}T00:00:00Z", "full_date_numeric")
    # Dutch format: "15 maart 1985"
    for month_name, month_num in dutch_months.items():
        match = re.search(rf'(\d{{1,2}})\s+{month_name}\s+(\d{{4}})', text_lower)
        if match:
            day, year = int(match.group(1)), int(match.group(2))
            if _is_valid(day, month_num, year):
                return (f"{year}-{month_num:02d}-{day:02d}T00:00:00Z", "full_date_nl")
    # English format: "March 15, 1985"
    for month_name, month_num in english_months.items():
        match = re.search(rf'{month_name}\s+(\d{{1,2}}),?\s+(\d{{4}})', text_lower)
        if match:
            day, year = int(match.group(1)), int(match.group(2))
            if _is_valid(day, month_num, year):
                return (f"{year}-{month_num:02d}-{day:02d}T00:00:00Z", "full_date_en")
    return None
def extract_dates_from_archive(archive_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract founding dates from archived Linkup search results.

    Walks every entry of ``archive_data["api_response"]["sources"]``,
    records every candidate date in ``all_dates_found``, and keeps the best
    candidate as ``founding_date``: day-precision dates take priority over
    year-only dates regardless of source order; among equal precision the
    first source wins.

    Args:
        archive_data: Parsed JSON archive; each source may carry
            "content" or "snippet" text and a "url".

    Returns:
        Dict with founding_date (ISO string or None), date_precision
        ("day"/"year"/None), source_url, source_urls, context, and
        all_dates_found.
    """
    extracted = {
        "founding_date": None,
        "date_precision": None,
        "source_url": None,
        "source_urls": [],
        "context": None,
        "all_dates_found": [],
    }
    api_response = archive_data.get("api_response", {})
    sources = api_response.get("sources", [])
    for result in sources:
        content = result.get("content", "") or result.get("snippet", "") or ""
        url = result.get("url", "")
        if not content:
            continue
        extracted["source_urls"].append(url)
        # Day-precision candidate first.
        full_date = parse_full_date_from_text(content)
        if full_date:
            date_str, context = full_date
            extracted["all_dates_found"].append({
                "date": date_str,
                "precision": "day",
                "url": url,
                "context": context,
            })
            # Fix: a day-precision date now replaces an earlier year-only
            # guess instead of being recorded but ignored (the comments
            # always stated day precision is preferred). The first
            # day-precision hit is kept thereafter.
            if extracted["date_precision"] != "day":
                extracted["founding_date"] = date_str
                extracted["date_precision"] = "day"
                extracted["source_url"] = url
                extracted["context"] = context
            continue
        # Year-only fallback, only while nothing has been found yet
        # (first year-precision hit wins, matching the original behaviour).
        if not extracted["founding_date"]:
            year_result = parse_year_from_text(content)
            if year_result:
                year, context = year_result
                date_str = f"{year}-01-01T00:00:00Z"
                extracted["all_dates_found"].append({
                    "date": date_str,
                    "precision": "year",
                    "url": url,
                    "context": context,
                })
                extracted["founding_date"] = date_str
                extracted["date_precision"] = "year"
                extracted["source_url"] = url
                extracted["context"] = context
    return extracted
def create_timespan_from_extracted(extracted: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Build a CIDOC-CRM style TimeSpan dict from extracted date information.

    Day precision pins both begin bounds to the exact timestamp; year
    precision widens them to Jan 1 .. Dec 31 of that year. The end bounds
    are always emitted as None (no dissolution date is extracted here).

    Returns:
        TimeSpan dict, or None when ``extracted`` has no founding_date.
    """
    date_value = extracted.get("founding_date")
    if not date_value:
        return None

    precision = extracted.get("date_precision", "year")
    timespan: Dict[str, Any] = {}

    if precision == "day":
        timespan["begin_of_the_begin"] = date_value
        timespan["end_of_the_begin"] = date_value
    elif precision == "year":
        year_match = re.match(r'(\d{4})', date_value)
        if year_match is None:
            # Fallback: founding_date did not start with a 4-digit year.
            timespan["begin_of_the_begin"] = date_value
            timespan["end_of_the_begin"] = date_value
        else:
            yyyy = year_match.group(1)
            timespan["begin_of_the_begin"] = f"{yyyy}-01-01T00:00:00Z"
            timespan["end_of_the_begin"] = f"{yyyy}-12-31T23:59:59Z"

    timespan["begin_of_the_end"] = None
    timespan["end_of_the_end"] = None
    return timespan
# ============================================================================
# Main Processing
# ============================================================================
def find_missing_timespan_files() -> List[Path]:
    """
    Scan DATA_DIR for NL-*.yaml custodian files that still lack founding
    data: no populated TimeSpan and no Wikidata inception claim.

    Unreadable or malformed files are silently skipped (best-effort scan).
    """
    candidates: List[Path] = []
    for yaml_path in DATA_DIR.glob("NL-*.yaml"):
        try:
            with open(yaml_path, 'r', encoding='utf-8') as fh:
                record = yaml.safe_load(fh)
            if not record:
                continue
            timespan = record.get("timespan", {})
            has_timespan = bool(timespan) and bool(timespan.get("begin_of_the_begin"))
            has_inception = bool(record.get("wikidata_enrichment", {}).get("wikidata_inception"))
            if not has_timespan and not has_inception:
                candidates.append(yaml_path)
        except Exception:
            # Best effort: files that fail to read or parse are skipped.
            continue
    return candidates
def get_archive_path(entry_index: int) -> Path:
    """
    Return the Linkup archive directory for a custodian entry.

    The entry index is zero-padded to four digits,
    e.g. 42 -> WEB_ARCHIVE_DIR/0042/linkup.
    """
    padded = format(entry_index, "04d")
    return WEB_ARCHIVE_DIR / padded / "linkup"
def process_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> Tuple[bool, str]:
    """
    Process a single file: check for existing archive, extract dates, update file.

    Args:
        filepath: Custodian YAML file to enrich in place.
        dry_run: If True, report what would change without writing.
        verbose: If True, print per-step progress lines.

    Returns:
        (success, message). NOTE: main() dispatches on substrings of the
        failure message ("No linkup archive", "No archive files",
        "No founding date"), so these strings are load-bearing.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return (False, f"Error reading: {e}")
    if not data:
        return (False, "Empty file")
    # entry_index links the YAML record to its archive directory.
    entry_index = data.get("entry_index")
    if entry_index is None:
        return (False, "No entry_index")
    # Check for existing archive
    archive_dir = get_archive_path(entry_index)
    if not archive_dir.exists():
        return (False, "No linkup archive")
    # Find most recent archive file — assumes the filename timestamps sort
    # lexicographically in chronological order; TODO confirm naming scheme.
    archive_files = sorted(archive_dir.glob("linkup_founding_*.json"))
    if not archive_files:
        return (False, "No archive files found")
    latest_archive = archive_files[-1]
    if verbose:
        print(f" Using archive: {latest_archive.name}")
    # Load archive
    try:
        with open(latest_archive, 'r', encoding='utf-8') as f:
            archive_data = json.load(f)
    except Exception as e:
        return (False, f"Error loading archive: {e}")
    # Extract dates with improved patterns
    extracted = extract_dates_from_archive(archive_data)
    if not extracted.get("founding_date"):
        return (False, "No founding date found in archive")
    if verbose:
        print(f" Found date: {extracted['founding_date']} ({extracted['date_precision']}) via {extracted['context']}")
    if dry_run:
        # Stop before any mutation; caller still counts this as a success.
        return (True, f"Would add TimeSpan: {extracted['founding_date']}")
    # Create TimeSpan
    timespan = create_timespan_from_extracted(extracted)
    if not timespan:
        return (False, "Could not create TimeSpan")
    # Add sources and notes
    timespan["sources"] = [f"Linkup web search: {extracted.get('source_url', 'archived results')}"]
    if extracted.get("context"):
        timespan["notes"] = f"Found via pattern: {extracted['context']}"
    # Update data
    data["timespan"] = timespan
    # Add provenance (create the nested structure only where missing)
    if "provenance" not in data:
        data["provenance"] = {"sources": {}}
    if "sources" not in data["provenance"]:
        data["provenance"]["sources"] = {}
    data["provenance"]["sources"]["linkup_timespan_reprocessed"] = [{
        "source_type": "linkup_web_search_reprocessed",
        "reprocess_timestamp": datetime.now(timezone.utc).isoformat(),
        "original_archive": str(latest_archive.relative_to(PROJECT_ROOT)),
        # Cap the recorded provenance URLs at five.
        "source_urls": extracted.get("source_urls", [])[:5],
        "claims_extracted": ["timespan_begin"],
        "data_tier": "TIER_4_INFERRED",
    }]
    # Write back with the custom dumper so multi-line strings keep block style.
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=PreserveQuotesDumper,
                      default_flow_style=False, allow_unicode=True, sort_keys=False)
    except Exception as e:
        return (False, f"Error writing: {e}")
    return (True, f"Added TimeSpan: {extracted['founding_date']} ({extracted['date_precision']})")
def main():
    """CLI entry point: reprocess archives for every candidate file and print a summary."""
    parser = argparse.ArgumentParser(description="Reprocess Linkup archives for missing TimeSpan data")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--verbose", action="store_true", help="Show detailed progress")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of files to process")
    args = parser.parse_args()

    print("Finding NL files without TimeSpan or Wikidata inception...")
    missing_files = find_missing_timespan_files()
    print(f"Found {len(missing_files)} files to check")
    if args.limit:
        missing_files = missing_files[:args.limit]
        print(f"Processing first {args.limit} files")

    stats = {
        "processed": 0,
        "enriched": 0,
        "no_archive": 0,
        "no_date_found": 0,
        "errors": 0,
    }

    for filepath in missing_files:
        stats["processed"] += 1
        if args.verbose:
            print(f"\n[{stats['processed']}/{len(missing_files)}] {filepath.name}")

        success, message = process_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
        if not success:
            # Bucket failures by the message text process_file returned.
            if "No linkup archive" in message or "No archive files" in message:
                stats["no_archive"] += 1
            elif "No founding date" in message:
                stats["no_date_found"] += 1
            else:
                stats["errors"] += 1
            if args.verbose:
                print(f" Skip: {message}")
            continue

        stats["enriched"] += 1
        if not args.verbose:
            print(f"{filepath.name}: {message}")

    print(f"\n{'='*60}")
    print("Summary:")
    print(f" Total processed: {stats['processed']}")
    print(f" Enriched: {stats['enriched']}")
    print(f" No archive: {stats['no_archive']}")
    print(f" No date found: {stats['no_date_found']}")
    print(f" Errors: {stats['errors']}")
    if args.dry_run:
        print("\n(Dry run - no files modified)")


if __name__ == "__main__":
    main()