#!/usr/bin/env python3
"""
Enrich NDE Dutch Heritage Organizations with Wikidata Q-numbers - FULL DATASET

This script processes all 1,351 organizations, using the Wikidata MCP service
to find matching Wikidata entities. It includes:

- Batch processing with progress tracking
- Rate limiting for API compliance
- Comprehensive logging
- Automatic backup before modifications
- Recovery from interruptions

Usage:
    python3 scripts/enrich_nde_full_dataset.py [--start-index INDEX] [--batch-size SIZE]
"""
import argparse
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml


class FullDatasetEnricher:
    """Enriches all NDE organizations with Wikidata Q-numbers."""

    def __init__(self, data_path: Path, sparql_log_dir: Path,
                 start_index: int = 0, batch_size: int = 50):
        self.data_path = data_path
        self.sparql_log_dir = sparql_log_dir
        self.sparql_log_dir.mkdir(parents=True, exist_ok=True)
        self.start_index = start_index
        self.batch_size = batch_size

        # Statistics
        self.stats = {
            'total_records': 0,
            'already_enriched': 0,
            'newly_enriched': 0,
            'no_match_found': 0,
            'multiple_matches': 0,
            'queries_executed': 0,
            'errors': 0,
            'skipped': 0
        }

        # Query log
        self.query_log = []

        # Progress tracking
        self.checkpoint_file = self.sparql_log_dir / "enrichment_checkpoint.json"

    def load_checkpoint(self) -> Optional[int]:
        """Load the last checkpoint to resume after an interruption."""
        if self.checkpoint_file.exists():
            with open(self.checkpoint_file, 'r') as f:
                checkpoint = json.load(f)
            return checkpoint.get('last_processed_index', -1)
        return None

    def save_checkpoint(self, index: int, stats: Dict):
        """Save a checkpoint for recovery."""
        checkpoint = {
            'last_processed_index': index,
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'statistics': stats
        }
        with open(self.checkpoint_file, 'w') as f:
            json.dump(checkpoint, f, indent=2)
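
    # The checkpoint file written above looks roughly like this (values are
    # illustrative; 'statistics' mirrors self.stats at save time):
    #
    #   {
    #     "last_processed_index": 499,
    #     "timestamp": "2025-11-19T22:25:22+00:00",
    #     "statistics": {"total_records": 1351, "newly_enriched": 312, ...}
    #   }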

    def search_wikidata(self, org_name: str, org_type: Optional[str] = None,
                        city: Optional[str] = None) -> Optional[str]:
        """
        Search Wikidata for an organization using the MCP service.

        NOTE: This function uses PLACEHOLDER logic. In production, you would call:
        - wikidata_authenticated_search_entity(query)
        - wikidata_authenticated_execute_sparql(query)
        - wikidata_authenticated_get_metadata(entity_id)

        Args:
            org_name: Organization name
            org_type: Type of organization
            city: City name

        Returns:
            Q-number if found, None otherwise
        """
        # Build the search query
        search_query = org_name
        if city:
            search_query += f" {city}"
        if org_type and org_type != 'historische vereniging':
            search_query += f" {org_type}"

        # Log the search
        timestamp = datetime.now(timezone.utc).isoformat()
        self.query_log.append({
            'timestamp': timestamp,
            'organization': org_name,
            'search_query': search_query,
            'method': 'search_entity'
        })
        print(f"    Search: {search_query[:80]}")

        # PLACEHOLDER: In production, call the Wikidata MCP service here:
        # q_number = wikidata_authenticated_search_entity(search_query)
        # For now, return None to indicate "would search here".
        return None
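
    # A minimal sketch of the same lookup against the public Wikidata API,
    # for when the MCP service is not available. This is an illustrative
    # sketch, not the production path: it assumes the `requests` package is
    # installed, uses the public `wbsearchentities` endpoint, and the
    # User-Agent string below is a placeholder you should replace.
    def _search_entity_public_api(self, search_query: str) -> Optional[str]:
        """Sketch: resolve a search string to a Q-number via the public API."""
        import requests  # assumed dependency; not needed by the MCP flow

        resp = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                'action': 'wbsearchentities',
                'search': search_query,
                'language': 'nl',
                'type': 'item',
                'limit': 5,
                'format': 'json',
            },
            headers={'User-Agent': 'nde-enrichment-sketch/0.1 (placeholder)'},
            timeout=30,
        )
        resp.raise_for_status()
        hits = resp.json().get('search', [])
        # Only trust an unambiguous single hit; ambiguity goes to review.
        if len(hits) == 1:
            return hits[0]['id']
        if len(hits) > 1:
            self.stats['multiple_matches'] += 1
        return None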

    def search_by_municipality(self, org_name: str, city: str) -> Optional[str]:
        """
        Search for a municipality archive by city name.

        Uses a SPARQL query for precise matching.
        """
        if not city or 'gemeente' not in org_name.lower():
            return None

        # Extract the municipality name from org_name, or use the city directly
        municipality_name = city
        print(f"    SPARQL: Searching municipality '{municipality_name}'")

        # PLACEHOLDER: In production, execute a SPARQL query:
        # sparql_query = f"""
        # SELECT ?item ?itemLabel WHERE {{
        #     ?item wdt:P31 wd:Q2039348 .
        #     ?item rdfs:label "{municipality_name}"@nl .
        #     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en". }}
        # }}
        # """
        # results = wikidata_authenticated_execute_sparql(sparql_query)
        return None
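
    # A minimal sketch of SPARQL execution against the public endpoint at
    # https://query.wikidata.org/sparql, roughly what
    # wikidata_authenticated_execute_sparql might do. Illustrative only: it
    # assumes the `requests` package is available and that the placeholder
    # query above targets the intended entity class.
    def _execute_sparql_public_api(self, sparql_query: str) -> List[Dict]:
        """Sketch: run a SPARQL query and return its result bindings."""
        import requests  # assumed dependency; not needed by the MCP flow

        resp = requests.get(
            "https://query.wikidata.org/sparql",
            params={'query': sparql_query, 'format': 'json'},
            headers={'User-Agent': 'nde-enrichment-sketch/0.1 (placeholder)'},
            timeout=60,
        )
        resp.raise_for_status()
        # Each binding maps variable names (?item, ?itemLabel) to value dicts.
        return resp.json()['results']['bindings']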

    def enrich_record(self, record: Dict, record_idx: int) -> Tuple[Dict, str]:
        """
        Enrich a single record with a Wikidata Q-number.

        Args:
            record: Organization record
            record_idx: Record index

        Returns:
            Tuple of (updated_record, status_message)
        """
        org_name = record.get('organisatie', 'Unknown')
        org_type = record.get('type_organisatie', '')
        city = record.get('plaatsnaam_bezoekadres', '')

        # Skip if already enriched
        if record.get('wikidata_id'):
            self.stats['already_enriched'] += 1
            return record, f"✓ Already enriched: {record['wikidata_id']}"

        # Skip if a previous run already found no match. Unmatched records are
        # written with status 'pending_review' below, so check both values.
        if record.get('wikidata_enrichment_status') in ('no_match_found', 'pending_review'):
            self.stats['skipped'] += 1
            return record, "○ Skipped (previously marked no match)"

        print(f"\n  [{record_idx + 1}/{self.stats['total_records']}] {org_name[:60]}")
        print(f"    Type: {org_type} | City: {city}")

        # Try different search strategies
        q_number = None

        # Strategy 1: Direct search
        q_number = self.search_wikidata(org_name, org_type, city)

        # Strategy 2: Municipality search (for archives)
        if not q_number and org_type == 'archief' and 'gemeente' in org_name.lower():
            q_number = self.search_by_municipality(org_name, city)

        # Update the record based on the result
        if q_number:
            record['wikidata_id'] = q_number
            self.stats['newly_enriched'] += 1
            self.stats['queries_executed'] += 1
            return record, f"✓ Matched: {q_number}"
        else:
            # Mark as unmatched so future runs skip this record
            record['wikidata_enrichment_status'] = 'pending_review'
            self.stats['no_match_found'] += 1
            self.stats['queries_executed'] += 1
            return record, "✗ No match found"

    def run_enrichment(self):
        """Run the full dataset enrichment with batch processing."""
        print("=" * 80)
        print("NDE WIKIDATA ENRICHMENT - FULL DATASET")
        print("=" * 80)
        print()

        # Check for a checkpoint
        checkpoint_idx = self.load_checkpoint()
        if checkpoint_idx is not None and checkpoint_idx >= self.start_index:
            print(f"⚠️  Found checkpoint at index {checkpoint_idx}")
            response = input(f"   Resume from index {checkpoint_idx + 1}? (y/n): ")
            if response.lower() == 'y':
                self.start_index = checkpoint_idx + 1
                print(f"   Resuming from index {self.start_index}")

        # Load data
        print(f"\n📂 Loading data from {self.data_path.name}...")
        with open(self.data_path, 'r', encoding='utf-8') as f:
            records = yaml.safe_load(f)
        self.stats['total_records'] = len(records)
        remaining = self.stats['total_records'] - self.start_index
        print(f"   Total records: {self.stats['total_records']}")
        print(f"   Starting at index: {self.start_index}")
        print(f"   Remaining to process: {remaining}")
        print(f"   Batch size: {self.batch_size}")
        print()

        # Create a backup (only on a fresh run, before any modifications)
        if self.start_index == 0:
            backup_path = self.data_path.parent / (
                f"{self.data_path.stem}.backup."
                f"{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.yaml"
            )
            print(f"💾 Creating backup: {backup_path.name}")
            with open(backup_path, 'w', encoding='utf-8') as f:
                yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
            print()

        # Process records in batches
        start_time = time.time()
        for batch_start in range(self.start_index, len(records), self.batch_size):
            batch_end = min(batch_start + self.batch_size, len(records))
            batch_num = (batch_start // self.batch_size) + 1
            total_batches = (len(records) + self.batch_size - 1) // self.batch_size

            print(f"\n{'=' * 80}")
            print(f"BATCH {batch_num}/{total_batches}: Records {batch_start + 1} - {batch_end}")
            print(f"{'=' * 80}")
            batch_start_time = time.time()

            # Process the batch
            for idx in range(batch_start, batch_end):
                try:
                    records[idx], status = self.enrich_record(records[idx], idx)
                    print(f"  {status}")

                    # Save a checkpoint every 10 records. Flush the data first,
                    # so a resume never skips records that were enriched but
                    # not yet written to disk.
                    if (idx + 1) % 10 == 0:
                        with open(self.data_path, 'w', encoding='utf-8') as f:
                            yaml.dump(records, f, allow_unicode=True,
                                      default_flow_style=False, sort_keys=False)
                        self.save_checkpoint(idx, self.stats)

                    # Rate limiting: pause briefly between requests
                    time.sleep(0.5)  # max 2 requests per second
                except Exception as e:
                    print(f"  ✗ Error: {e}")
                    self.stats['errors'] += 1
                    continue

            # Batch complete
            batch_elapsed = time.time() - batch_start_time
            print(f"\n  Batch complete in {batch_elapsed:.1f}s")

            # Save progress after each batch
            print("  💾 Saving progress...")
            with open(self.data_path, 'w', encoding='utf-8') as f:
                yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
            self.save_checkpoint(batch_end - 1, self.stats)

            # Progress update
            progress = (batch_end / len(records)) * 100
            enriched_pct = (self.stats['newly_enriched'] / batch_end * 100) if batch_end > 0 else 0
            print(f"  Progress: {progress:.1f}% | Enriched: {self.stats['newly_enriched']} ({enriched_pct:.1f}%)")

            # Estimated time remaining
            elapsed = time.time() - start_time
            records_done = batch_end - self.start_index
            if records_done > 0:
                avg_time_per_record = elapsed / records_done
                remaining_records = len(records) - batch_end
                eta_seconds = avg_time_per_record * remaining_records
                eta_minutes = eta_seconds / 60
                print(f"  ETA: {eta_minutes:.1f} minutes")

            # Pause between batches (rate limiting)
            if batch_end < len(records):
                print("\n  ⏸  Pausing 30s between batches (rate limiting)...")
                time.sleep(30)

        # Save final results
        print(f"\n{'=' * 80}")
        print("SAVING FINAL RESULTS")
        print(f"{'=' * 80}")
        print(f"💾 Writing enriched data to {self.data_path}...")
        with open(self.data_path, 'w', encoding='utf-8') as f:
            yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        # Save the master log
        self.save_master_log()

        # Print final statistics
        total_elapsed = time.time() - start_time
        self.print_final_statistics(total_elapsed)

        # Clean up the checkpoint
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()
            print("\n✓ Checkpoint file removed")

        return records

    def save_master_log(self):
        """Save the master log of the enrichment run."""
        timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
        log_path = self.sparql_log_dir / f"enrichment_log_full_dataset_{timestamp}.json"
        master_log = {
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata MCP service - full dataset batch processing',
            'start_index': self.start_index,
            'batch_size': self.batch_size,
            'statistics': self.stats,
            'query_count': len(self.query_log),
            'queries': self.query_log[:100]  # First 100 queries only
        }
        with open(log_path, 'w', encoding='utf-8') as f:
            json.dump(master_log, f, indent=2, ensure_ascii=False)
        print(f"\n📊 Enrichment log saved: {log_path.name}")

    def print_final_statistics(self, elapsed_seconds: float):
        """Print final enrichment statistics."""
        print(f"\n{'=' * 80}")
        print("FINAL STATISTICS")
        print(f"{'=' * 80}")
        print(f"Total records:      {self.stats['total_records']}")
        print(f"Already enriched:   {self.stats['already_enriched']}")
        print(f"Newly enriched:     {self.stats['newly_enriched']}")
        print(f"No match found:     {self.stats['no_match_found']}")
        print(f"Skipped (previous): {self.stats['skipped']}")
        print(f"Errors:             {self.stats['errors']}")
        print(f"Queries executed:   {self.stats['queries_executed']}")
        print()

        total_enriched = self.stats['already_enriched'] + self.stats['newly_enriched']
        total_records = self.stats['total_records']
        enrichment_rate = (total_enriched / total_records * 100) if total_records > 0 else 0
        print(f"Total enriched: {total_enriched} ({enrichment_rate:.1f}%)")

        attempted = self.stats['newly_enriched'] + self.stats['no_match_found']
        success_rate = (self.stats['newly_enriched'] / attempted * 100) if attempted > 0 else 0
        print(f"Success rate (new): {success_rate:.1f}%")
        print()

        print(f"Time elapsed: {elapsed_seconds / 60:.1f} minutes")
        if total_records > 0:
            print(f"Average per record: {elapsed_seconds / total_records:.2f}s")
        print()


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description='Enrich NDE dataset with Wikidata Q-numbers')
    parser.add_argument('--start-index', type=int, default=10,
                        help='Start index (default: 10, skip test batch)')
    parser.add_argument('--batch-size', type=int, default=50,
                        help='Batch size (default: 50)')
    args = parser.parse_args()

    # Paths
    base_dir = Path(__file__).parent.parent
    data_path = base_dir / "data" / "nde" / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    sparql_log_dir = base_dir / "data" / "nde" / "sparql"

    # Verify the data file exists
    if not data_path.exists():
        print(f"❌ Error: Data file not found: {data_path}")
        sys.exit(1)

    # Create the enricher
    enricher = FullDatasetEnricher(
        data_path=data_path,
        sparql_log_dir=sparql_log_dir,
        start_index=args.start_index,
        batch_size=args.batch_size
    )

    # Confirm before starting (1,351 is the documented size of the full dataset)
    print(f"\n⚠️  WARNING: This will process {1351 - args.start_index} records")
    print("   This process will take approximately 2-3 hours")
    print("   A backup is created before any modifications (runs starting at index 0 only)")
    print()
    response = input("Continue? (yes/no): ")
    if response.lower() != 'yes':
        print("Aborted.")
        sys.exit(0)

    # Run the enrichment
    print("\n🚀 Starting full dataset enrichment...")
    print()
    enricher.run_enrichment()

    print(f"\n{'=' * 80}")
    print("✅ ENRICHMENT COMPLETE!")
    print(f"{'=' * 80}")
    print()
    print("Next steps:")
    print("1. Review records marked 'pending_review'")
    print("2. Run validation script to check Q-numbers")
    print("3. Generate final enrichment report")
    print()


if __name__ == "__main__":
    main()