#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.

These files have incorrect Wikidata entity ID assignments where the same
Q-number was incorrectly assigned to multiple different institutions.

The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml


# Preserve order in YAML output
class OrderedDumper(yaml.SafeDumper):
    pass


def represent_ordereddict(dumper, data):
    # NOTE(review): defined but never registered on OrderedDumper; safe_load
    # only produces builtin dicts, so this is currently unused. Kept in case
    # a caller elsewhere registers it.
    return dumper.represent_mapping('tag:yaml.org,2002:map', data.items())


def str_representer(dumper, data):
    # Emit multi-line strings as YAML block scalars (|) for readability;
    # single-line strings use the default plain style.
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


OrderedDumper.add_representer(str, str_representer)


def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove wikidata_enrichment from a file.

    Deletes the ``wikidata_enrichment`` mapping from the YAML document at
    *file_path*, appends a timestamped provenance note documenting the
    removal (and the previously assigned Q-number, if present), and writes
    the document back in place.

    Returns:
        tuple of (success: bool, message: str)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)
        if data is None:
            return False, "Empty or invalid YAML"

        # Check if wikidata_enrichment exists
        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Get the old wikidata ID for documentation
        old_wikidata_id = None
        if isinstance(data.get('wikidata_enrichment'), dict):
            old_wikidata_id = data['wikidata_enrichment'].get('wikidata_id')

        # Remove wikidata_enrichment
        del data['wikidata_enrichment']

        # Add provenance note
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += (
                f"Previous Wikidata ID {old_wikidata_id} was incorrectly "
                "assigned (duplicate across multiple institutions). "
            )
        note += "Re-enrichment required with proper matching."

        # Update or create provenance
        if 'provenance' not in data:
            data['provenance'] = {}
        if isinstance(data['provenance'], dict):
            existing_notes = data['provenance'].get('notes', '')
            # Handle case where notes is a list
            if isinstance(existing_notes, list):
                existing_notes.append(note)
                data['provenance']['notes'] = existing_notes
            elif existing_notes:
                data['provenance']['notes'] = existing_notes + '\n\n' + note
            else:
                data['provenance']['notes'] = note

        # Write back
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        # Best-effort per-file processing: report the failure to the caller
        # (main) rather than aborting the whole batch.
        return False, f"Error: {e}"


def main():
    """Clean every file listed in /tmp/wikidata_duplicates_to_clean.txt and print a summary."""
    # Read the list of files to clean
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, filename in enumerate(filenames):
        file_path = base_dir / filename
        if not file_path.exists():
            # FIX: progress messages previously printed the literal text
            # "(unknown)" instead of the file being processed.
            print(f"[{i+1}/{len(filenames)}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)
        if success:
            print(f"[{i+1}/{len(filenames)}] OK: {filename} - {message}")
            success_count += 1
        else:
            if "No wikidata_enrichment" in message:
                print(f"[{i+1}/{len(filenames)}] SKIP (no wikidata): {filename}")
                skip_count += 1
            else:
                print(f"[{i+1}/{len(filenames)}] ERROR: {filename} - {message}")
                error_count += 1

    print("\n=== Summary ===")
    print(f"Total files: {len(filenames)}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
# Run the cleanup only when executed as a script, not on import.
if __name__ == '__main__':
    main()