#!/usr/bin/env python3
"""
Revert incorrectly enriched NL-XX-XXX files.

These files were incorrectly updated with wrong location data from LinkedIn HTML
extraction. The script was extracting the FIRST headquarter JSON found in HTML,
which often belonged to a different company (sidebar, related companies, etc.).

This script:
1. Finds NL-XX-XXX files where ghcid_current doesn't contain XX-XXX
2. Restores ghcid_current to match ghcid_original
3. Resets location to unknown state
4. Removes incorrect location_resolution block
"""

import os
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone

# Directory containing the custodian YAML files that this script scans and rewrites.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

def find_mismatched_files():
    """Find NL-XX-XXX files where ghcid_current was incorrectly updated.

    Scans ``CUSTODIAN_DIR`` for files named ``NL-XX-XXX-*.yaml`` and selects
    those whose ``ghcid.ghcid_current`` no longer contains ``XX-XXX`` even
    though the file name still does.

    Files that cannot be read or parsed, or whose structure is not the expected
    mapping, are reported on stdout and skipped; they never abort the scan.

    Returns:
        A list of dicts, one per mismatched file, with the keys ``'file'``
        (the path), ``'ghcid_current'``, ``'ghcid_original'`` and
        ``'content'`` (the parsed YAML document). Entries are ordered by path
        so that repeated runs list and process files in the same order.
    """
    mismatched = []

    # glob() yields paths in filesystem order; sort for reproducible output.
    for f in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        # Keep the try body limited to loading the file so that genuine
        # read/parse problems are not confused with structural surprises.
        try:
            with open(f, 'r', encoding='utf-8') as file:
                content = yaml.safe_load(file)
        except Exception as e:
            # Deliberately broad: this is the per-file boundary of a
            # best-effort batch scan, and one bad file must not stop the rest.
            print(f"Error reading {f}: {e}")
            continue

        if not content:
            continue

        if not isinstance(content, dict):
            print(f"Skipping {f}: top-level YAML value is not a mapping")
            continue

        ghcid = content.get('ghcid')
        if ghcid is None:
            # No ghcid block at all (or an explicit null): nothing to compare.
            continue
        if not isinstance(ghcid, dict):
            print(f"Skipping {f}: 'ghcid' is not a mapping")
            continue

        ghcid_current = ghcid.get('ghcid_current', '')
        ghcid_original = ghcid.get('ghcid_original', '')

        if not ghcid_current:
            continue
        if not isinstance(ghcid_current, str):
            print(f"Skipping {f}: 'ghcid_current' is not a string")
            continue

        # If ghcid_current doesn't contain XX-XXX, it was incorrectly updated
        if 'XX-XXX' not in ghcid_current:
            mismatched.append({
                'file': f,
                'ghcid_current': ghcid_current,
                'ghcid_original': ghcid_original,
                'content': content,
            })

    return mismatched

def revert_file(file_info):
    """Revert one custodian file to its original XX-XXX state and rewrite it.

    The parsed document is modified in place and then written back to the
    same path:

    * ``ghcid.ghcid_current`` is set to ``ghcid_original``;
    * ``ghcid.location_resolution`` is removed if present;
    * ``ghcid.ghcid_history``, if present, is replaced by a single entry
      describing the reversion;
    * ``location``, if present, is reset to an unknown city/region in NL;
    * a note describing the reversion is appended to ``provenance.notes``.

    Args:
        file_info: Mapping with the keys ``'file'`` (path of the YAML file),
            ``'content'`` (the parsed YAML document) and ``'ghcid_original'``
            (the identifier to restore).

    Returns:
        True once the file has been rewritten.

    Raises:
        ValueError: If the document has a ``ghcid`` block but
            ``ghcid_original`` is empty; restoring it would blank out
            ``ghcid_current``. Nothing is modified or written in that case.
    """
    f = file_info['file']
    content = file_info['content']
    ghcid_original = file_info['ghcid_original']

    # Refuse before touching anything: an empty original would replace a
    # wrong identifier with no identifier at all.
    if 'ghcid' in content and not ghcid_original:
        raise ValueError(
            f"Cannot revert {f}: ghcid_original is empty, "
            "so there is no identifier to restore"
        )

    # One timestamp for both the history entry and the provenance note, so
    # the two cannot disagree when the run crosses midnight UTC.
    now = datetime.now(timezone.utc)

    # Restore ghcid_current to original
    if 'ghcid' in content:
        ghcid = content['ghcid']
        ghcid['ghcid_current'] = ghcid_original

        # Remove incorrect location_resolution if it exists
        ghcid.pop('location_resolution', None)

        # Update ghcid_history to reflect reversion
        if 'ghcid_history' in ghcid:
            ghcid['ghcid_history'] = [{
                'ghcid': ghcid_original,
                'ghcid_numeric': ghcid.get('ghcid_numeric'),
                'valid_from': now.isoformat(),
                'valid_to': None,
                'reason': 'Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored'
            }]

    # Reset location to unknown state
    if 'location' in content:
        content['location'] = {
            'city': None,
            'region': None,
            'country': 'NL'
        }

    # Add note to provenance about reversion. A key that is present but null
    # (``provenance:`` / ``notes:`` with no value) is treated like a missing
    # one, and a single scalar note is promoted to a list so it is kept.
    provenance = content.get('provenance')
    if provenance is None:
        provenance = {}
        content['provenance'] = provenance
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(
        f"Reverted incorrect location enrichment on {now.strftime('%Y-%m-%d')} - "
        "LinkedIn HTML extraction was extracting wrong company's data"
    )

    # Serialize BEFORE opening the file: opening with 'w' truncates it, so a
    # serialization error after that point would leave the file empty/partial.
    serialized = yaml.dump(
        content, default_flow_style=False, allow_unicode=True, sort_keys=False
    )

    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        file.write(serialized)

    return True

def main():
    """Interactively revert all incorrectly enriched NL-XX-XXX files.

    Collects the mismatched files, prints how many wrong ``NL-<region>-<city>``
    prefixes were assigned, asks for confirmation on stdin, and then reverts
    each file, reporting successes and failures on stdout.

    Returns early without prompting when there is nothing to revert, and
    treats a closed/non-interactive stdin as a refusal rather than crashing.
    """
    print("Finding incorrectly enriched NL-XX-XXX files...")
    mismatched = find_mismatched_files()

    print(f"\nFound {len(mismatched)} files to revert:")

    # Asking for confirmation of a zero-file operation is only confusing.
    if not mismatched:
        print("Nothing to revert.")
        return

    # Show distribution of incorrect cities
    city_counts = {}
    for m in mismatched:
        match = re.search(r'NL-([A-Z]{2})-([A-Z]{3})', m['ghcid_current'])
        if match:
            city = f"NL-{match.group(1)}-{match.group(2)}"
            city_counts[city] = city_counts.get(city, 0) + 1

    print("\nDistribution of incorrect city assignments:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1]):
        print(f"  {city}: {count} files")

    # Confirm before proceeding
    print(f"\nWill revert {len(mismatched)} files to XX-XXX state.")
    try:
        response = input("Proceed? (y/n): ").strip().lower()
    except EOFError:
        # stdin is closed or not interactive: never rewrite files unconfirmed.
        print()
        response = ''

    if response != 'y':
        print("Aborted.")
        return

    # Revert files
    reverted = 0
    for m in mismatched:
        # Per-file boundary: one failing file must not stop the remaining ones.
        try:
            if revert_file(m):
                reverted += 1
                print(f"Reverted: {m['file'].name}")
                print(f"  {m['ghcid_current']} -> {m['ghcid_original']}")
        except Exception as e:
            print(f"Error reverting {m['file']}: {e}")

    print(f"\n✅ Successfully reverted {reverted}/{len(mismatched)} files")

    failed = len(mismatched) - reverted
    if failed:
        print(f"⚠️ {failed} files could not be reverted (see errors above)")

# Run the interactive revert only when executed as a script, not when imported.
if __name__ == "__main__":
    main()