#!/usr/bin/env python3
"""
Revert incorrectly enriched NL-XX-XXX files.

These files were incorrectly updated with wrong location data from LinkedIn
HTML extraction. The enrichment script was extracting the FIRST headquarter
JSON found in the HTML, which often belonged to a different company (sidebar,
related companies, etc.).

This script:
1. Finds NL-XX-XXX files where ghcid_current doesn't contain XX-XXX
2. Restores ghcid_current to match ghcid_original
3. Resets location to unknown state
4. Removes incorrect location_resolution block
"""

import os
import re
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Directory containing custodian files
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Country-region-city prefix of a GHCID; used only to summarise which
# (incorrect) locations had been assigned.
_GHCID_PREFIX_RE = re.compile(r'NL-([A-Z]{2})-([A-Z]{3})')


def find_mismatched_files(custodian_dir=None) -> list:
    """Find NL-XX-XXX files where ghcid_current was incorrectly updated.

    A file is reported when its name matches ``NL-XX-XXX-*.yaml`` but the
    ``ghcid.ghcid_current`` value inside it no longer contains ``XX-XXX``.

    Args:
        custodian_dir: Directory to scan. Defaults to ``CUSTODIAN_DIR``.

    Returns:
        A list of dicts with the keys ``file`` (Path), ``ghcid_current``,
        ``ghcid_original`` and ``content`` (the parsed YAML mapping).
        Files that cannot be read or parsed, and files without a
        ``ghcid_original`` to restore, are reported on stdout and skipped.
    """
    directory = CUSTODIAN_DIR if custodian_dir is None else Path(custodian_dir)
    mismatched = []

    # Sorted so that the report and the revert order are reproducible.
    for path in sorted(directory.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                content = yaml.safe_load(handle)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"Error reading {path}: {e}")
            continue

        if not content:
            continue
        if not isinstance(content, dict):
            print(f"Error reading {path}: top-level YAML is not a mapping")
            continue

        ghcid = content.get('ghcid')
        if not isinstance(ghcid, dict):
            continue

        ghcid_current = ghcid.get('ghcid_current', '')
        ghcid_original = ghcid.get('ghcid_original', '')

        # Only a non-empty ghcid_current without XX-XXX was incorrectly updated.
        if not isinstance(ghcid_current, str) or not ghcid_current:
            continue
        if 'XX-XXX' in ghcid_current:
            continue

        # Without an original there is nothing to restore; reverting would
        # overwrite ghcid_current with an empty value.
        if not ghcid_original:
            print(f"Skipping {path}: no ghcid_original to restore")
            continue

        mismatched.append({
            'file': path,
            'ghcid_current': ghcid_current,
            'ghcid_original': ghcid_original,
            'content': content,
        })

    return mismatched


def revert_file(file_info: dict) -> bool:
    """Revert a file to its correct XX-XXX state.

    Args:
        file_info: One entry as produced by ``find_mismatched_files``; the
            keys ``file``, ``content`` and ``ghcid_original`` are read.
            ``content`` is modified in place.

    Returns:
        True when the file was rewritten, False when it was left untouched
        because ``ghcid_original`` is empty.

    Raises:
        OSError: If the file cannot be written.
        yaml.YAMLError: If the content cannot be serialized.
    """
    path = file_info['file']
    content = file_info['content']
    ghcid_original = file_info['ghcid_original']

    # Never blank out the identifier: refuse to revert to an empty value.
    if not ghcid_original:
        return False

    # One timestamp so the history entry and the provenance note agree.
    now = datetime.now(timezone.utc)

    ghcid = content.get('ghcid')
    if isinstance(ghcid, dict):
        # Restore ghcid_current to original
        ghcid['ghcid_current'] = ghcid_original

        # Remove incorrect location_resolution if it exists
        ghcid.pop('location_resolution', None)

        # Update ghcid_history to reflect reversion
        if 'ghcid_history' in ghcid:
            ghcid['ghcid_history'] = [{
                'ghcid': ghcid_original,
                'ghcid_numeric': ghcid.get('ghcid_numeric'),
                'valid_from': now.isoformat(),
                'valid_to': None,
                'reason': 'Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored',
            }]

    # Reset location to unknown state
    if 'location' in content:
        content['location'] = {
            'city': None,
            'region': None,
            'country': 'NL',
        }

    # Add note to provenance about reversion. A YAML null for either key is
    # treated as absent, and a single string note is kept as the first entry.
    provenance = content.get('provenance')
    if provenance is None:
        provenance = content['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    elif isinstance(notes, str):
        notes = provenance['notes'] = [notes]
    notes.append(
        f"Reverted incorrect location enrichment on {now.strftime('%Y-%m-%d')} - "
        "LinkedIn HTML extraction was extracting wrong company's data"
    )

    # Serialize before opening the file: opening with 'w' truncates it, so a
    # failure while dumping must not be able to destroy the existing data.
    serialized = yaml.dump(
        content,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)

    return True


def main() -> None:
    """Report the affected files, ask for confirmation, then revert them."""
    print("Finding incorrectly enriched NL-XX-XXX files...")
    mismatched = find_mismatched_files()

    print(f"\nFound {len(mismatched)} files to revert:")
    if not mismatched:
        print("Nothing to revert.")
        return

    # Show distribution of incorrect cities
    city_counts = Counter()
    for m in mismatched:
        match = _GHCID_PREFIX_RE.search(m['ghcid_current'])
        if match:
            city_counts[f"NL-{match.group(1)}-{match.group(2)}"] += 1

    print("\nDistribution of incorrect city assignments:")
    for city, count in city_counts.most_common():
        print(f"  {city}: {count} files")

    # Confirm before proceeding
    print(f"\nWill revert {len(mismatched)} files to XX-XXX state.")
    try:
        response = input("Proceed? (y/n): ").strip().lower()
    except EOFError:
        # No interactive stdin: never modify files without confirmation.
        response = ''
    if response != 'y':
        print("Aborted.")
        return

    # Revert files
    reverted = 0
    for m in mismatched:
        try:
            if revert_file(m):
                reverted += 1
                print(f"Reverted: {m['file'].name}")
                print(f"  {m['ghcid_current']} -> {m['ghcid_original']}")
            else:
                print(f"Skipped: {m['file'].name} (no ghcid_original to restore)")
        except Exception as e:
            # Top-level boundary: report and continue with the next file.
            print(f"Error reverting {m['file']}: {e}")

    print(f"\n✅ Successfully reverted {reverted}/{len(mismatched)} files")


if __name__ == "__main__":
    main()