glam/scripts/revert_incorrect_xxx_enrichment.py
2025-12-17 10:11:56 +01:00

142 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
Revert incorrectly enriched NL-XX-XXX files.

These files were incorrectly updated with wrong location data from LinkedIn HTML
extraction. The enrichment script was extracting the FIRST headquarters JSON found
in the HTML, which often belonged to a different company (sidebar, related
companies, etc.).

This script:
1. Finds NL-XX-XXX files where ghcid_current doesn't contain XX-XXX
2. Restores ghcid_current to match ghcid_original
3. Resets location to unknown state
4. Removes incorrect location_resolution block
5. Replaces ghcid_history with a single entry recording the reversion
6. Appends a note about the reversion to provenance.notes
"""
import os
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Directory containing custodian files.
# NOTE: this is a hard-coded absolute path; edit it before running the script
# from a different checkout or machine.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
def find_mismatched_files():
    """Collect NL-XX-XXX custodian files whose ghcid_current no longer says XX-XXX.

    Every ``NL-XX-XXX-*.yaml`` file directly under ``CUSTODIAN_DIR`` is parsed.
    A file is reported when its ``ghcid.ghcid_current`` value is non-empty and
    does not contain ``XX-XXX``.

    Returns:
        A list of dicts, one per affected file, with the keys ``'file'``
        (the path), ``'ghcid_current'``, ``'ghcid_original'`` and
        ``'content'`` (the parsed YAML document).

    Any error while reading or inspecting a file is printed and that file is
    skipped; the scan continues with the next file.
    """
    results = []
    for path in CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml"):
        try:
            with path.open('r', encoding='utf-8') as handle:
                document = yaml.safe_load(handle)
            if document:
                identifiers = document.get('ghcid', {})
                current = identifiers.get('ghcid_current', '')
                original = identifiers.get('ghcid_original', '')
                # A current id without the XX-XXX placeholder means the
                # enrichment overwrote it with a concrete location.
                was_overwritten = bool(current) and 'XX-XXX' not in current
                if was_overwritten:
                    results.append(dict(
                        file=path,
                        ghcid_current=current,
                        ghcid_original=original,
                        content=document,
                    ))
        except Exception as exc:
            print(f"Error reading {path}: {exc}")
    return results
def revert_file(file_info):
    """Revert one custodian file to its XX-XXX state and rewrite it on disk.

    Args:
        file_info: Mapping with the keys ``'file'`` (path of the YAML file to
            overwrite), ``'content'`` (the parsed YAML document, mutated in
            place) and ``'ghcid_original'`` (the identifier to restore).

    Returns:
        True once the file has been rewritten.

    Raises:
        ValueError: If ``ghcid_original`` is empty. Nothing is modified or
            written in that case.
    """
    f = file_info['file']
    content = file_info['content']
    ghcid_original = file_info['ghcid_original']

    # Without an original identifier there is nothing to restore to. Refusing
    # here avoids blanking ghcid_current and persisting that to disk.
    if not ghcid_original:
        raise ValueError(f"{f}: no ghcid_original to restore, refusing to revert")

    # One timestamp so the history entry and the provenance note agree.
    now = datetime.now(timezone.utc)

    if 'ghcid' in content:
        ghcid = content['ghcid']
        # Restore ghcid_current to original
        ghcid['ghcid_current'] = ghcid_original
        # Remove incorrect location_resolution if it exists
        ghcid.pop('location_resolution', None)
        # Existing history is replaced (not appended to) by a single entry
        # that records the reversion.
        if 'ghcid_history' in ghcid:
            ghcid['ghcid_history'] = [{
                'ghcid': ghcid_original,
                'ghcid_numeric': ghcid.get('ghcid_numeric'),
                'valid_from': now.isoformat(),
                'valid_to': None,
                'reason': 'Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored'
            }]

    # Reset location to unknown state
    if 'location' in content:
        content['location'] = {
            'city': None,
            'region': None,
            'country': 'NL'
        }

    # Add note to provenance about reversion. A YAML null for either
    # `provenance` or `notes` is treated like a missing key, and a single
    # scalar note is kept as the first element of the list.
    provenance = content.get('provenance')
    if provenance is None:
        provenance = {}
        content['provenance'] = provenance
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(
        f"Reverted incorrect location enrichment on {now.strftime('%Y-%m-%d')} - "
        "LinkedIn HTML extraction was extracting wrong company's data"
    )

    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization failure must happen while the old contents are still intact.
    serialized = yaml.dump(content, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(f, 'w', encoding='utf-8') as file:
        file.write(serialized)
    return True
def main():
    """Report the affected files, ask for confirmation, then revert each one.

    Prints how many files were found and a tally of the location codes they
    had been moved to, then prompts on stdin. Any answer other than ``y``
    aborts without touching a file. Errors while reverting a single file are
    printed and do not stop the remaining files from being processed.
    """
    print("Finding incorrectly enriched NL-XX-XXX files...")
    mismatched = find_mismatched_files()
    total = len(mismatched)

    print(f"\nFound {total} files to revert:")

    # Tally the NL-<2 letters>-<3 letters> prefix each file was moved to.
    city_counts = {}
    for entry in mismatched:
        found = re.search(r'NL-([A-Z]{2})-([A-Z]{3})', entry['ghcid_current'])
        if found is None:
            continue
        region_code, city_code = found.groups()
        label = f"NL-{region_code}-{city_code}"
        city_counts.setdefault(label, 0)
        city_counts[label] += 1

    print("\nDistribution of incorrect city assignments:")
    # Most frequent first; ties keep first-seen order.
    ranked = sorted(city_counts.items(), key=lambda pair: -pair[1])
    for label, hits in ranked:
        print(f" {label}: {hits} files")

    # Nothing is written until the user explicitly agrees.
    print(f"\nWill revert {total} files to XX-XXX state.")
    answer = input("Proceed? (y/n): ").strip().lower()
    if answer != 'y':
        print("Aborted.")
        return

    reverted = 0
    for entry in mismatched:
        try:
            succeeded = revert_file(entry)
            if succeeded:
                reverted += 1
                print(f"Reverted: {entry['file'].name}")
                print(f" {entry['ghcid_current']} -> {entry['ghcid_original']}")
        except Exception as exc:
            print(f"Error reverting {entry['file']}: {exc}")

    print(f"\n✅ Successfully reverted {reverted}/{total} files")
# Start the interactive revert only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()