142 lines
4.8 KiB
Python
142 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Revert incorrectly enriched NL-XX-XXX files.

These files were incorrectly updated with wrong location data from LinkedIn HTML
extraction. The enrichment step was extracting the FIRST headquarter JSON object
found in the HTML, which often belonged to a different company (sidebar, related
companies, etc.).

This script:
1. Finds NL-XX-XXX files where ghcid_current doesn't contain XX-XXX
2. Restores ghcid_current to match ghcid_original
3. Resets location to unknown state
4. Removes incorrect location_resolution block
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Directory containing custodian files. The default is the original
# hard-coded checkout location; set the GLAM_CUSTODIAN_DIR environment
# variable to point the script at another data directory without editing
# the source. An unset or empty variable falls back to the default.
CUSTODIAN_DIR = Path(
    os.environ.get("GLAM_CUSTODIAN_DIR")
    or "/Users/kempersc/apps/glam/data/custodian"
)
|
|
|
|
def find_mismatched_files():
    """Find NL-XX-XXX files where ghcid_current was incorrectly updated.

    Scans ``CUSTODIAN_DIR`` for ``NL-XX-XXX-*.yaml`` files and parses each one
    with ``yaml.safe_load``. A file is reported when ``ghcid.ghcid_current`` is
    a non-empty string that does not contain ``XX-XXX``.

    Files that cannot be read or parsed are reported on stdout and skipped.
    Documents that are empty, are not a mapping, or have no ``ghcid`` mapping
    are skipped silently.

    Returns:
        A list of dicts, ordered by file path, each with the keys ``file``
        (path of the YAML file), ``ghcid_current``, ``ghcid_original``
        (``''`` when missing or null) and ``content`` (the parsed document).
    """
    mismatched = []

    # glob() yields entries in arbitrary order; sort so the report and the
    # later processing order are reproducible between runs.
    for path in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                content = yaml.safe_load(handle)
        except (OSError, ValueError, yaml.YAMLError) as e:
            # ValueError also covers undecodable bytes (UnicodeDecodeError).
            print(f"Error reading {path}: {e}")
            continue

        # Empty documents and non-mapping top levels carry no ghcid block.
        if not isinstance(content, dict):
            continue

        # `ghcid: null` makes .get() return None instead of a default, so
        # check the type explicitly rather than relying on a fallback value.
        ghcid = content.get('ghcid')
        if not isinstance(ghcid, dict):
            continue

        ghcid_current = ghcid.get('ghcid_current') or ''
        ghcid_original = ghcid.get('ghcid_original') or ''

        # If ghcid_current doesn't contain XX-XXX, it was incorrectly updated
        if isinstance(ghcid_current, str) and ghcid_current and 'XX-XXX' not in ghcid_current:
            mismatched.append({
                'file': path,
                'ghcid_current': ghcid_current,
                'ghcid_original': ghcid_original,
                'content': content,
            })

    return mismatched
|
|
|
|
def revert_file(file_info):
    """Revert a file to its correct XX-XXX state and write it back to disk.

    Args:
        file_info: Dict with the keys ``file`` (path of the YAML file),
            ``content`` (the parsed YAML mapping; it is modified in place)
            and ``ghcid_original`` (the identifier to restore).

    Returns:
        True when the file was rewritten. False when ``ghcid_original`` is
        empty, in which case neither ``content`` nor the file is touched.

    Raises:
        OSError: If the file cannot be written or replaced.
    """
    f = file_info['file']
    content = file_info['content']
    ghcid_original = file_info['ghcid_original']

    # Without an original identifier there is nothing valid to restore;
    # writing an empty ghcid_current would corrupt the record further.
    if not ghcid_original:
        return False

    # Take the timestamp once so the history entry and the note agree.
    now = datetime.now(timezone.utc)

    ghcid = content.get('ghcid')
    if isinstance(ghcid, dict):
        # Restore ghcid_current to original
        ghcid['ghcid_current'] = ghcid_original

        # Remove incorrect location_resolution if it exists
        ghcid.pop('location_resolution', None)

        # Update ghcid_history to reflect reversion
        if 'ghcid_history' in ghcid:
            ghcid['ghcid_history'] = [{
                'ghcid': ghcid_original,
                'ghcid_numeric': ghcid.get('ghcid_numeric'),
                'valid_from': now.isoformat(),
                'valid_to': None,
                'reason': 'Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored'
            }]

    # Reset location to unknown state
    if 'location' in content:
        content['location'] = {
            'city': None,
            'region': None,
            'country': 'NL'
        }

    # Add note to provenance about reversion. A key that is present but null
    # (`provenance:` / `notes:` with no value) is treated like a missing one,
    # and a single string note is kept by wrapping it in a list.
    if content.get('provenance') is None:
        content['provenance'] = {}
    provenance = content['provenance']
    notes = provenance.get('notes')
    if notes is None:
        notes = []
    elif isinstance(notes, str):
        notes = [notes]
    provenance['notes'] = notes
    notes.append(
        f"Reverted incorrect location enrichment on {now.strftime('%Y-%m-%d')} - "
        "LinkedIn HTML extraction was extracting wrong company's data"
    )

    # Write back via a sibling temp file and swap it in, so a failure while
    # dumping cannot leave the original file truncated.
    target = Path(f)
    tmp_path = target.with_name(target.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as handle:
            yaml.dump(content, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
        tmp_path.replace(target)
    finally:
        # Only still present when the dump or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()

    return True
|
|
|
|
def main():
    """Find the incorrectly enriched files, confirm with the user, revert them.

    Prints how many files were found and a per-prefix breakdown of the wrong
    ``NL-<region>-<city>`` codes, then asks for confirmation on stdin. Only
    the answer ``y`` proceeds; any other answer, or end-of-input, aborts.
    Returns early without prompting when no files need reverting.
    """
    # Local import keeps this block self-contained; Counter is stdlib.
    from collections import Counter

    print("Finding incorrectly enriched NL-XX-XXX files...")
    mismatched = find_mismatched_files()

    print(f"\nFound {len(mismatched)} files to revert:")

    # Nothing to do: don't ask the user to confirm an empty operation.
    if not mismatched:
        print("Nothing to revert.")
        return

    # Show distribution of incorrect cities
    code_pattern = re.compile(r'NL-([A-Z]{2})-([A-Z]{3})')
    city_counts = Counter()
    for m in mismatched:
        match = code_pattern.search(m['ghcid_current'])
        if match:
            # The whole match is exactly "NL-<region>-<city>".
            city_counts[match.group(0)] += 1

    print("\nDistribution of incorrect city assignments:")
    for city, count in city_counts.most_common():
        print(f"  {city}: {count} files")

    # Confirm before proceeding
    print(f"\nWill revert {len(mismatched)} files to XX-XXX state.")
    try:
        response = input("Proceed? (y/n): ").strip().lower()
    except EOFError:
        # No interactive stdin (piped or closed): treat as a refusal.
        response = ''

    if response != 'y':
        print("Aborted.")
        return

    # Revert files
    reverted = 0
    for m in mismatched:
        try:
            done = revert_file(m)
        except Exception as e:
            # Top-level boundary: report and keep going with the other files.
            print(f"Error reverting {m['file']}: {e}")
            continue

        if done:
            reverted += 1
            print(f"Reverted: {m['file'].name}")
            print(f"  {m['ghcid_current']} -> {m['ghcid_original']}")
        else:
            print(f"Skipped (not reverted): {m['file']}")

    print(f"\n✅ Successfully reverted {reverted}/{len(mismatched)} files")
|
|
|
|
# Run the interactive reversion only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|