glam/scripts/revert_belgium_fuzzy.py
2025-12-21 00:01:54 +01:00

105 lines
3.1 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Revert incorrect Belgium fuzzy match enrichments.
Removes wikidata_enrichment blocks that were added via fuzzy_name_match
from Belgium files, EXCEPT for the two verified correct matches:
- BE-VAN-ANT-A-FFA.yaml (FOMU)
- BE-VLG-ANT-A-MHKAMH.yaml (M HKA)
"""
import yaml
from pathlib import Path
import sys
# Belgium custodian files whose fuzzy-matched Wikidata enrichment is known to
# be correct. Membership is tested against the bare file name (not the path).
KEEP_FILES = {
    # FOMU -> Q2635059
    "BE-VAN-ANT-A-FFA.yaml",
    # M HKA -> Q1573755
    "BE-VLG-ANT-A-MHKAMH.yaml",
}
def revert_file(filepath: Path) -> bool:
    """Remove the ``wikidata_enrichment`` block if it came from a fuzzy name match.

    The file is parsed with ``yaml.safe_load``. When the top-level mapping has
    a ``wikidata_enrichment`` entry whose ``matched_by`` value equals
    ``'fuzzy_name_match'``, that entry is deleted and the whole document is
    re-serialized with ``yaml.dump`` and written back to ``filepath``.

    Args:
        filepath: Path of the YAML file to inspect and possibly rewrite.

    Returns:
        True if the enrichment block was removed and the file rewritten.
        False if the file could not be parsed, is empty or not a mapping, has
        no ``wikidata_enrichment`` entry, or the entry was not produced by
        ``fuzzy_name_match`` (the last case also prints a SKIP line).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse YAML
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" ERROR parsing {filepath.name}: {e}")
        return False

    # Empty documents and non-mapping documents (lists, scalars) have no
    # enrichment key to remove; bail out instead of failing on ``.get``.
    if not isinstance(data, dict):
        return False

    # Check if wikidata_enrichment exists and was from fuzzy match
    enrichment = data.get('wikidata_enrichment')
    if not enrichment:
        return False

    # A malformed (non-mapping) enrichment value is treated as "not a fuzzy
    # match" so it is reported and left alone rather than crashing the run.
    matched_by = enrichment.get('matched_by') if isinstance(enrichment, dict) else None
    if matched_by != 'fuzzy_name_match':
        print(f" SKIP {filepath.name}: not fuzzy_name_match (matched_by={matched_by})")
        return False

    # Store what we're removing for logging
    removed_id = enrichment.get('wikidata_id')
    removed_name = enrichment.get('matched_name')

    # Remove the enrichment block
    del data['wikidata_enrichment']

    # Serialize BEFORE opening the file for writing: mode 'w' truncates
    # immediately, so a failure inside yaml.dump would otherwise leave the
    # file empty or half-written.
    serialized = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print(f" REVERTED {filepath.name}: removed {removed_id} ({removed_name})")
    return True
def main(data_dir=None):
    """Scan Belgium custodian files and revert fuzzy-match enrichments.

    Every ``BE-*.yaml`` file directly inside ``data_dir`` is visited in sorted
    order:

    * files named in ``KEEP_FILES`` are never modified; they are reported (when
      they carry a ``wikidata_enrichment`` entry) and counted as kept;
    * files whose text does not contain ``matched_by: fuzzy_name_match`` are
      ignored without being parsed;
    * all remaining files are passed to ``revert_file``.

    A summary of the three counters is printed at the end.

    Args:
        data_dir: Directory holding the custodian YAML files, as a ``Path`` or
            string. When omitted, the original hard-coded location
            ``/Users/kempersc/apps/glam/data/custodian`` is used.
    """
    if data_dir is None:
        data_dir = "/Users/kempersc/apps/glam/data/custodian"
    data_dir = Path(data_dir)

    # Find all BE files; sorting up front keeps the log order deterministic.
    be_files = sorted(data_dir.glob("BE-*.yaml"))

    reverted = 0
    skipped_keep = 0
    skipped_other = 0

    print(f"Scanning {len(be_files)} Belgium files...")
    print()

    for filepath in be_files:
        # Skip files we want to keep
        if filepath.name in KEEP_FILES:
            # Read as UTF-8 explicitly, matching revert_file, so the result
            # does not depend on the platform's default locale encoding.
            try:
                data = yaml.safe_load(filepath.read_text(encoding='utf-8'))
            except yaml.YAMLError as e:
                # A broken keep-file must not abort the whole run; it is
                # still left untouched and counted as kept.
                print(f" ERROR parsing {filepath.name}: {e}")
                data = None

            # Verify it has wikidata_enrichment
            enrichment = data.get('wikidata_enrichment') if isinstance(data, dict) else None
            if enrichment:
                wikidata_id = enrichment.get('wikidata_id') if isinstance(enrichment, dict) else None
                print(f" KEEP {filepath.name}: correct match ({wikidata_id})")
            skipped_keep += 1
            continue

        # Cheap textual pre-filter: only files mentioning a fuzzy match are
        # parsed and possibly rewritten by revert_file.
        content = filepath.read_text(encoding='utf-8')
        if 'matched_by: fuzzy_name_match' not in content:
            continue

        # Revert this file
        if revert_file(filepath):
            reverted += 1
        else:
            skipped_other += 1

    print()
    print("Summary:")
    print(f" Reverted: {reverted}")
    print(f" Kept (correct): {skipped_keep}")
    print(f" Skipped (other): {skipped_other}")
# Script entry point: run the revert only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()