# glam/scripts/revert_belgium_fuzzy.py

#!/usr/bin/env python3
"""
Revert incorrect Belgium fuzzy match enrichments.

Removes wikidata_enrichment blocks that were added via fuzzy_name_match
from Belgium files, EXCEPT for the two verified correct matches:
- BE-VAN-ANT-A-FFA.yaml (FOMU)
- BE-VLG-ANT-A-MHKAMH.yaml (M HKA)
"""

import yaml
from pathlib import Path
import sys

# Files whose fuzzy matches were verified as correct. main() checks each
# file name against this set and never passes these files to revert_file().
KEEP_FILES = {
    "BE-VAN-ANT-A-FFA.yaml",      # FOMU - Q2635059
    "BE-VLG-ANT-A-MHKAMH.yaml",   # M HKA - Q1573755
}

def revert_file(filepath: Path) -> bool:
    """Remove the ``wikidata_enrichment`` block if it came from a fuzzy match.

    The file is parsed as YAML; when its top-level mapping has a
    ``wikidata_enrichment`` mapping whose ``matched_by`` equals
    ``fuzzy_name_match``, that key is deleted and the file is rewritten.

    Note that the file is re-serialized with ``yaml.dump``, so comments and
    the original formatting of the document are not preserved.

    Args:
        filepath: Path of the YAML file to inspect and possibly rewrite.

    Returns:
        True if the enrichment block was removed and the file rewritten,
        False if the file was left untouched (parse error, empty document,
        unexpected structure, no enrichment, or a different ``matched_by``).
    """
    content = filepath.read_text(encoding='utf-8')

    # Parse YAML
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"  ERROR parsing {filepath.name}: {e}")
        return False

    if not data:
        return False

    # A YAML document may be a list or scalar; only mappings can hold the key.
    if not isinstance(data, dict):
        print(f"  SKIP {filepath.name}: top-level YAML is not a mapping")
        return False

    # Check if wikidata_enrichment exists and was from fuzzy match
    enrichment = data.get('wikidata_enrichment')
    if not enrichment:
        return False

    if not isinstance(enrichment, dict):
        print(f"  SKIP {filepath.name}: wikidata_enrichment is not a mapping")
        return False

    matched_by = enrichment.get('matched_by')
    if matched_by != 'fuzzy_name_match':
        print(f"  SKIP {filepath.name}: not fuzzy_name_match (matched_by={matched_by})")
        return False

    # Store what we're removing for logging
    removed_id = enrichment.get('wikidata_id')
    removed_name = enrichment.get('matched_name')

    # Remove the enrichment block
    del data['wikidata_enrichment']

    # Serialize fully BEFORE touching the file: opening it for writing
    # truncates it, so a failure inside yaml.dump would otherwise leave an
    # empty or half-written file behind.
    serialized = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    filepath.write_text(serialized, encoding='utf-8')

    print(f"  REVERTED {filepath.name}: removed {removed_id} ({removed_name})")
    return True


def main(data_dir=None):
    """Scan Belgium custodian files and revert fuzzy-match enrichments.

    Every ``BE-*.yaml`` file in the data directory is visited in sorted
    order. Files listed in ``KEEP_FILES`` are only reported, never modified.
    Any other file whose text contains ``matched_by: fuzzy_name_match`` is
    handed to ``revert_file``. A summary of the counts is printed at the end.

    Args:
        data_dir: Directory containing the custodian YAML files (``str`` or
            ``Path``). Defaults to the original hard-coded location when
            omitted, so calling ``main()`` behaves as before.
    """
    if data_dir is None:
        data_dir = Path("/Users/kempersc/apps/glam/data/custodian")
    else:
        data_dir = Path(data_dir)

    # Find all BE files, in a deterministic order
    be_files = sorted(data_dir.glob("BE-*.yaml"))

    reverted = 0
    skipped_keep = 0
    skipped_other = 0

    print(f"Scanning {len(be_files)} Belgium files...")
    print()

    for filepath in be_files:
        # Read as UTF-8 explicitly, matching revert_file(); the platform
        # default encoding may differ and mis-decode non-ASCII names.
        content = filepath.read_text(encoding='utf-8')

        # Skip files we want to keep
        if filepath.name in KEEP_FILES:
            # Verify it has wikidata_enrichment; a broken keep-file must not
            # abort the whole run.
            try:
                data = yaml.safe_load(content)
            except yaml.YAMLError as e:
                print(f"  ERROR parsing {filepath.name}: {e}")
                continue
            enrichment = data.get('wikidata_enrichment') if isinstance(data, dict) else None
            if enrichment:
                wikidata_id = enrichment.get('wikidata_id') if isinstance(enrichment, dict) else None
                print(f"  KEEP {filepath.name}: correct match ({wikidata_id})")
                skipped_keep += 1
            continue

        # Cheap text pre-filter so only candidate files get parsed and rewritten
        if 'matched_by: fuzzy_name_match' not in content:
            continue

        # Revert this file
        if revert_file(filepath):
            reverted += 1
        else:
            skipped_other += 1

    print()
    print("Summary:")
    print(f"  Reverted: {reverted}")
    print(f"  Kept (correct): {skipped_keep}")
    print(f"  Skipped (other): {skipped_other}")


# Run the revert only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()