#!/usr/bin/env python3
"""
Revert incorrect Belgium fuzzy match enrichments.

Removes wikidata_enrichment blocks that were added via fuzzy_name_match
from Belgium files, EXCEPT for the two verified correct matches:
- BE-VAN-ANT-A-FFA.yaml (FOMU)
- BE-VLG-ANT-A-MHKAMH.yaml (M HKA)

Usage:
    script.py [DATA_DIR]

DATA_DIR is optional; when omitted, DEFAULT_DATA_DIR is scanned.
"""

import sys
from pathlib import Path

import yaml

# Files with CORRECT matches - do not revert
KEEP_FILES = {
    "BE-VAN-ANT-A-FFA.yaml",  # FOMU - Q2635059
    "BE-VLG-ANT-A-MHKAMH.yaml",  # M HKA - Q1573755
}

# Directory scanned when no other directory is supplied.
DEFAULT_DATA_DIR = Path("/Users/kempersc/apps/glam/data/custodian")


def _load_mapping(filepath: Path):
    """Parse *filepath* as UTF-8 YAML and return its top-level mapping.

    Returns the parsed dict, or None when the file fails to parse (an ERROR
    line is printed in that case), is empty, or its top-level node is not a
    mapping.
    """
    try:
        data = yaml.safe_load(filepath.read_text(encoding='utf-8'))
    except yaml.YAMLError as e:
        print(f" ERROR parsing {filepath.name}: {e}")
        return None
    # A list or scalar at the top level has no .get(); treat it as "no data"
    # instead of letting an AttributeError abort the whole run.
    return data if isinstance(data, dict) else None


def revert_file(filepath: Path) -> bool:
    """Remove wikidata_enrichment block if it was from fuzzy_name_match.

    Args:
        filepath: YAML file to inspect and, when applicable, rewrite in place.

    Returns:
        True when the enrichment block was removed and the file rewritten.
        False when the file cannot be parsed, is empty or not a mapping, has
        no wikidata_enrichment, or the enrichment was not produced by
        fuzzy_name_match.

    Note:
        The file is re-serialized from the parsed data, so comments and
        original formatting in a reverted file are not preserved.
    """
    data = _load_mapping(filepath)
    if not data:
        return False

    # Check if wikidata_enrichment exists and was from fuzzy match
    enrichment = data.get('wikidata_enrichment')
    if not enrichment:
        return False
    if not isinstance(enrichment, dict):
        print(f" SKIP {filepath.name}: wikidata_enrichment is not a mapping")
        return False
    if enrichment.get('matched_by') != 'fuzzy_name_match':
        print(f" SKIP {filepath.name}: not fuzzy_name_match (matched_by={enrichment.get('matched_by')})")
        return False

    # Store what we're removing for logging
    removed_id = enrichment.get('wikidata_id')
    removed_name = enrichment.get('matched_name')

    # Remove the enrichment block
    del data['wikidata_enrichment']

    # Serialize BEFORE opening the file for writing: opening with 'w'
    # truncates, so a failure inside yaml.dump would otherwise destroy the
    # original content.
    serialized = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    filepath.write_text(serialized, encoding='utf-8')

    print(f" REVERTED {filepath.name}: removed {removed_id} ({removed_name})")
    return True


def main(data_dir: Path = DEFAULT_DATA_DIR) -> None:
    """Scan *data_dir* for BE-*.yaml files and revert fuzzy-match enrichments.

    Files named in KEEP_FILES are never modified. Every other file whose text
    contains ``matched_by: fuzzy_name_match`` is passed to revert_file().
    A summary of reverted / kept / skipped counts is printed at the end.

    Args:
        data_dir: Directory to scan (non-recursive). Defaults to
            DEFAULT_DATA_DIR.
    """
    be_files = sorted(Path(data_dir).glob("BE-*.yaml"))

    reverted = 0
    skipped_keep = 0
    skipped_other = 0

    print(f"Scanning {len(be_files)} Belgium files...")
    print()

    for filepath in be_files:
        # Skip files we want to keep
        if filepath.name in KEEP_FILES:
            # Only count it as kept if it actually carries an enrichment.
            data = _load_mapping(filepath)
            enrichment = data.get('wikidata_enrichment') if data else None
            if enrichment:
                wikidata_id = (
                    enrichment.get('wikidata_id')
                    if isinstance(enrichment, dict)
                    else None
                )
                print(f" KEEP {filepath.name}: correct match ({wikidata_id})")
                skipped_keep += 1
            continue

        # Cheap textual pre-filter so files without a fuzzy match are not
        # parsed at all. Read as UTF-8 to match how the files are written.
        content = filepath.read_text(encoding='utf-8')
        if 'matched_by: fuzzy_name_match' not in content:
            continue

        # Revert this file
        if revert_file(filepath):
            reverted += 1
        else:
            skipped_other += 1

    print()
    print("Summary:")
    print(f" Reverted: {reverted}")
    print(f" Kept (correct): {skipped_keep}")
    print(f" Skipped (other): {skipped_other}")


if __name__ == "__main__":
    # An optional first command-line argument overrides the data directory.
    main(Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_DATA_DIR)