#!/usr/bin/env python3
"""
Revert incorrect Belgium fuzzy match enrichments.

Removes wikidata_enrichment blocks that were added via fuzzy_name_match
from Belgium files, EXCEPT for the two verified correct matches:
- BE-VAN-ANT-A-FFA.yaml (FOMU)
- BE-VLG-ANT-A-MHKAMH.yaml (M HKA)
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Allow-list of Belgium files whose fuzzy-match enrichment is CORRECT.
# main() never passes these file names to revert_file().
KEEP_FILES = {
    # FOMU - Q2635059
    "BE-VAN-ANT-A-FFA.yaml",
    # M HKA - Q1573755
    "BE-VLG-ANT-A-MHKAMH.yaml",
}
|
|
|
|
def revert_file(filepath: Path) -> bool:
    """Remove the wikidata_enrichment block if it came from fuzzy_name_match.

    The file is parsed with ``yaml.safe_load``. When the top-level
    ``wikidata_enrichment`` mapping has ``matched_by == 'fuzzy_name_match'``,
    that key is deleted and the remaining document is written back over the
    original file. The file is rewritten from the parsed data, so anything
    the parser does not keep (for example comments) is not preserved.

    Args:
        filepath: Path of the YAML file to inspect and possibly rewrite.

    Returns:
        True if the enrichment block was removed and the file rewritten.
        False if the file could not be parsed, is empty, is not a mapping,
        has no (or a malformed) enrichment block, or the enrichment was not
        produced by ``fuzzy_name_match``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse YAML; a broken file is reported and left untouched.
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f" ERROR parsing {filepath.name}: {e}")
        return False

    if not data:
        return False

    # A YAML document may be a list or scalar; only mappings can carry the
    # enrichment key, and calling .get() on anything else would raise.
    if not isinstance(data, dict):
        print(f" SKIP {filepath.name}: top-level YAML is not a mapping")
        return False

    # Check if wikidata_enrichment exists and was from fuzzy match
    enrichment = data.get('wikidata_enrichment')
    if not enrichment:
        return False

    # Without a mapping we cannot tell how the match was made, so be
    # conservative and leave the file alone.
    if not isinstance(enrichment, dict):
        print(f" SKIP {filepath.name}: wikidata_enrichment is not a mapping")
        return False

    matched_by = enrichment.get('matched_by')
    if matched_by != 'fuzzy_name_match':
        print(f" SKIP {filepath.name}: not fuzzy_name_match (matched_by={matched_by})")
        return False

    # Store what we're removing for logging
    removed_id = enrichment.get('wikidata_id')
    removed_name = enrichment.get('matched_name')

    # Remove the enrichment block
    del data['wikidata_enrichment']

    # Serialize BEFORE opening the file for writing: opening with 'w'
    # truncates immediately, so a failure inside yaml.dump would otherwise
    # leave an empty or partial file on disk.
    serialized = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print(f" REVERTED {filepath.name}: removed {removed_id} ({removed_name})")
    return True
|
|
|
|
|
|
def main(data_dir=None):
    """Scan Belgium custodian files and revert fuzzy-match enrichments.

    Every ``BE-*.yaml`` file in the data directory is visited in sorted
    order. Files named in ``KEEP_FILES`` are only reported, never modified.
    Any other file whose text contains ``matched_by: fuzzy_name_match`` is
    passed to ``revert_file``. A summary of the counters is printed at the end.

    Args:
        data_dir: Directory holding the custodian YAML files (``str`` or
            ``Path``). Defaults to the original hard-coded location when
            omitted, so calling ``main()`` behaves as before.
    """
    if data_dir is None:
        data_dir = "/Users/kempersc/apps/glam/data/custodian"
    data_dir = Path(data_dir)

    # Find all BE files; sorting gives deterministic log output.
    be_files = sorted(data_dir.glob("BE-*.yaml"))

    reverted = 0
    skipped_keep = 0
    skipped_other = 0

    print(f"Scanning {len(be_files)} Belgium files...")
    print()

    for filepath in be_files:
        # Skip files we want to keep
        if filepath.name in KEEP_FILES:
            # Verify it has wikidata_enrichment. Read as UTF-8 explicitly so
            # decoding does not depend on the locale (revert_file reads and
            # writes these files as UTF-8).
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = yaml.safe_load(f)
            except yaml.YAMLError as e:
                # Report and move on, matching revert_file's handling,
                # instead of aborting the whole run.
                print(f" ERROR parsing {filepath.name}: {e}")
                continue

            enrichment = data.get('wikidata_enrichment') if isinstance(data, dict) else None
            if enrichment and isinstance(enrichment, dict):
                print(f" KEEP {filepath.name}: correct match ({enrichment.get('wikidata_id')})")
                skipped_keep += 1
            continue

        # Cheap text pre-filter: only files mentioning a fuzzy match are
        # handed to revert_file for a full parse.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        if 'matched_by: fuzzy_name_match' not in content:
            continue

        # Revert this file
        if revert_file(filepath):
            reverted += 1
        else:
            skipped_other += 1

    print()
    print("Summary:")
    print(f" Reverted: {reverted}")
    print(f" Kept (correct): {skipped_keep}")
    print(f" Skipped (other): {skipped_other}")
|
|
|
|
|
|
# Run the revert only when executed as a script, not when imported.
if __name__ == "__main__":
    main()