#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.

These files have incorrect Wikidata entity ID assignments where the same
Q-number was incorrectly assigned to multiple different institutions.

The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml


# Preserve order in YAML output
class OrderedDumper(yaml.SafeDumper):
    pass


def represent_ordereddict(dumper, data):
    # NOTE(review): defined but never registered on OrderedDumper; safe_load
    # only produces builtin dicts, so this is currently unused. Kept in case
    # a caller elsewhere registers it.
    return dumper.represent_mapping('tag:yaml.org,2002:map', data.items())


def str_representer(dumper, data):
    # Emit multi-line strings as YAML block scalars (|) for readability;
    # single-line strings use the default plain style.
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


OrderedDumper.add_representer(str, str_representer)


def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove wikidata_enrichment from a file.

    Deletes the ``wikidata_enrichment`` mapping from the YAML document at
    *file_path*, appends a timestamped provenance note documenting the
    removal (and the previously assigned Q-number, if present), and writes
    the document back in place.

    Returns:
        tuple of (success: bool, message: str)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)
        if data is None:
            return False, "Empty or invalid YAML"

        # Check if wikidata_enrichment exists
        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Get the old wikidata ID for documentation
        old_wikidata_id = None
        if isinstance(data.get('wikidata_enrichment'), dict):
            old_wikidata_id = data['wikidata_enrichment'].get('wikidata_id')

        # Remove wikidata_enrichment
        del data['wikidata_enrichment']

        # Add provenance note
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += (
                f"Previous Wikidata ID {old_wikidata_id} was incorrectly "
                "assigned (duplicate across multiple institutions). "
            )
        note += "Re-enrichment required with proper matching."

        # Update or create provenance
        if 'provenance' not in data:
            data['provenance'] = {}
        if isinstance(data['provenance'], dict):
            existing_notes = data['provenance'].get('notes', '')
            # Handle case where notes is a list
            if isinstance(existing_notes, list):
                existing_notes.append(note)
                data['provenance']['notes'] = existing_notes
            elif existing_notes:
                data['provenance']['notes'] = existing_notes + '\n\n' + note
            else:
                data['provenance']['notes'] = note

        # Write back
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        # Best-effort per-file processing: report the failure to the caller
        # (main) rather than aborting the whole batch.
        return False, f"Error: {e}"


def main():
    """Clean every file listed in /tmp/wikidata_duplicates_to_clean.txt and print a summary."""
    # Read the list of files to clean
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, filename in enumerate(filenames):
        file_path = base_dir / filename
        if not file_path.exists():
            # FIX: progress messages previously printed the literal text
            # "(unknown)" instead of the file being processed.
            print(f"[{i+1}/{len(filenames)}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)
        if success:
            print(f"[{i+1}/{len(filenames)}] OK: {filename} - {message}")
            success_count += 1
        else:
            if "No wikidata_enrichment" in message:
                print(f"[{i+1}/{len(filenames)}] SKIP (no wikidata): {filename}")
                skip_count += 1
            else:
                print(f"[{i+1}/{len(filenames)}] ERROR: {filename} - {message}")
                error_count += 1

    print("\n=== Summary ===")
    print(f"Total files: {len(filenames)}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
# Run the cleanup only when executed as a script, not on import.
if __name__ == '__main__':
    main()