#!/usr/bin/env python3
"""
Fast fix for generic platform names - processes only files from stdin or file list.

Usage:
    script.py FILE_LIST [--dry-run]
    some-command | script.py [--dry-run]

FILE_LIST is a text file with one YAML path per line. When no FILE_LIST is
given, paths are read from stdin instead. With ``--dry-run`` the fixes are
computed and reported but no file is rewritten.
"""

import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import yaml

# Placeholder platform names that get replaced by "<organisation> Website".
GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'}
# Entries removed from a `platform_type` list.
INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'}

# (section, key) pairs consulted, in priority order, for an organisation name.
_ORG_NAME_SOURCES = (
    ('original_entry', 'organisatie'),
    ('museum_register_enrichment', 'museum_name'),
    ('wikidata_enrichment', 'wikidata_label_nl'),
)


def _find_org_name(data: dict) -> Optional[Any]:
    """Return the first non-empty organisation name found in *data*, or None."""
    for section, key in _ORG_NAME_SOURCES:
        entry = data.get(section)
        # A section may be present but null (or otherwise not a mapping);
        # treat that the same as "absent" instead of raising AttributeError.
        if isinstance(entry, dict) and entry.get(key):
            return entry[key]
    return None


def _is_invalid_type(value: Any) -> bool:
    """Return True when *value* is one of the INVALID_TYPES strings."""
    # The isinstance guard keeps unhashable YAML values (nested lists/dicts)
    # from raising TypeError on the set membership test.
    return isinstance(value, str) and value in INVALID_TYPES


def _parse_args(argv: list) -> tuple:
    """Return ``(file_list, dry_run)`` from the arguments after the script name."""
    dry_run = '--dry-run' in argv
    # First non-flag argument wherever it appears, so that both
    # "files.txt --dry-run" and "--dry-run files.txt" work.
    file_list = next((arg for arg in argv if not arg.startswith('--')), None)
    return file_list, dry_run


def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix a single file.

    Loads the YAML document at *filepath* and, inside its
    ``digital_platform_v2`` mapping:

    * replaces a ``platform_name`` listed in GENERIC_NAMES with
      ``"<organisation> Website"`` when an organisation name can be found;
    * removes INVALID_TYPES entries from a list-valued ``platform_type``,
      falling back to ``['INSTITUTIONAL_WEBSITE']`` if nothing remains.

    When anything changed, ``_transformation_metadata.quality_fix_date`` is
    set to the current UTC time and, unless *dry_run* is true, the file is
    rewritten.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, compute the fixes but do not write the file.

    Returns:
        A dict with keys ``name_fixed``, ``types_fixed``, ``old_name``,
        ``new_name`` and ``removed_types`` describing what was changed.

    Raises:
        OSError: If the file cannot be read or written.
        yaml.YAMLError: If the file is not valid YAML or cannot be serialised.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'removed_types': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Empty documents, scalars and lists have nothing to fix.
    if not isinstance(data, dict):
        return stats
    dpv2 = data.get('digital_platform_v2')
    if not isinstance(dpv2, dict):
        return stats

    modified = False

    # Fix generic names
    current_name = dpv2.get('platform_name', '')
    if isinstance(current_name, str) and current_name in GENERIC_NAMES:
        org_name = _find_org_name(data)
        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name
            modified = True

    # Fix invalid types
    platform_types = dpv2.get('platform_type')
    if isinstance(platform_types, list):
        removed = [t for t in platform_types if _is_invalid_type(t)]
        if removed:
            kept = [t for t in platform_types if not _is_invalid_type(t)]
            stats['removed_types'] = removed
            stats['types_fixed'] = True
            dpv2['platform_type'] = kept if kept else ['INSTITUTIONAL_WEBSITE']
            modified = True

    if modified:
        # Missing and explicit-null metadata are both treated as empty.
        if dpv2.get('_transformation_metadata') is None:
            dpv2['_transformation_metadata'] = {}
        dpv2['_transformation_metadata']['quality_fix_date'] = (
            datetime.now(timezone.utc).isoformat()
        )

        if not dry_run:
            # Serialise before opening the file for writing: opening with 'w'
            # truncates immediately, so a failure inside yaml.dump would
            # otherwise destroy the original content.
            text = yaml.dump(
                data,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(text)

    return stats


def main():
    """Command-line entry point: fix every listed file and print a summary.

    Paths that do not exist are skipped silently. Files that cannot be read,
    parsed or written are reported on stderr and skipped; the process then
    exits with status 1 after printing the summary.
    """
    file_list, dry_run = _parse_args(sys.argv[1:])

    if file_list:
        with open(file_list) as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        files = [Path(line.strip()) for line in sys.stdin if line.strip()]

    fixed_names = 0
    fixed_types = 0
    failed = 0

    for filepath in files:
        if not filepath.exists():
            continue

        try:
            stats = fix_file(filepath, dry_run=dry_run)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # One bad file must not abort the rest of the batch.
            failed += 1
            print(f"✗ {filepath}: {exc}", file=sys.stderr)
            continue

        if stats['name_fixed']:
            fixed_names += 1
            print(f"✓ {filepath.name}: '{stats['old_name']}' → '{stats['new_name']}'")
        if stats['types_fixed']:
            fixed_types += 1
            print(f" Removed: {stats['removed_types']}")

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Fixed: {fixed_names} names, {fixed_types} type lists")

    if failed:
        print(f"{failed} file(s) could not be processed", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()