#!/usr/bin/env python3
"""
Fix generic platform names ('Home Website', 'Homepage Website', 'Welkom Website')
by using the organisatie field from original_entry, falling back to the museum
register name or the Dutch Wikidata label when that field is missing or empty.

Also filters invalid platform types (ONLINEMARKETING, ONLINEBRANDING,
ONLINEWEBSITE, ONLINE).
"""

import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml


# Custom YAML representer to preserve formatting
def str_representer(dumper, data):
    """Represent multi-line strings as YAML literal blocks ('|')."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


yaml.add_representer(str, str_representer)

# Platform names that carry no information about the organisation.
GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'}

# Platform type values that are removed from 'platform_type' lists.
INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'}

# Places to look for the organisation name, in priority order:
# (top-level section, field inside that section, label stored as 'name_source').
NAME_SOURCES = (
    ('original_entry', 'organisatie', 'organisatie_field'),
    ('museum_register_enrichment', 'museum_name',
     'museum_register_enrichment.museum_name'),
    ('wikidata_enrichment', 'wikidata_label_nl',
     'wikidata_enrichment.wikidata_label_nl'),
)


def _find_org_name(data: dict) -> tuple:
    """Return ``(name, source_label)`` from the first usable NAME_SOURCES entry.

    A source is skipped when its section is absent or not a mapping, or when
    the field is missing, empty, or whitespace-only. Returns ``(None, None)``
    when no source yields a name.
    """
    for section_key, field, label in NAME_SOURCES:
        section = data.get(section_key)
        if not isinstance(section, dict):
            continue
        candidate = section.get(field)
        if isinstance(candidate, str):
            candidate = candidate.strip()
        if candidate:
            return candidate, label
    return None, None


def _is_invalid_type(value) -> bool:
    """Return True when *value* is one of INVALID_TYPES.

    The isinstance check keeps unhashable list entries (e.g. nested mappings)
    from raising TypeError on the set lookup.
    """
    return isinstance(value, str) and value in INVALID_TYPES


def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix a single file. Returns stats dict.

    Two fixes are applied to the 'digital_platform_v2' section:

    1. A generic 'platform_name' is replaced by "<organisation> Website".
    2. Invalid entries are removed from a list-valued 'platform_type'; if
       nothing is left, the list becomes ['INSTITUTIONAL_WEBSITE'].

    When either fix applies, '_transformation_metadata' gets a
    'quality_fix_date' (and 'name_source' for name fixes), and the file is
    rewritten unless *dry_run* is true.

    Args:
        filepath: YAML file to inspect.
        dry_run: When true, compute the stats but do not write the file.

    Returns:
        Dict with keys 'name_fixed', 'types_fixed', 'old_name', 'new_name',
        'name_source' and 'removed_types'.

    Raises:
        OSError: The file cannot be read or written.
        yaml.YAMLError: The file is not valid YAML or cannot be serialised.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'name_source': None,
        'removed_types': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping documents have nothing to fix.
    if not isinstance(data, dict):
        return stats

    dpv2 = data.get('digital_platform_v2')
    if not isinstance(dpv2, dict):
        return stats

    # Fix 1: Generic platform names
    current_name = dpv2.get('platform_name', '')
    if isinstance(current_name, str) and current_name in GENERIC_NAMES:
        org_name, name_source = _find_org_name(data)
        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_source'] = name_source
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name

    # Fix 2: Invalid platform types
    platform_types = dpv2.get('platform_type')
    if isinstance(platform_types, list):
        removed = [t for t in platform_types if _is_invalid_type(t)]
        if removed:
            kept = [t for t in platform_types if not _is_invalid_type(t)]
            stats['removed_types'] = removed
            stats['types_fixed'] = True
            dpv2['platform_type'] = kept or ['INSTITUTIONAL_WEBSITE']

    # Add fix metadata
    if stats['name_fixed'] or stats['types_fixed']:
        # A key that is present but null is treated like a missing one.
        if dpv2.get('_transformation_metadata') is None:
            dpv2['_transformation_metadata'] = {}
        metadata = dpv2['_transformation_metadata']
        metadata['quality_fix_date'] = datetime.now(timezone.utc).isoformat()
        if stats['name_fixed']:
            # Record where the name really came from, not a fixed label.
            metadata['name_source'] = stats['name_source']

        if not dry_run:
            # Serialise before opening the file for writing so that a
            # serialisation error cannot leave a truncated file behind.
            serialized = yaml.dump(
                data,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(serialized)

    return stats


def main():
    """Scan the custodian directory and fix every 'NL-*.yaml' file.

    Returns:
        Process exit status: 0 on success, 1 if any file could not be
        processed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix generic platform names')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without modifying files')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian',
                        help='Path to custodian files')
    args = parser.parse_args()

    custodian_path = Path(args.path)
    if not custodian_path.is_dir():
        # Without this check a mistyped path silently scans nothing.
        parser.error(f"not a directory: {custodian_path}")

    files_fixed_names = 0
    files_fixed_types = 0
    files_modified = 0
    files_failed = 0

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Scanning {custodian_path}...")
    print()

    for filepath in sorted(custodian_path.glob('NL-*.yaml')):
        try:
            stats = fix_file(filepath, dry_run=args.dry_run)
        except (OSError, yaml.YAMLError) as exc:
            # One unreadable or malformed file should not abort the batch.
            files_failed += 1
            print(f"✗ {filepath.name}: {exc}", file=sys.stderr)
            continue

        if not (stats['name_fixed'] or stats['types_fixed']):
            continue

        files_modified += 1
        # Always print the file name, including for type-only fixes.
        print(f"✓ {filepath.name}")
        if stats['name_fixed']:
            files_fixed_names += 1
            print(f" Name: '{stats['old_name']}' → '{stats['new_name']}'")
        if stats['types_fixed']:
            files_fixed_types += 1
            print(f" Removed types: {stats['removed_types']}")
        print()

    print("=" * 60)
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" Files with name fixed: {files_fixed_names}")
    print(f" Files with types fixed: {files_fixed_types}")
    print(f" Total files modified: {files_modified}")
    if files_failed:
        print(f" Files skipped due to errors: {files_failed}")

    if args.dry_run:
        print()
        print("Run without --dry-run to apply changes.")

    return 1 if files_failed else 0


if __name__ == '__main__':
    sys.exit(main())