glam/scripts/fix_generic_platform_names_fast.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

97 lines
3.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fast fix for generic platform names - processes only files from stdin or file list.
"""
import yaml
import sys
from pathlib import Path
from datetime import datetime, timezone
# Placeholder platform names emitted by earlier crawls (e.g. a page's <title>
# of "Home" or Dutch "Welkom"); they identify nothing and should be replaced
# with "<organisation> Website".
GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'}
# platform_type labels that are not part of the valid taxonomy and must be
# stripped from platform_type lists.
INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'}
def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix generic platform names and invalid platform types in one YAML file.

    The file is expected to hold a top-level ``digital_platform_v2`` mapping.
    A generic ``platform_name`` (see GENERIC_NAMES) is replaced with
    "<organisation> Website" using the first available organisation name from
    ``original_entry``, ``museum_register_enrichment`` or
    ``wikidata_enrichment``. Entries in ``platform_type`` that appear in
    INVALID_TYPES are removed.

    Args:
        filepath: Path to the YAML file to inspect (and possibly rewrite).
        dry_run: When True, compute and report fixes without writing the file.

    Returns:
        Stats dict with keys ``name_fixed``, ``types_fixed``, ``old_name``,
        ``new_name`` and ``removed_types``.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'removed_types': [],
    }
    with open(filepath, 'r', encoding='utf-8') as f:
        # Stream straight into the parser; no need to buffer the whole text.
        data = yaml.safe_load(f)
    if not data or 'digital_platform_v2' not in data:
        return stats
    dpv2 = data['digital_platform_v2']
    modified = False

    # --- Fix generic platform names ------------------------------------
    current_name = dpv2.get('platform_name', '')
    if current_name in GENERIC_NAMES:
        # Fallback chain for the organisation name. The `or {}` guards
        # against keys that exist with a null YAML value (e.g. a bare
        # "original_entry:" line), which would otherwise raise
        # AttributeError on .get().
        org_name = (
            (data.get('original_entry') or {}).get('organisatie')
            or (data.get('museum_register_enrichment') or {}).get('museum_name')
            or (data.get('wikidata_enrichment') or {}).get('wikidata_label_nl')
        )
        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name
            modified = True

    # --- Fix invalid platform types ------------------------------------
    types = dpv2.get('platform_type')
    if isinstance(types, list):
        removed = [t for t in types if t in INVALID_TYPES]
        if removed:
            kept = [t for t in types if t not in INVALID_TYPES]
            stats['removed_types'] = removed
            stats['types_fixed'] = True
            # Never leave an empty type list; fall back to a sane default.
            dpv2['platform_type'] = kept or ['INSTITUTIONAL_WEBSITE']
            modified = True

    if modified:
        # Stamp the fix time (UTC, ISO-8601) even on dry runs — the stamp
        # only reaches disk when we actually write below.
        dpv2.setdefault('_transformation_metadata', {})['quality_fix_date'] = (
            datetime.now(timezone.utc).isoformat()
        )
        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)
    return stats
def main():
    """Apply fixes to a batch of YAML files and print a summary.

    Usage:
        fix_generic_platform_names_fast.py [FILE_LIST] [--dry-run]

    FILE_LIST is a text file with one YAML path per line; when omitted,
    paths are read from stdin. Missing files are skipped silently.
    """
    dry_run = '--dry-run' in sys.argv
    # First positional (non-flag) argument, if any, is the file-list path.
    file_list = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith('--') else None
    if file_list:
        with open(file_list) as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        files = [Path(line.strip()) for line in sys.stdin if line.strip()]

    fixed_names = 0
    fixed_types = 0
    for filepath in files:
        if not filepath.exists():
            continue
        stats = fix_file(filepath, dry_run=dry_run)
        if stats['name_fixed']:
            fixed_names += 1
            # BUGFIX: the separator between old and new name was missing,
            # printing "'Old''New'" — restore an explicit arrow.
            print(f"{filepath.name}: '{stats['old_name']}' -> '{stats['new_name']}'")
        if stats['types_fixed']:
            fixed_types += 1
            print(f" Removed: {stats['removed_types']}")
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Fixed: {fixed_names} names, {fixed_types} type lists")


if __name__ == '__main__':
    main()