# glam/scripts/remove_wikidata_duplicates.py

#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.

These files have incorrect Wikidata entity ID assignments where the same Q-number
was incorrectly assigned to multiple different institutions.

The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""

import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml

# Dedicated dumper subclass: the custom string representer is registered on
# this class (see the add_representer call below) rather than on
# yaml.SafeDumper itself. Key order in the output is kept by passing
# sort_keys=False when dumping, not by this class.
class OrderedDumper(yaml.SafeDumper):
    """SafeDumper subclass that carries this script's custom representers."""
    pass

def represent_ordereddict(dumper, data):
    """Represent *data* as a standard YAML map built from its ``items()`` view.

    Args:
        dumper: Object exposing ``represent_mapping(tag, items)``.
        data: Mapping-like object exposing ``items()``.

    Returns:
        Whatever ``dumper.represent_mapping`` returns for the map tag.

    NOTE(review): no registration of this representer is visible in this
    file; confirm whether it is still needed.
    """
    map_tag = 'tag:yaml.org,2002:map'
    pairs = data.items()
    return dumper.represent_mapping(map_tag, pairs)

def str_representer(dumper, data):
    """Represent a string scalar, requesting block style for multi-line text.

    Args:
        dumper: Object exposing ``represent_scalar(tag, value, style=...)``.
        data: The string to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns. Strings containing a
        newline are passed with ``style='|'``; all others are passed without
        a style argument so the dumper picks its default.
    """
    # Only multi-line strings ask for the literal block style.
    style_kwargs = {'style': '|'} if '\n' in data else {}
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, **style_kwargs)

# Route every str dumped through OrderedDumper via str_representer, so that
# multi-line strings are requested in literal block ('|') style.
OrderedDumper.add_representer(str, str_representer)

def _append_provenance_note(provenance: dict, note: str) -> None:
    """Add *note* to ``provenance['notes']`` in place.

    A list of notes gets the note appended; a non-empty string gets the note
    joined after a blank line; a missing or empty value is replaced by the
    note itself.
    """
    existing_notes = provenance.get('notes', '')
    if isinstance(existing_notes, list):
        existing_notes.append(note)
    elif existing_notes:
        provenance['notes'] = existing_notes + '\n\n' + note
    else:
        provenance['notes'] = note


def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove the ``wikidata_enrichment`` key from a YAML file and record why.

    The file is parsed with ``yaml.safe_load``, the ``wikidata_enrichment``
    entry is dropped, a timestamped note is added under ``provenance.notes``,
    and the document is written back with ``OrderedDumper``. Comments and
    original formatting in the YAML file are not preserved by this round trip.

    The file is only rewritten when the removal can also be documented: a
    ``provenance`` value that is neither missing, null, nor a mapping makes
    the function return an error and leave the file untouched.

    Args:
        file_path: Path of the YAML file to modify in place.

    Returns:
        tuple of (success: bool, message: str). ``success`` is False when the
        file is empty, has no ``wikidata_enrichment`` key (message starts with
        "No wikidata_enrichment"), has an unexpected structure, or when any
        exception occurs (message starts with "Error: ").
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)

        if data is None:
            return False, "Empty or invalid YAML"

        # A list or scalar at the top level cannot carry the key; report it
        # explicitly rather than relying on `in` / `del` semantics of that type.
        if not isinstance(data, dict):
            return False, f"Error: top-level YAML is {type(data).__name__}, expected a mapping"

        # Check if wikidata_enrichment exists
        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Validate provenance BEFORE mutating anything, so the enrichment is
        # never removed without a note explaining the removal.
        provenance = data.get('provenance')
        if provenance is None:
            # Covers both a missing key and an explicit null value.
            provenance = {}
        elif not isinstance(provenance, dict):
            return False, (
                f"Error: provenance is {type(provenance).__name__}, "
                "expected a mapping; file left unchanged"
            )

        # Remove wikidata_enrichment, keeping the old ID for documentation
        enrichment = data.pop('wikidata_enrichment')
        old_wikidata_id = enrichment.get('wikidata_id') if isinstance(enrichment, dict) else None

        # Build the provenance note
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). "
        note += "Re-enrichment required with proper matching."

        _append_provenance_note(provenance, note)
        # Assigning to an existing key keeps its position; a new key goes last.
        data['provenance'] = provenance

        # Serialize fully before opening the file for writing: opening with
        # 'w' truncates it, so a failure during dumping must happen first.
        serialized = yaml.dump(data, Dumper=OrderedDumper, allow_unicode=True,
                               default_flow_style=False, sort_keys=False, width=120)

        # Write back
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        # Per-file boundary: report the failure to the caller instead of
        # aborting the whole batch.
        return False, f"Error: {e}"


def main(
    list_file: Path = Path('/tmp/wikidata_duplicates_to_clean.txt'),
    base_dir: Path = Path('/Users/kempersc/apps/glam/data/custodian'),
) -> None:
    """Clean every file named in *list_file* and print a summary.

    Args:
        list_file: Text file with one filename per line; blank lines are
            ignored and surrounding whitespace is stripped. Read as UTF-8.
        base_dir: Directory the listed filenames are resolved against.

    Exits with status 1 if *list_file* does not exist. Per-file failures are
    counted and reported but do not stop the run.
    """
    # Read the list of files to clean
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    # Explicit encoding: the default is locale-dependent, while the data
    # files themselves are always read as UTF-8.
    with open(list_file, 'r', encoding='utf-8') as f:
        filenames = [line.strip() for line in f if line.strip()]

    total = len(filenames)
    print(f"Found {total} files to clean")

    success_count = 0
    skip_count = 0
    error_count = 0

    for position, filename in enumerate(filenames, start=1):
        progress = f"[{position}/{total}]"
        file_path = base_dir / filename

        if not file_path.exists():
            print(f"{progress} SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)

        if success:
            print(f"{progress} OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            # Nothing to remove is treated as a skip, not as a failure.
            print(f"{progress} SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"{progress} ERROR: {filename} - {message}")
            error_count += 1

    print("\n=== Summary ===")
    print(f"Total files: {total}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")


# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()