# glam/scripts/reclassify_non_dutch_pending.py

#!/usr/bin/env python3
"""
Reclassify NL-XX-XXX-PENDING files that are clearly from other countries.

This script:
1. Scans NL PENDING files for country-specific indicators
2. Renames files with correct country prefixes
3. Logs all changes

Usage:
    python scripts/reclassify_non_dutch_pending.py --dry-run
    python scripts/reclassify_non_dutch_pending.py
"""

import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple

# Directory that holds the custodian YAML files; PENDING files are globbed from
# here and renamed files are written back here.
# NOTE(review): hard-coded absolute path into one user's home directory — the
# script will not work unchanged on another machine; consider making it configurable.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Country detection patterns: (regex, country code) pairs.
#
# detect_country() tries these top to bottom with re.IGNORECASE and returns the
# code of the FIRST pattern that matches, so an earlier entry shadows every
# later one. KNOWN_ORGS is consulted before any of these patterns.
#
# NOTE(review): because matching is case-insensitive, the short acronyms
# (UK, GIA, DAI, EFR, ERIC) also match ordinary words or names spelled the same
# way, e.g. "Eric" hits the 'EU' pattern.
# NOTE(review): several place names look ambiguous (e.g. "Bergen", "Victoria",
# "Valencia", "Cambridge") and may flag Dutch organisations as foreign —
# confirm against real data before a non-dry run.
# NOTE(review): "British School at Athens" in the Greece pattern can never win
# here, since "British" in the UK pattern above matches first; that name only
# resolves to 'GR' through KNOWN_ORGS.
COUNTRY_PATTERNS = [
    # Indonesia
    (r'\b(Indonesia|ANRI|Arsip Nasional|Indonesian|Jakarta|Yogyakarta|Bandung|Bali|Java)\b', 'ID'),
    # Germany
    (r'\b(German|Germany|Deutschland|Deutsches?|Berlin|München|Munich|Hamburg|Frankfurt|Marburg|Köln|Cologne|Heidelberg|DAI)\b', 'DE'),
    # France
    (r'\b(French|France|Français|Française|Paris|Lyon|Marseille|Rennes|Toulouse|Bordeaux|EFR|École française)\b', 'FR'),
    (r'\b(Musée|Château|Bibliothèque nationale|Archives de|Centre des monuments nationaux)\b', 'FR'),
    # Italy
    (r'\b(Italian|Italy|Italia|Italiano|Roma|Rome|Milano|Milan|Venezia|Venice|Firenze|Florence|Venaria Reale)\b', 'IT'),
    # UK
    (r'\b(British|Britain|UK|United Kingdom|England|London|Edinburgh|Scotland|Wales|Oxford|Cambridge|Historic Royal Palaces)\b', 'GB'),
    # USA
    (r'\b(American|America|USA|United States|Washington|New York|California|Smithsonian|Library of Congress|GIA)\b', 'US'),
    # Australia
    (r'\b(Australian|Australia|Sydney|Melbourne|Brisbane|Victoria|Queensland|Canberra)\b', 'AU'),
    # Spain
    (r'\b(Spanish|Spain|España|Español|Madrid|Barcelona|Sevilla|Valencia|Bilbao)\b', 'ES'),
    # Portugal
    (r'\b(Portuguese|Portugal|Lisboa|Lisbon|Porto|Portuguesa)\b', 'PT'),
    # Greece
    (r'\b(Greek|Greece|Athens|British School at Athens)\b', 'GR'),
    # Denmark
    (r'\b(Danish|Denmark|København|Copenhagen|Fonden)\b', 'DK'),
    # Belgium
    (r'\b(Belgian|Belgium|Belgique|België|Brussels|Bruxelles|Antwerp|Antwerpen|Gent|Ghent|ADVN)\b', 'BE'),
    # International orgs (not Dutch); 'EU' is used as a bucket code, not a country
    (r'\b(DARIAH|ERIC|European Union|IUCN|International Society)\b', 'EU'),
    # Saudi Arabia
    (r'\b(Saudi|Arabia|Saudi Arabian|Riyadh|Jeddah|Arabian Oud)\b', 'SA'),
    # Japan
    (r'\b(Japanese|Japan|Tokyo|Kyoto|Osaka)\b', 'JP'),
    # China
    (r'\b(Chinese|China|Beijing|Shanghai)\b', 'CN'),
    # India
    (r'\b(Indian|India|Mumbai|Delhi|Bangalore|Kolkata)\b', 'IN'),
    # Kenya
    (r'\b(Kenyan|Kenya|Nairobi)\b', 'KE'),
    # Israel
    (r'\b(Israeli|Israel|Jerusalem|Tel Aviv)\b', 'IL'),
    # Norway
    (r'\b(Norwegian|Norway|Oslo|Bergen)\b', 'NO'),
]

# Known organizations with their countries.
#
# detect_country() checks this table before COUNTRY_PATTERNS. Each key is
# matched as a case-insensitive SUBSTRING of the organization name, and the
# first hit in insertion order wins, so an entry here overrides whatever the
# regex patterns would have returned (e.g. 'British School at Athens' -> 'GR'
# even though "British" would otherwise match the UK pattern).
KNOWN_ORGS = {
    'Anne Frank Educational Center': 'DE',  # Frankfurt
    'Archives de Rennes': 'FR',
    'Arsip Nasional Republik Indonesia': 'ID',
    'Bildarchiv Foto Marburg': 'DE',
    'British School at Athens': 'GR',
    'British Trust for Ornithology': 'GB',
    'Centre des monuments nationaux': 'FR',
    'Centro Conservazione Restauro La Venaria Reale': 'IT',
    'Château de Chantilly': 'FR',
    'Deutsches Archäologisches Institut': 'DE',
    'École française de Rome': 'IT',  # Based in Rome
    'European Museum Academy': 'EU',
    'GIA Gemological Institute of America': 'US',
    'Historic Royal Palaces': 'GB',
    'IRHT Institut de recherche et d\'histoire des textes': 'FR',
    'Archaeological Research Services Ltd': 'GB',
    'Australian Museums and Galleries Association Victoria': 'AU',
    'Australian Society of Archivists': 'AU',
    'Art:1 New Museum': 'ID',  # Jakarta
    'Art Zoo Museum': 'ID',  # Jakarta
    'Asmat Museum of Culture and Progress': 'ID',
    'Arabian Oud': 'SA',
    'Bonhams': 'GB',
    'CIFOR': 'ID',  # Bogor, Indonesia
    # NOTE(review): DARIAH ERIC's statutory seat is, as far as I know, in
    # France rather than Germany — confirm that 'DE' is the intended code.
    'DARIAH-ERIC': 'DE',  # Based in Germany
    'Fisheries Resource Center of Indonesia': 'ID',
    'Common Wadden Sea Secretariat': 'DE',  # Wilhelmshaven
    'International Society of Arboriculture': 'US',
    'IUCN SSC Shark Specialist Group': 'CA',  # Based in Canada
    'Augustinus Fonden': 'DK',
}


def detect_country(emic_name: str) -> Optional[str]:
    """Detect the country code implied by an organization name.

    The curated ``KNOWN_ORGS`` table is consulted first (case-insensitive
    substring match, first entry in insertion order wins). If nothing matches,
    the regexes in ``COUNTRY_PATTERNS`` are tried in order with
    ``re.IGNORECASE`` and the code of the first matching pattern is returned.

    Args:
        emic_name: The organization's name as recorded in the custodian file.

    Returns:
        The matched country code, or ``None`` when the name is empty/``None``
        or no known organization or pattern matches.
    """
    # An empty or missing name cannot match anything; bail out before calling
    # string methods on it.
    if not emic_name:
        return None

    # Lower-case the name once instead of once per KNOWN_ORGS entry.
    lowered_name = emic_name.lower()

    # Check known orgs first: these are explicit overrides of the heuristics.
    for org, country in KNOWN_ORGS.items():
        if org.lower() in lowered_name:
            return country

    # Fall back to the heuristic patterns; the first match wins.
    for pattern, country in COUNTRY_PATTERNS:
        if re.search(pattern, emic_name, re.IGNORECASE):
            return country

    return None


def process_file(filepath: Path, dry_run: bool = True) -> Optional[Tuple[str, str]]:
    """Reclassify a single NL PENDING file whose name points at another country.

    The file's ``custodian_name.emic_name`` is passed to ``detect_country``.
    When a non-NL country is detected, the ``NL-XX-XXX-PENDING`` part of the
    file name is replaced with that country's code, ``location.country`` is
    set, reclassification provenance is recorded under ``ghcid_resolution``,
    and the file is rewritten and moved to its new name inside
    ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the PENDING YAML file to inspect.
        dry_run: When true, only print what would happen; nothing on disk is
            modified.

    Returns:
        ``None`` when the file is skipped (no usable name, no foreign country
        detected, or an error occurred). Otherwise a ``(name, country)`` tuple
        where ``name`` is the new file name, or the literal ``'dry_run'`` in
        dry-run mode.

    Any exception raised while handling the file is reported on stdout and
    turned into a ``None`` result so one bad file does not stop a batch run.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file or a scalar/list document has no custodian name.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: expected a YAML mapping, got {type(data).__name__}")
            return None

        # Tolerate `custodian_name:` being null or not a mapping.
        custodian = data.get('custodian_name')
        emic_name = custodian.get('emic_name') if isinstance(custodian, dict) else None
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Detect country
        country = detect_country(emic_name)
        if not country or country == 'NL':
            return None

        # Generate new filename: swap the NL prefix for the detected country code
        old_name = filepath.name
        new_name = old_name.replace('NL-XX-XXX-PENDING', f'{country}-XX-XXX-PENDING')
        new_path = CUSTODIAN_DIR / new_name

        # Avoid collision: keep bumping the numeric suffix until the name is
        # free. Trying only "-2" would let a third colliding file overwrite
        # the second one when it is moved into place.
        if new_path.exists():
            stem = new_path.stem
            suffix = 2
            while True:
                new_name = f"{stem}-{suffix}.yaml"
                new_path = CUSTODIAN_DIR / new_name
                if not new_path.exists():
                    break
                suffix += 1

        if dry_run:
            print(f"[WOULD RECLASSIFY] {emic_name}")
            print(f"  NL -> {country}")
            print(f"  {old_name} -> {new_name}")
            return ('dry_run', country)

        # Update data. `or {}` also covers keys that are present but null.
        location = data.get('location') or {}
        location['country'] = country
        data['location'] = location

        # Add reclassification provenance
        resolution = data.get('ghcid_resolution') or {}
        resolution['reclassified_from'] = 'NL'
        resolution['reclassified_to'] = country
        resolution['reclassified_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution'] = resolution

        # Serialize before opening the file for writing, so a serialization
        # failure cannot leave the source file truncated.
        serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        shutil.move(filepath, new_path)
        print(f"[RECLASSIFIED] {emic_name}: NL -> {country}")

        return (new_name, country)

    except Exception as e:
        # Per-file boundary of a batch job: report and continue with the next file.
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main():
    """Command-line entry point.

    Parses ``--dry-run``, runs ``process_file`` over every
    ``NL-XX-XXX-PENDING-*.yaml`` file in ``CUSTODIAN_DIR`` and prints the total
    plus a per-country breakdown, most frequent country first.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    options = cli.parse_args()

    # Collect the candidates up front: the count is printed, and files get
    # renamed while the list is being walked.
    candidates = list(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Scanning {len(candidates)} NL PENDING files for non-Dutch organizations...\n")

    # Tally of handled files keyed by detected country code.
    per_country = {}
    handled = 0
    for candidate in candidates:
        outcome = process_file(candidate, dry_run=options.dry_run)
        if not outcome:
            continue
        handled += 1
        _, code = outcome
        per_country[code] = per_country.get(code, 0) + 1

    heading = 'Would reclassify' if options.dry_run else 'Reclassified'
    print(f"\n{heading}: {handled}")

    if not per_country:
        return

    print("\nBy country:")
    ranked = sorted(per_country.items(), key=lambda entry: -entry[1])
    for code, total in ranked:
        print(f"  {code}: {total}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()