glam/scripts/fix_remaining_ar_xx_xxx.py
2025-12-21 00:01:54 +01:00

416 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Fix remaining AR-XX-XXX institution files with researched locations.
Research findings (2025-12-19):
1. Archivo Inundación (Q125055212) - Digital archive, mark as VIRTUAL
2. Sala de Arte Emilio Saraco (Q106075183) - Neuquén city, AR-Q-NEU
3. Galería Kramer (Q136031976) - Buenos Aires (CABA), AR-C-BUE
4. Le Passé Ltd (Q135997285) - MULTINATIONAL (US/MX/AR), needs special handling
5. La Passe, Ltd (Q136003694) - Likely duplicate of Q135997285, same owners
This script handles cases 1-3. Cases 4-5 need manual review for multinational handling.
"""
import os
import sys
import uuid
import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import yaml
# Configure YAML to preserve formatting
def str_representer(dumper, data):
    """Represent multi-line strings as YAML literal block scalars ('|')."""
    tag = 'tag:yaml.org,2002:str'
    if '\n' not in data:
        # Single-line strings keep PyYAML's default (plain/quoted) style.
        return dumper.represent_scalar(tag, data)
    return dumper.represent_scalar(tag, data, style='|')
# Register the representer on PyYAML's default Dumper so the yaml.dump()
# calls below render multi-line strings as readable block scalars.
yaml.add_representer(str, str_representer)
# All custodian YAML records live under <project root>/data/custodian.
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
# GHCID namespace for UUID v5 generation.
# NOTE: this value is the RFC 4122 DNS namespace, reused as the GHCID base.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # DNS namespace as base

def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Derive the deterministic UUID v5 for *ghcid_string* under GHCID_NAMESPACE."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 (SHA-256 based) from a GHCID string.

    Takes the first 16 bytes of SHA-256(ghcid_string), then forces the
    RFC version/variant fields so the result is a well-formed UUIDv8.
    """
    digest = bytearray(hashlib.sha256(ghcid_string.encode()).digest()[:16])
    digest[6] = 0x80 | (digest[6] & 0x0F)  # high nibble of byte 6 -> version 8
    digest[8] = 0x80 | (digest[8] & 0x3F)  # top bits of byte 8 -> variant 10xx
    return str(uuid.UUID(bytes=bytes(digest)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID: big-endian int of the first 8 SHA-256 bytes."""
    leading_bytes = hashlib.sha256(ghcid_string.encode()).digest()[:8]
    return int.from_bytes(leading_bytes, byteorder='big')
# Institutions to FIX with resolved locations
# Each entry drives fix_resolved_institution(): the old AR-XX-XXX file is
# located (by name, or by Wikidata ID fallback), its GHCID is re-issued for
# the researched location, optional field updates are applied, and the file
# is renamed to `new_file`.
RESOLVED_INSTITUTIONS = [
{
"old_file": "AR-XX-XXX-G-ESAG.yaml", # Actually it's AR-XX-XXX-M-ESAG.yaml
"wikidata_id": "Q106075183",
"new_ghcid": "AR-Q-NEU-G-SAES", # Neuquén, Gallery, Sala Arte Emilio Saraco
"new_file": "AR-Q-NEU-G-SAES.yaml",
"location": {
"country": "AR",
"region_code": "Q",
"region_name": "Neuquén",
"city": "Neuquén",
"city_code": "NEU",
"street_address": "Av. Olascoaga y Vías del Ferrocarril",
"latitude": -38.9516,
"longitude": -68.0591,
},
# Provenance of the location research (copied into ghcid.location_resolution).
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "web", "url": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/"},
{"type": "web", "url": "https://www.instagram.com/salaemiliosaraco/"},
],
"notes": "Municipal cultural space in former railway warehouse. Correct name: Sala de Arte Emilio Saraco",
},
# Extra top-level fields merged into the record (custodian_name merges into
# the existing dict instead of replacing it — see fix_resolved_institution).
"updates": {
"institution_type": "GALLERY", # Correct from MUSEUM to GALLERY
"custodian_name": {
"claim_value": "Sala de Arte Emilio Saraco",
"emic_name": "Sala de Arte Emilio Saraco",
},
"website": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/",
}
},
{
"old_file": "AR-XX-XXX-G-GK.yaml",
"wikidata_id": "Q136031976",
"new_ghcid": "AR-C-BUE-G-GK", # CABA, Buenos Aires, Gallery
"new_file": "AR-C-BUE-G-GK.yaml",
# No street address / coordinates known for this one — only city-level data.
"location": {
"country": "AR",
"region_code": "C",
"region_name": "Ciudad Autónoma de Buenos Aires",
"city": "Buenos Aires",
"city_code": "BUE",
},
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "wikidata", "id": "Q136031976", "description": "art dealership in Buenos Aires"},
{"type": "web", "url": "https://www.facebook.com/kramerartgallery/"},
{"type": "academic", "note": "Referenced in art catalogs from 1980s-1990s"},
],
"notes": "Historical art gallery in Buenos Aires. Website: kramerartgallery.com. May be historical/closed.",
},
"updates": {
"website": "https://www.kramerartgallery.com",
}
},
]
# Virtual/Digital institutions to mark
# Each entry drives mark_virtual_institution(): the file keeps its AR-XX-XXX
# GHCID (flagged as intentional) and gains a location_type of VIRTUAL.
VIRTUAL_INSTITUTIONS = [
{
"file": "AR-XX-XXX-A-AI.yaml",
"wikidata_id": "Q125055212",
"location_type": "VIRTUAL",
"location_type_reason": "Digital archive project documenting the 2003 Santa Fe flood. Born-digital archival platform with no physical location. Based on community-contributed digital materials.",
"updates": {
"website": "https://archivoinundacion.ar/",
},
# Provenance of the VIRTUAL determination (merged into location_resolution).
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "web", "url": "https://archivoinundacion.ar/"},
{"type": "web", "url": "https://commons.wikimedia.org/wiki/Commons:Archivo_Inundación_-_20_a%C3%B1os"},
{"type": "wikidata", "id": "Q125055212", "description": "Archives digitisation project about the 2003 flood in Santa Fe, Argentina"},
],
"notes": "Digital archival project commemorating 20 years of the 2003 Santa Fe flood. Community-driven digitization initiative.",
}
},
]
# Multinational institutions that need special handling (NOT auto-fixed)
# This text is printed verbatim at the end of main(); it is also the
# rationale behind mark_multinational_for_review() below.
MULTINATIONAL_NOTES = """
## Multinational Art Dealers - Manual Review Required
### Le Passé Ltd (Q135997285) and La Passe, Ltd (Q136003694)
**Finding**: These appear to be the SAME entity with variant spellings.
- Both owned by Paula de Koenigsberg (Q135891878) and Nicolas de Koenigsberg (Q135997213)
- Wikidata lists countries: United States, Mexico, Argentina
- Q136003694 has more properties (Getty ULAN ID: 12916, has collection at J. Paul Getty Museum)
**Recommendation**:
1. Merge Wikidata entries (Q135997285 → Q136003694 as primary)
2. Create separate custodian files for each country of operation:
- US-XX-XXX-G-LPL.yaml (New York office)
- MX-XX-XXX-G-LPL.yaml (Mexico office)
- AR-C-BUE-G-LPL.yaml (Buenos Aires office)
3. Link them via `related_organizations` field
**For now**: Mark both files with `status: MULTINATIONAL_REVIEW_NEEDED`
"""
def _find_source_file(config: dict) -> Path:
    """Locate the on-disk YAML file for *config*.

    Tries the configured filename first; if that file does not exist, scans
    every unresolved AR-XX-XXX file and matches on the stored Wikidata
    entity ID instead (handles files whose type letter differs from the
    configured name, e.g. M vs G for ESAG).

    Returns the configured (non-existent) path when nothing matches, so the
    caller must still check .exists().
    """
    old_path = DATA_DIR / config["old_file"]
    if old_path.exists():
        return old_path
    for candidate in DATA_DIR.glob("AR-XX-XXX-*.yaml"):
        with open(candidate, 'r', encoding='utf-8') as fp:
            loaded = yaml.safe_load(fp)
        # safe_load returns None for an empty file — guard before .get().
        if loaded and loaded.get('wikidata_enrichment', {}).get('wikidata_entity_id') == config['wikidata_id']:
            return candidate
    return old_path


def fix_resolved_institution(config: dict, dry_run: bool = True) -> bool:
    """Fix a single resolved institution.

    Re-issues the GHCID for the researched location: archives the outgoing
    GHCID in ghcid_history, installs the new GHCID plus derived identifiers,
    rewrites the location/location_resolution blocks, applies any extra
    field updates from config['updates'], and renames the file.

    Returns True on success (or a successful dry-run preview), False when
    the source file cannot be found.
    """
    # Find the actual file (handle M vs G type mismatch via Wikidata ID).
    old_path = _find_source_file(config)
    if not old_path.exists():
        print(f"❌ File not found: {config['old_file']} (wikidata: {config['wikidata_id']})")
        return False
    new_path = DATA_DIR / config["new_file"]
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Processing: {old_path.name}")
    print(f" → New GHCID: {config['new_ghcid']}")
    print(f" → New file: {config['new_file']}")
    # Load existing data
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    old_ghcid = data['ghcid']['ghcid_current']
    timestamp = datetime.now(timezone.utc).isoformat()
    # Generate the new identifiers derived from the new GHCID string.
    new_uuid = generate_ghcid_uuid(config['new_ghcid'])
    new_uuid_sha256 = generate_ghcid_uuid_sha256(config['new_ghcid'])
    new_numeric = generate_ghcid_numeric(config['new_ghcid'])
    # Archive the outgoing GHCID before overwriting it. valid_from falls back
    # to the record's processing_timestamp when no generation_timestamp exists.
    old_ghcid_entry = {
        'ghcid': old_ghcid,
        'ghcid_uuid': data['ghcid']['ghcid_uuid'],
        'ghcid_uuid_sha256': data['ghcid']['ghcid_uuid_sha256'],
        'ghcid_numeric': data['ghcid']['ghcid_numeric'],
        'valid_from': data['ghcid'].get('generation_timestamp', data.get('processing_timestamp')),
        'valid_to': timestamp,
        'reason': f"Location resolved via web research. New location: {config['location']['city']}, {config['location']['region_name']}"
    }
    data['ghcid'].setdefault('ghcid_history', []).append(old_ghcid_entry)
    # Install the new GHCID and derived identifiers.
    data['ghcid']['ghcid_current'] = config['new_ghcid']
    data['ghcid']['ghcid_uuid'] = new_uuid
    data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
    data['ghcid']['ghcid_numeric'] = new_numeric
    data['ghcid']['generation_timestamp'] = timestamp
    # Record how/where the location was resolved.
    loc = config['location']
    data['ghcid']['location_resolution'] = {
        'method': config['resolution']['method'],
        'country_code': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city_code': loc['city_code'],
        'city_name': loc['city'],
        'research_date': config['resolution']['research_date'],
        'research_sources': config['resolution']['research_sources'],
        'notes': config['resolution']['notes'],
    }
    # Rebuild the location block; address/coordinates only when researched.
    data['location'] = {
        'country': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city': loc['city'],
    }
    if 'street_address' in loc:
        data['location']['street_address'] = loc['street_address']
    if 'latitude' in loc and 'longitude' in loc:
        data['location']['latitude'] = loc['latitude']
        data['location']['longitude'] = loc['longitude']
    # Apply additional updates; custodian_name merges into the existing dict
    # so sibling keys (e.g. source metadata) are preserved.
    if 'updates' in config:
        for key, value in config['updates'].items():
            if key == 'custodian_name' and isinstance(value, dict):
                data['custodian_name'].update(value)
            else:
                data[key] = value
    # Update processing timestamp
    data['processing_timestamp'] = timestamp
    if dry_run:
        # Fixed: the original preview printed the two names with no separator.
        print(f" Would rename: {old_path.name} → {new_path.name}")
        print(f" New UUID: {new_uuid}")
        return True
    # Write updated data
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Remove old file (skip when the rename is a no-op).
    if old_path != new_path:
        old_path.unlink()
    print(f" ✅ Created: {new_path.name}")
    return True
def mark_virtual_institution(config: dict, dry_run: bool = True) -> bool:
    """Mark an institution as VIRTUAL (intentionally un-geolocated).

    Keeps the AR-XX-XXX GHCID but flags it as intentional, records the
    research provenance under ghcid.location_resolution, and stamps the
    location block with location_type/location_type_reason.

    Returns True on success (or dry-run preview), False if the file is missing.
    """
    file_path = DATA_DIR / config["file"]
    if not file_path.exists():
        print(f"❌ File not found: {config['file']}")
        return False
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking as VIRTUAL: {file_path.name}")
    # Load existing data
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    timestamp = datetime.now(timezone.utc).isoformat()
    # Add location_type. setdefault guards against records that lack a
    # location block entirely (the original raised KeyError here).
    location = data.setdefault('location', {})
    location['location_type'] = config['location_type']
    location['location_type_reason'] = config['location_type_reason']
    # Mark as intentional XX-XXX; create the nested blocks when absent.
    resolution = data.setdefault('ghcid', {}).setdefault('location_resolution', {})
    resolution['intentional_xx_xxx'] = True
    resolution['research_date'] = config['resolution']['research_date']
    resolution['research_sources'] = config['resolution']['research_sources']
    resolution['notes'] = config['resolution']['notes']
    # Apply additional updates
    if 'updates' in config:
        for key, value in config['updates'].items():
            data[key] = value
    # Update processing timestamp
    data['processing_timestamp'] = timestamp
    if dry_run:
        print(f" Would mark as {config['location_type']}")
        return True
    # Write updated data
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f" ✅ Marked as {config['location_type']}")
    return True
def mark_multinational_for_review(dry_run: bool = True) -> None:
    """Mark multinational institutions for manual review.

    Adds a review_status block (status MULTINATIONAL_REVIEW_NEEDED) to the
    two Le Passé / La Passe files; see MULTINATIONAL_NOTES for the rationale.
    Missing files are reported and skipped.
    """
    multinational_files = [
        "AR-XX-XXX-G-LPL.yaml",  # Le Passé Ltd
        "AR-XX-XXX-G-PL.yaml",   # La Passe, Ltd
    ]
    timestamp = datetime.now(timezone.utc).isoformat()
    for filename in multinational_files:
        file_path = DATA_DIR / filename
        if not file_path.exists():
            # Fixed: originally printed the literal "(unknown)" instead of the name.
            print(f"❌ File not found: {filename}")
            continue
        print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking for review: {file_path.name}")
        if dry_run:
            print(" Would add status: MULTINATIONAL_REVIEW_NEEDED")
            continue
        # Load existing data
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Add review status
        data['review_status'] = {
            'status': 'MULTINATIONAL_REVIEW_NEEDED',
            'review_date': timestamp,
            'review_notes': (
                "This art dealer operated in multiple countries (US, Mexico, Argentina). "
                "Wikidata entries Q135997285 and Q136003694 appear to be the same entity "
                "with variant spelling (Le Passé Ltd vs La Passe, Ltd). "
                "Owned by Paula and Nicolas de Koenigsberg. "
                "Needs: 1) Wikidata merge, 2) Separate files per country, 3) Cross-linking."
            ),
        }
        # Update processing timestamp
        data['processing_timestamp'] = timestamp
        # Write updated data
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f" ✅ Marked for review")
def main():
    """CLI entry point: apply all researched fixes to the AR-XX-XXX files."""
    import argparse
    parser = argparse.ArgumentParser(description="Fix remaining AR-XX-XXX institution files")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    dry_run = parser.parse_args().dry_run
    banner = "=" * 60
    print(banner)
    print("Fix Remaining AR-XX-XXX Institution Files")
    print(banner)
    if dry_run:
        print("\n⚠️ DRY RUN MODE - No changes will be made\n")
    # Step 1: rename/re-GHCID institutions whose location was resolved.
    print("\n--- Resolving Locations ---")
    for entry in RESOLVED_INSTITUTIONS:
        fix_resolved_institution(entry, dry_run)
    # Step 2: flag born-digital archives as VIRTUAL.
    print("\n--- Marking Virtual/Digital Institutions ---")
    for entry in VIRTUAL_INSTITUTIONS:
        mark_virtual_institution(entry, dry_run)
    # Step 3: flag the multinational dealers for manual follow-up.
    print("\n--- Marking Multinational for Review ---")
    mark_multinational_for_review(dry_run)
    # Print the manual-review guidance for the multinational cases.
    print("\n" + banner)
    print(MULTINATIONAL_NOTES)
    print("\n" + banner)
    if dry_run:
        print("DRY RUN COMPLETE - Run without --dry-run to apply changes")
    else:
        print("PROCESSING COMPLETE")


if __name__ == "__main__":
    main()