#!/usr/bin/env python3
"""
Fix remaining AR-XX-XXX institution files with researched locations.

Research findings (2025-12-19):
1. Archivo Inundación (Q125055212) - Digital archive, mark as VIRTUAL
2. Sala de Arte Emilio Saraco (Q106075183) - Neuquén city, AR-Q-NEU
3. Galería Kramer (Q136031976) - Buenos Aires (CABA), AR-C-BUE
4. Le Passé Ltd (Q135997285) - MULTINATIONAL (US/MX/AR), needs special handling
5. La Passe, Ltd (Q136003694) - Likely duplicate of Q135997285, same owners

This script handles cases 1-3. Cases 4-5 need manual review for
multinational handling.
"""

import os
import sys
import uuid
import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

import yaml


# Configure YAML to preserve formatting: multi-line strings are dumped in
# block (|) style so researched notes stay readable in the data files.
def str_representer(dumper, data):
    """Represent strings containing newlines in YAML literal block style."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


yaml.add_representer(str, str_representer)

DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"

# GHCID namespace for UUID v5 generation
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # DNS namespace as base


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Generate UUID v5 from GHCID string."""
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string))


def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 (SHA-256 based) from GHCID string.

    Takes the first 16 bytes of the SHA-256 digest and stamps the RFC 9562
    version (8) and variant bits so the result is a well-formed UUID.
    """
    hash_bytes = hashlib.sha256(ghcid_string.encode()).digest()[:16]
    hash_bytes = bytearray(hash_bytes)
    hash_bytes[6] = (hash_bytes[6] & 0x0F) | 0x80  # Version 8
    hash_bytes[8] = (hash_bytes[8] & 0x3F) | 0x80  # Variant
    return str(uuid.UUID(bytes=bytes(hash_bytes)))


def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from GHCID string (first 8 SHA-256 bytes, big-endian)."""
    hash_bytes = hashlib.sha256(ghcid_string.encode()).digest()
    return int.from_bytes(hash_bytes[:8], byteorder='big')


# Institutions to FIX with resolved locations
RESOLVED_INSTITUTIONS = [
    {
        "old_file": "AR-XX-XXX-G-ESAG.yaml",  # Actually it's AR-XX-XXX-M-ESAG.yaml
        "wikidata_id": "Q106075183",
        "new_ghcid": "AR-Q-NEU-G-SAES",  # Neuquén, Gallery, Sala Arte Emilio Saraco
        "new_file": "AR-Q-NEU-G-SAES.yaml",
        "location": {
            "country": "AR",
            "region_code": "Q",
            "region_name": "Neuquén",
            "city": "Neuquén",
            "city_code": "NEU",
            "street_address": "Av. Olascoaga y Vías del Ferrocarril",
            "latitude": -38.9516,
            "longitude": -68.0591,
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "web", "url": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/"},
                {"type": "web", "url": "https://www.instagram.com/salaemiliosaraco/"},
            ],
            "notes": "Municipal cultural space in former railway warehouse. Correct name: Sala de Arte Emilio Saraco",
        },
        "updates": {
            "institution_type": "GALLERY",  # Correct from MUSEUM to GALLERY
            "custodian_name": {
                "claim_value": "Sala de Arte Emilio Saraco",
                "emic_name": "Sala de Arte Emilio Saraco",
            },
            "website": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/",
        }
    },
    {
        "old_file": "AR-XX-XXX-G-GK.yaml",
        "wikidata_id": "Q136031976",
        "new_ghcid": "AR-C-BUE-G-GK",  # CABA, Buenos Aires, Gallery
        "new_file": "AR-C-BUE-G-GK.yaml",
        "location": {
            "country": "AR",
            "region_code": "C",
            "region_name": "Ciudad Autónoma de Buenos Aires",
            "city": "Buenos Aires",
            "city_code": "BUE",
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "wikidata", "id": "Q136031976", "description": "art dealership in Buenos Aires"},
                {"type": "web", "url": "https://www.facebook.com/kramerartgallery/"},
                {"type": "academic", "note": "Referenced in art catalogs from 1980s-1990s"},
            ],
            "notes": "Historical art gallery in Buenos Aires. Website: kramerartgallery.com. May be historical/closed.",
        },
        "updates": {
            "website": "https://www.kramerartgallery.com",
        }
    },
]

# Virtual/Digital institutions to mark
VIRTUAL_INSTITUTIONS = [
    {
        "file": "AR-XX-XXX-A-AI.yaml",
        "wikidata_id": "Q125055212",
        "location_type": "VIRTUAL",
        "location_type_reason": "Digital archive project documenting the 2003 Santa Fe flood. Born-digital archival platform with no physical location. Based on community-contributed digital materials.",
        "updates": {
            "website": "https://archivoinundacion.ar/",
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "web", "url": "https://archivoinundacion.ar/"},
                {"type": "web", "url": "https://commons.wikimedia.org/wiki/Commons:Archivo_Inundación_-_20_a%C3%B1os"},
                {"type": "wikidata", "id": "Q125055212", "description": "Archives digitisation project about the 2003 flood in Santa Fe, Argentina"},
            ],
            "notes": "Digital archival project commemorating 20 years of the 2003 Santa Fe flood. Community-driven digitization initiative.",
        }
    },
]

# Multinational institutions that need special handling (NOT auto-fixed)
MULTINATIONAL_NOTES = """
## Multinational Art Dealers - Manual Review Required

### Le Passé Ltd (Q135997285) and La Passe, Ltd (Q136003694)

**Finding**: These appear to be the SAME entity with variant spellings.
- Both owned by Paula de Koenigsberg (Q135891878) and Nicolas de Koenigsberg (Q135997213)
- Wikidata lists countries: United States, Mexico, Argentina
- Q136003694 has more properties (Getty ULAN ID: 12916, has collection at J. Paul Getty Museum)

**Recommendation**:
1. Merge Wikidata entries (Q135997285 → Q136003694 as primary)
2. Create separate custodian files for each country of operation:
   - US-XX-XXX-G-LPL.yaml (New York office)
   - MX-XX-XXX-G-LPL.yaml (Mexico office)
   - AR-C-BUE-G-LPL.yaml (Buenos Aires office)
3. Link them via `related_organizations` field

**For now**: Mark both files with `status: MULTINATIONAL_REVIEW_NEEDED`
"""


def fix_resolved_institution(config: dict, dry_run: bool = True) -> bool:
    """Fix a single resolved institution.

    Archives the old GHCID identifiers into ``ghcid_history``, regenerates
    the identifier set from the resolved GHCID, writes the researched
    location data, and renames the file to its new GHCID-based name.

    Args:
        config: One entry from RESOLVED_INSTITUTIONS.
        dry_run: When True, print the planned changes without writing.

    Returns:
        True when the institution was processed (or previewed), False when
        the source file could not be located.
    """
    # Find the actual file (handle M vs G type mismatch)
    old_path = DATA_DIR / config["old_file"]
    if not old_path.exists():
        # Try finding by wikidata ID
        for f in DATA_DIR.glob("AR-XX-XXX-*.yaml"):
            with open(f, 'r', encoding='utf-8') as fp:
                data = yaml.safe_load(fp)
            # Guard against empty YAML files, where safe_load returns None.
            if data and data.get('wikidata_enrichment', {}).get('wikidata_entity_id') == config['wikidata_id']:
                old_path = f
                break

    if not old_path.exists():
        print(f"❌ File not found: {config['old_file']} (wikidata: {config['wikidata_id']})")
        return False

    new_path = DATA_DIR / config["new_file"]

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Processing: {old_path.name}")
    print(f" → New GHCID: {config['new_ghcid']}")
    print(f" → New file: {config['new_file']}")

    # Load existing data
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    old_ghcid = data['ghcid']['ghcid_current']
    timestamp = datetime.now(timezone.utc).isoformat()

    # Generate new UUIDs
    new_uuid = generate_ghcid_uuid(config['new_ghcid'])
    new_uuid_sha256 = generate_ghcid_uuid_sha256(config['new_ghcid'])
    new_numeric = generate_ghcid_numeric(config['new_ghcid'])

    # Snapshot the outgoing identifier set so the rename is traceable.
    old_ghcid_entry = {
        'ghcid': old_ghcid,
        'ghcid_uuid': data['ghcid']['ghcid_uuid'],
        'ghcid_uuid_sha256': data['ghcid']['ghcid_uuid_sha256'],
        'ghcid_numeric': data['ghcid']['ghcid_numeric'],
        'valid_from': data['ghcid'].get('generation_timestamp', data.get('processing_timestamp')),
        'valid_to': timestamp,
        'reason': f"Location resolved via web research. New location: {config['location']['city']}, {config['location']['region_name']}"
    }

    # Initialize history if needed
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    data['ghcid']['ghcid_history'].append(old_ghcid_entry)

    # Update current GHCID
    data['ghcid']['ghcid_current'] = config['new_ghcid']
    data['ghcid']['ghcid_uuid'] = new_uuid
    data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
    data['ghcid']['ghcid_numeric'] = new_numeric
    data['ghcid']['generation_timestamp'] = timestamp

    # Update location resolution
    loc = config['location']
    data['ghcid']['location_resolution'] = {
        'method': config['resolution']['method'],
        'country_code': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city_code': loc['city_code'],
        'city_name': loc['city'],
        'research_date': config['resolution']['research_date'],
        'research_sources': config['resolution']['research_sources'],
        'notes': config['resolution']['notes'],
    }

    # Update location block; street address and coordinates are optional.
    data['location'] = {
        'country': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city': loc['city'],
    }
    if 'street_address' in loc:
        data['location']['street_address'] = loc['street_address']
    if 'latitude' in loc and 'longitude' in loc:
        data['location']['latitude'] = loc['latitude']
        data['location']['longitude'] = loc['longitude']

    # Apply additional updates (custodian_name is merged, not replaced,
    # so unrelated name claims in the file survive).
    if 'updates' in config:
        for key, value in config['updates'].items():
            if key == 'custodian_name' and isinstance(value, dict):
                data['custodian_name'].update(value)
            else:
                data[key] = value

    # Update processing timestamp
    data['processing_timestamp'] = timestamp

    if dry_run:
        print(f" Would rename: {old_path.name} → {new_path.name}")
        print(f" New UUID: {new_uuid}")
        return True

    # Write updated data
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ Created: {new_path.name}")
    return True


def mark_virtual_institution(config: dict, dry_run: bool = True) -> bool:
    """Mark an institution as VIRTUAL.

    Keeps the AR-XX-XXX GHCID but records that the placeholder location is
    intentional (born-digital institution) along with the research trail.

    Args:
        config: One entry from VIRTUAL_INSTITUTIONS.
        dry_run: When True, print the planned change without writing.

    Returns:
        True when the file was updated (or previewed), False when missing.
    """
    file_path = DATA_DIR / config["file"]
    if not file_path.exists():
        print(f"❌ File not found: {config['file']}")
        return False

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking as VIRTUAL: {file_path.name}")

    # Load existing data
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    timestamp = datetime.now(timezone.utc).isoformat()

    # Add location_type
    data['location']['location_type'] = config['location_type']
    data['location']['location_type_reason'] = config['location_type_reason']

    # Mark as intentional XX-XXX
    data['ghcid']['location_resolution']['intentional_xx_xxx'] = True
    data['ghcid']['location_resolution']['research_date'] = config['resolution']['research_date']
    data['ghcid']['location_resolution']['research_sources'] = config['resolution']['research_sources']
    data['ghcid']['location_resolution']['notes'] = config['resolution']['notes']

    # Apply additional updates
    if 'updates' in config:
        for key, value in config['updates'].items():
            data[key] = value

    # Update processing timestamp
    data['processing_timestamp'] = timestamp

    if dry_run:
        print(f" Would mark as {config['location_type']}")
        return True

    # Write updated data
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f" ✅ Marked as {config['location_type']}")
    return True


def mark_multinational_for_review(dry_run: bool = True) -> None:
    """Mark multinational institutions for manual review.

    Adds a ``review_status`` block to the Le Passé / La Passe files instead
    of resolving them automatically (see MULTINATIONAL_NOTES).
    """
    multinational_files = [
        "AR-XX-XXX-G-LPL.yaml",  # Le Passé Ltd
        "AR-XX-XXX-G-PL.yaml",  # La Passe, Ltd
    ]

    timestamp = datetime.now(timezone.utc).isoformat()

    for filename in multinational_files:
        file_path = DATA_DIR / filename
        if not file_path.exists():
            # FIX: previously printed the literal text "(unknown)" instead
            # of the filename, making the message useless.
            print(f"❌ File not found: {filename}")
            continue

        print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking for review: {file_path.name}")

        if dry_run:
            print(" Would add status: MULTINATIONAL_REVIEW_NEEDED")
            continue

        # Load existing data
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Add review status
        data['review_status'] = {
            'status': 'MULTINATIONAL_REVIEW_NEEDED',
            'review_date': timestamp,
            'review_notes': (
                "This art dealer operated in multiple countries (US, Mexico, Argentina). "
                "Wikidata entries Q135997285 and Q136003694 appear to be the same entity "
                "with variant spelling (Le Passé Ltd vs La Passe, Ltd). "
                "Owned by Paula and Nicolas de Koenigsberg. "
                "Needs: 1) Wikidata merge, 2) Separate files per country, 3) Cross-linking."
            ),
        }

        # Update processing timestamp
        data['processing_timestamp'] = timestamp

        # Write updated data
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(" ✅ Marked for review")


def main():
    """Parse CLI flags and run the three fix phases in order."""
    import argparse
    parser = argparse.ArgumentParser(description="Fix remaining AR-XX-XXX institution files")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    args = parser.parse_args()

    dry_run = args.dry_run

    print("=" * 60)
    print("Fix Remaining AR-XX-XXX Institution Files")
    print("=" * 60)

    if dry_run:
        print("\n⚠️ DRY RUN MODE - No changes will be made\n")

    # Process resolved institutions
    print("\n--- Resolving Locations ---")
    for config in RESOLVED_INSTITUTIONS:
        fix_resolved_institution(config, dry_run)

    # Process virtual institutions
    print("\n--- Marking Virtual/Digital Institutions ---")
    for config in VIRTUAL_INSTITUTIONS:
        mark_virtual_institution(config, dry_run)

    # Mark multinational for review
    print("\n--- Marking Multinational for Review ---")
    mark_multinational_for_review(dry_run)

    # Print notes about multinational handling
    print("\n" + "=" * 60)
    print(MULTINATIONAL_NOTES)
    print("\n" + "=" * 60)

    if dry_run:
        print("DRY RUN COMPLETE - Run without --dry-run to apply changes")
    else:
        print("PROCESSING COMPLETE")


if __name__ == "__main__":
    main()