glam/scripts/fix_remaining_ar_xx_xxx.py
2025-12-21 00:01:54 +01:00

416 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Fix remaining AR-XX-XXX institution files with researched locations.
Research findings (2025-12-19):
1. Archivo Inundación (Q125055212) - Digital archive, mark as VIRTUAL
2. Sala de Arte Emilio Saraco (Q106075183) - Neuquén city, AR-Q-NEU
3. Galería Kramer (Q136031976) - Buenos Aires (CABA), AR-C-BUE
4. Le Passé Ltd (Q135997285) - MULTINATIONAL (US/MX/AR), needs special handling
5. La Passe, Ltd (Q136003694) - Likely duplicate of Q135997285, same owners
This script handles cases 1-3. Cases 4-5 need manual review for multinational handling.
"""
import os
import sys
import uuid
import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import yaml
# Configure YAML to preserve formatting
def str_representer(dumper, data):
    """Represent multi-line strings as YAML literal block scalars ('|')."""
    tag = 'tag:yaml.org,2002:str'
    if '\n' not in data:
        # Single-line strings keep PyYAML's default (plain/quoted) style.
        return dumper.represent_scalar(tag, data)
    return dumper.represent_scalar(tag, data, style='|')
# Register the representer on PyYAML's default Dumper so the yaml.dump()
# calls below render multi-line strings as readable block scalars.
yaml.add_representer(str, str_representer)
# All custodian YAML records live under <project root>/data/custodian.
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
# GHCID namespace for UUID v5 generation.
# NOTE: this value is the RFC 4122 DNS namespace, reused as the GHCID base.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # DNS namespace as base

def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Derive the deterministic UUID v5 for *ghcid_string* under GHCID_NAMESPACE."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Generate a UUID v8 (SHA-256 based) from a GHCID string.

    Takes the first 16 bytes of SHA-256(ghcid_string), then forces the
    RFC version/variant fields so the result is a well-formed UUIDv8.
    """
    digest = bytearray(hashlib.sha256(ghcid_string.encode()).digest()[:16])
    digest[6] = 0x80 | (digest[6] & 0x0F)  # high nibble of byte 6 -> version 8
    digest[8] = 0x80 | (digest[8] & 0x3F)  # top bits of byte 8 -> variant 10xx
    return str(uuid.UUID(bytes=bytes(digest)))
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Generate a 64-bit numeric ID: big-endian int of the first 8 SHA-256 bytes."""
    leading_bytes = hashlib.sha256(ghcid_string.encode()).digest()[:8]
    return int.from_bytes(leading_bytes, byteorder='big')
# Institutions to FIX with resolved locations
# Each entry drives fix_resolved_institution(): the old AR-XX-XXX file is
# located (by name, or by Wikidata ID fallback), its GHCID is re-issued for
# the researched location, optional field updates are applied, and the file
# is renamed to `new_file`.
RESOLVED_INSTITUTIONS = [
{
"old_file": "AR-XX-XXX-G-ESAG.yaml", # Actually it's AR-XX-XXX-M-ESAG.yaml
"wikidata_id": "Q106075183",
"new_ghcid": "AR-Q-NEU-G-SAES", # Neuquén, Gallery, Sala Arte Emilio Saraco
"new_file": "AR-Q-NEU-G-SAES.yaml",
"location": {
"country": "AR",
"region_code": "Q",
"region_name": "Neuquén",
"city": "Neuquén",
"city_code": "NEU",
"street_address": "Av. Olascoaga y Vías del Ferrocarril",
"latitude": -38.9516,
"longitude": -68.0591,
},
# Provenance of the location research (copied into ghcid.location_resolution).
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "web", "url": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/"},
{"type": "web", "url": "https://www.instagram.com/salaemiliosaraco/"},
],
"notes": "Municipal cultural space in former railway warehouse. Correct name: Sala de Arte Emilio Saraco",
},
# Extra top-level fields merged into the record (custodian_name merges into
# the existing dict instead of replacing it — see fix_resolved_institution).
"updates": {
"institution_type": "GALLERY", # Correct from MUSEUM to GALLERY
"custodian_name": {
"claim_value": "Sala de Arte Emilio Saraco",
"emic_name": "Sala de Arte Emilio Saraco",
},
"website": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/",
}
},
{
"old_file": "AR-XX-XXX-G-GK.yaml",
"wikidata_id": "Q136031976",
"new_ghcid": "AR-C-BUE-G-GK", # CABA, Buenos Aires, Gallery
"new_file": "AR-C-BUE-G-GK.yaml",
# No street address / coordinates known for this one — only city-level data.
"location": {
"country": "AR",
"region_code": "C",
"region_name": "Ciudad Autónoma de Buenos Aires",
"city": "Buenos Aires",
"city_code": "BUE",
},
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "wikidata", "id": "Q136031976", "description": "art dealership in Buenos Aires"},
{"type": "web", "url": "https://www.facebook.com/kramerartgallery/"},
{"type": "academic", "note": "Referenced in art catalogs from 1980s-1990s"},
],
"notes": "Historical art gallery in Buenos Aires. Website: kramerartgallery.com. May be historical/closed.",
},
"updates": {
"website": "https://www.kramerartgallery.com",
}
},
]
# Virtual/Digital institutions to mark
# Each entry drives mark_virtual_institution(): the file keeps its AR-XX-XXX
# GHCID (flagged as intentional) and gains a location_type of VIRTUAL.
VIRTUAL_INSTITUTIONS = [
{
"file": "AR-XX-XXX-A-AI.yaml",
"wikidata_id": "Q125055212",
"location_type": "VIRTUAL",
"location_type_reason": "Digital archive project documenting the 2003 Santa Fe flood. Born-digital archival platform with no physical location. Based on community-contributed digital materials.",
"updates": {
"website": "https://archivoinundacion.ar/",
},
# Provenance of the VIRTUAL determination (merged into location_resolution).
"resolution": {
"method": "WEB_SEARCH",
"research_date": "2025-12-19T00:00:00Z",
"research_sources": [
{"type": "web", "url": "https://archivoinundacion.ar/"},
{"type": "web", "url": "https://commons.wikimedia.org/wiki/Commons:Archivo_Inundación_-_20_a%C3%B1os"},
{"type": "wikidata", "id": "Q125055212", "description": "Archives digitisation project about the 2003 flood in Santa Fe, Argentina"},
],
"notes": "Digital archival project commemorating 20 years of the 2003 Santa Fe flood. Community-driven digitization initiative.",
}
},
]
# Multinational institutions that need special handling (NOT auto-fixed)
# This text is printed verbatim at the end of main(); it is also the
# rationale behind mark_multinational_for_review() below.
MULTINATIONAL_NOTES = """
## Multinational Art Dealers - Manual Review Required
### Le Passé Ltd (Q135997285) and La Passe, Ltd (Q136003694)
**Finding**: These appear to be the SAME entity with variant spellings.
- Both owned by Paula de Koenigsberg (Q135891878) and Nicolas de Koenigsberg (Q135997213)
- Wikidata lists countries: United States, Mexico, Argentina
- Q136003694 has more properties (Getty ULAN ID: 12916, has collection at J. Paul Getty Museum)
**Recommendation**:
1. Merge Wikidata entries (Q135997285 → Q136003694 as primary)
2. Create separate custodian files for each country of operation:
- US-XX-XXX-G-LPL.yaml (New York office)
- MX-XX-XXX-G-LPL.yaml (Mexico office)
- AR-C-BUE-G-LPL.yaml (Buenos Aires office)
3. Link them via `related_organizations` field
**For now**: Mark both files with `status: MULTINATIONAL_REVIEW_NEEDED`
"""
def _find_source_file(config: dict) -> Path:
    """Locate the on-disk YAML file for *config*.

    Tries the configured filename first; if that file does not exist, scans
    every unresolved AR-XX-XXX file and matches on the stored Wikidata
    entity ID instead (handles files whose type letter differs from the
    configured name, e.g. M vs G for ESAG).

    Returns the configured (non-existent) path when nothing matches, so the
    caller must still check .exists().
    """
    old_path = DATA_DIR / config["old_file"]
    if old_path.exists():
        return old_path
    for candidate in DATA_DIR.glob("AR-XX-XXX-*.yaml"):
        with open(candidate, 'r', encoding='utf-8') as fp:
            loaded = yaml.safe_load(fp)
        # safe_load returns None for an empty file — guard before .get().
        if loaded and loaded.get('wikidata_enrichment', {}).get('wikidata_entity_id') == config['wikidata_id']:
            return candidate
    return old_path


def fix_resolved_institution(config: dict, dry_run: bool = True) -> bool:
    """Fix a single resolved institution.

    Re-issues the GHCID for the researched location: archives the outgoing
    GHCID in ghcid_history, installs the new GHCID plus derived identifiers,
    rewrites the location/location_resolution blocks, applies any extra
    field updates from config['updates'], and renames the file.

    Returns True on success (or a successful dry-run preview), False when
    the source file cannot be found.
    """
    # Find the actual file (handle M vs G type mismatch via Wikidata ID).
    old_path = _find_source_file(config)
    if not old_path.exists():
        print(f"❌ File not found: {config['old_file']} (wikidata: {config['wikidata_id']})")
        return False
    new_path = DATA_DIR / config["new_file"]
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Processing: {old_path.name}")
    print(f" → New GHCID: {config['new_ghcid']}")
    print(f" → New file: {config['new_file']}")
    # Load existing data
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    old_ghcid = data['ghcid']['ghcid_current']
    timestamp = datetime.now(timezone.utc).isoformat()
    # Generate the new identifiers derived from the new GHCID string.
    new_uuid = generate_ghcid_uuid(config['new_ghcid'])
    new_uuid_sha256 = generate_ghcid_uuid_sha256(config['new_ghcid'])
    new_numeric = generate_ghcid_numeric(config['new_ghcid'])
    # Archive the outgoing GHCID before overwriting it. valid_from falls back
    # to the record's processing_timestamp when no generation_timestamp exists.
    old_ghcid_entry = {
        'ghcid': old_ghcid,
        'ghcid_uuid': data['ghcid']['ghcid_uuid'],
        'ghcid_uuid_sha256': data['ghcid']['ghcid_uuid_sha256'],
        'ghcid_numeric': data['ghcid']['ghcid_numeric'],
        'valid_from': data['ghcid'].get('generation_timestamp', data.get('processing_timestamp')),
        'valid_to': timestamp,
        'reason': f"Location resolved via web research. New location: {config['location']['city']}, {config['location']['region_name']}"
    }
    data['ghcid'].setdefault('ghcid_history', []).append(old_ghcid_entry)
    # Install the new GHCID and derived identifiers.
    data['ghcid']['ghcid_current'] = config['new_ghcid']
    data['ghcid']['ghcid_uuid'] = new_uuid
    data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
    data['ghcid']['ghcid_numeric'] = new_numeric
    data['ghcid']['generation_timestamp'] = timestamp
    # Record how/where the location was resolved.
    loc = config['location']
    data['ghcid']['location_resolution'] = {
        'method': config['resolution']['method'],
        'country_code': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city_code': loc['city_code'],
        'city_name': loc['city'],
        'research_date': config['resolution']['research_date'],
        'research_sources': config['resolution']['research_sources'],
        'notes': config['resolution']['notes'],
    }
    # Rebuild the location block; address/coordinates only when researched.
    data['location'] = {
        'country': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city': loc['city'],
    }
    if 'street_address' in loc:
        data['location']['street_address'] = loc['street_address']
    if 'latitude' in loc and 'longitude' in loc:
        data['location']['latitude'] = loc['latitude']
        data['location']['longitude'] = loc['longitude']
    # Apply additional updates; custodian_name merges into the existing dict
    # so sibling keys (e.g. source metadata) are preserved.
    if 'updates' in config:
        for key, value in config['updates'].items():
            if key == 'custodian_name' and isinstance(value, dict):
                data['custodian_name'].update(value)
            else:
                data[key] = value
    # Update processing timestamp
    data['processing_timestamp'] = timestamp
    if dry_run:
        # Fixed: the original preview printed the two names with no separator.
        print(f" Would rename: {old_path.name} → {new_path.name}")
        print(f" New UUID: {new_uuid}")
        return True
    # Write updated data
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    # Remove old file (skip when the rename is a no-op).
    if old_path != new_path:
        old_path.unlink()
    print(f" ✅ Created: {new_path.name}")
    return True
def mark_virtual_institution(config: dict, dry_run: bool = True) -> bool:
    """Mark an institution as VIRTUAL (intentionally un-geolocated).

    Keeps the AR-XX-XXX GHCID but flags it as intentional, records the
    research provenance under ghcid.location_resolution, and stamps the
    location block with location_type/location_type_reason.

    Returns True on success (or dry-run preview), False if the file is missing.
    """
    file_path = DATA_DIR / config["file"]
    if not file_path.exists():
        print(f"❌ File not found: {config['file']}")
        return False
    print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking as VIRTUAL: {file_path.name}")
    # Load existing data
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    timestamp = datetime.now(timezone.utc).isoformat()
    # Add location_type. setdefault guards against records that lack a
    # location block entirely (the original raised KeyError here).
    location = data.setdefault('location', {})
    location['location_type'] = config['location_type']
    location['location_type_reason'] = config['location_type_reason']
    # Mark as intentional XX-XXX; create the nested blocks when absent.
    resolution = data.setdefault('ghcid', {}).setdefault('location_resolution', {})
    resolution['intentional_xx_xxx'] = True
    resolution['research_date'] = config['resolution']['research_date']
    resolution['research_sources'] = config['resolution']['research_sources']
    resolution['notes'] = config['resolution']['notes']
    # Apply additional updates
    if 'updates' in config:
        for key, value in config['updates'].items():
            data[key] = value
    # Update processing timestamp
    data['processing_timestamp'] = timestamp
    if dry_run:
        print(f" Would mark as {config['location_type']}")
        return True
    # Write updated data
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f" ✅ Marked as {config['location_type']}")
    return True
def mark_multinational_for_review(dry_run: bool = True) -> None:
    """Mark multinational institutions for manual review.

    Adds a review_status block (status MULTINATIONAL_REVIEW_NEEDED) to the
    two Le Passé / La Passe files; see MULTINATIONAL_NOTES for the rationale.
    Missing files are reported and skipped.
    """
    multinational_files = [
        "AR-XX-XXX-G-LPL.yaml",  # Le Passé Ltd
        "AR-XX-XXX-G-PL.yaml",   # La Passe, Ltd
    ]
    timestamp = datetime.now(timezone.utc).isoformat()
    for filename in multinational_files:
        file_path = DATA_DIR / filename
        if not file_path.exists():
            # Fixed: originally printed the literal "(unknown)" instead of the name.
            print(f"❌ File not found: {filename}")
            continue
        print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking for review: {file_path.name}")
        if dry_run:
            print(" Would add status: MULTINATIONAL_REVIEW_NEEDED")
            continue
        # Load existing data
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # Add review status
        data['review_status'] = {
            'status': 'MULTINATIONAL_REVIEW_NEEDED',
            'review_date': timestamp,
            'review_notes': (
                "This art dealer operated in multiple countries (US, Mexico, Argentina). "
                "Wikidata entries Q135997285 and Q136003694 appear to be the same entity "
                "with variant spelling (Le Passé Ltd vs La Passe, Ltd). "
                "Owned by Paula and Nicolas de Koenigsberg. "
                "Needs: 1) Wikidata merge, 2) Separate files per country, 3) Cross-linking."
            ),
        }
        # Update processing timestamp
        data['processing_timestamp'] = timestamp
        # Write updated data
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f" ✅ Marked for review")
def main():
    """CLI entry point: apply all researched fixes to the AR-XX-XXX files."""
    import argparse
    parser = argparse.ArgumentParser(description="Fix remaining AR-XX-XXX institution files")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    dry_run = parser.parse_args().dry_run
    banner = "=" * 60
    print(banner)
    print("Fix Remaining AR-XX-XXX Institution Files")
    print(banner)
    if dry_run:
        print("\n⚠️ DRY RUN MODE - No changes will be made\n")
    # Step 1: rename/re-GHCID institutions whose location was resolved.
    print("\n--- Resolving Locations ---")
    for entry in RESOLVED_INSTITUTIONS:
        fix_resolved_institution(entry, dry_run)
    # Step 2: flag born-digital archives as VIRTUAL.
    print("\n--- Marking Virtual/Digital Institutions ---")
    for entry in VIRTUAL_INSTITUTIONS:
        mark_virtual_institution(entry, dry_run)
    # Step 3: flag the multinational dealers for manual follow-up.
    print("\n--- Marking Multinational for Review ---")
    mark_multinational_for_review(dry_run)
    # Print the manual-review guidance for the multinational cases.
    print("\n" + banner)
    print(MULTINATIONAL_NOTES)
    print("\n" + banner)
    if dry_run:
        print("DRY RUN COMPLETE - Run without --dry-run to apply changes")
    else:
        print("PROCESSING COMPLETE")


if __name__ == "__main__":
    main()