#!/usr/bin/env python3
"""
Fix remaining AR-XX-XXX institution files with researched locations.

Research findings (2025-12-19):
1. Archivo Inundación (Q125055212) - Digital archive, mark as VIRTUAL
2. Sala de Arte Emilio Saraco (Q106075183) - Neuquén city, AR-Q-NEU
3. Galería Kramer (Q136031976) - Buenos Aires (CABA), AR-C-BUE
4. Le Passé Ltd (Q135997285) - MULTINATIONAL (US/MX/AR), needs special handling
5. La Passe, Ltd (Q136003694) - Likely duplicate of Q135997285, same owners

This script resolves cases 1-3. Cases 4-5 are only flagged with a
MULTINATIONAL_REVIEW_NEEDED review status; the multinational handling itself
still needs manual review.
"""
|
|
|
|
import os
|
|
import sys
|
|
import uuid
|
|
import hashlib
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
import yaml
|
|
|
|
# Configure YAML output: dump multi-line strings as literal block scalars ("|")
|
|
def str_representer(dumper, data):
    """Represent a string for YAML output, using block style when it spans lines.

    Strings containing a newline are emitted with the literal block style
    (``|``); every other string is emitted with the dumper's default style.

    Args:
        dumper: Object providing ``represent_scalar(tag, value, style=...)``.
        data: The string being serialized.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    yaml_str_tag = 'tag:yaml.org,2002:str'
    spans_lines = '\n' in data
    if not spans_lines:
        return dumper.represent_scalar(yaml_str_tag, data)
    return dumper.represent_scalar(yaml_str_tag, data, style='|')
|
|
|
|
# Register the custom string representer with PyYAML. No explicit Dumper is
# passed, so this applies to the yaml.dump() calls made later in this script.
yaml.add_representer(str, str_representer)

# Directory containing the custodian YAML files, resolved relative to this
# script: <parent of the script's directory>/data/custodian.
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
|
|
|
# Namespace used for all name-based (UUID v5) GHCID identifiers.
# The value is the standard DNS namespace UUID (same as uuid.NAMESPACE_DNS).
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")


def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Derive the UUID v5 for a GHCID string.

    Args:
        ghcid_string: The GHCID, e.g. ``"AR-Q-NEU-G-SAES"``.

    Returns:
        The canonical string form of ``uuid5(GHCID_NAMESPACE, ghcid_string)``.
    """
    name_based_id = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(name_based_id)
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Derive a UUID v8 from the SHA-256 digest of a GHCID string.

    The first 16 bytes of the digest become the UUID, after the version
    nibble is forced to 8 and the two variant bits are forced to ``10``.

    Args:
        ghcid_string: The GHCID to hash (encoded with ``str.encode()``'s
            default encoding).

    Returns:
        The canonical string form of the resulting UUID.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    value = int.from_bytes(digest[:16], byteorder='big')

    # Byte 6 occupies bits 72-79 of the 128-bit value; its high nibble
    # (bits 76-79) carries the version.
    value = (value & ~(0xF << 76)) | (0x8 << 76)  # Version 8
    # Byte 8 occupies bits 56-63; its top two bits (62-63) carry the variant.
    value = (value & ~(0x3 << 62)) | (0x2 << 62)  # Variant

    return str(uuid.UUID(int=value))
|
|
|
|
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Derive an unsigned 64-bit numeric ID from a GHCID string.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        The first 8 bytes of the SHA-256 digest, read as a big-endian integer.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex characters == the leading 8 bytes, most significant byte first.
    leading_eight_bytes = hex_digest[:16]
    return int(leading_eight_bytes, 16)
|
|
|
|
|
|
# Institutions to FIX with resolved locations.
#
# Each entry is consumed by fix_resolved_institution():
#   old_file     - expected placeholder file name inside DATA_DIR
#   wikidata_id  - used to find the file when old_file does not exist
#   new_ghcid    - identifier the record is re-keyed to
#   new_file     - file name the updated record is written to
#   location     - resolved location; street_address and latitude/longitude
#                  are optional, city_code is only stored in
#                  ghcid.location_resolution
#   resolution   - research provenance copied into ghcid.location_resolution
#   updates      - optional top-level fields to set; a dict under
#                  custodian_name is merged into the existing mapping
RESOLVED_INSTITUTIONS = [
    {
        # Original note: the file on disk is actually AR-XX-XXX-M-ESAG.yaml
        # (museum type letter), so this name is not expected to exist and the
        # Wikidata-ID lookup is what locates the record.
        "old_file": "AR-XX-XXX-G-ESAG.yaml",
        "wikidata_id": "Q106075183",
        "new_ghcid": "AR-Q-NEU-G-SAES",  # Neuquén, Gallery, Sala Arte Emilio Saraco
        "new_file": "AR-Q-NEU-G-SAES.yaml",
        "location": {
            "country": "AR",
            "region_code": "Q",
            "region_name": "Neuquén",
            "city": "Neuquén",
            "city_code": "NEU",
            "street_address": "Av. Olascoaga y Vías del Ferrocarril",
            "latitude": -38.9516,
            "longitude": -68.0591,
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "web", "url": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/"},
                {"type": "web", "url": "https://www.instagram.com/salaemiliosaraco/"},
            ],
            "notes": "Municipal cultural space in former railway warehouse. Correct name: Sala de Arte Emilio Saraco",
        },
        "updates": {
            "institution_type": "GALLERY",  # Corrects the type from MUSEUM to GALLERY
            # Merged into the record's existing custodian_name mapping.
            "custodian_name": {
                "claim_value": "Sala de Arte Emilio Saraco",
                "emic_name": "Sala de Arte Emilio Saraco",
            },
            "website": "https://www.neuquencapital.gov.ar/cultura/espaciosculturales/sala-emilio-saraco/",
        }
    },
    {
        "old_file": "AR-XX-XXX-G-GK.yaml",
        "wikidata_id": "Q136031976",
        "new_ghcid": "AR-C-BUE-G-GK",  # CABA, Buenos Aires, Gallery
        "new_file": "AR-C-BUE-G-GK.yaml",
        # No street address or coordinates were found for this entry.
        "location": {
            "country": "AR",
            "region_code": "C",
            "region_name": "Ciudad Autónoma de Buenos Aires",
            "city": "Buenos Aires",
            "city_code": "BUE",
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "wikidata", "id": "Q136031976", "description": "art dealership in Buenos Aires"},
                {"type": "web", "url": "https://www.facebook.com/kramerartgallery/"},
                {"type": "academic", "note": "Referenced in art catalogs from 1980s-1990s"},
            ],
            "notes": "Historical art gallery in Buenos Aires. Website: kramerartgallery.com. May be historical/closed.",
        },
        "updates": {
            "website": "https://www.kramerartgallery.com",
        }
    },
]
|
|
|
|
# Virtual/Digital institutions to mark.
#
# Each entry is consumed by mark_virtual_institution(), which keeps the file
# name and GHCID unchanged and only annotates the record:
#   file                  - file name inside DATA_DIR
#   wikidata_id           - kept for reference; not read by the marking code
#   location_type(_reason) - written into the record's location block
#   updates               - top-level fields to set on the record
#   resolution            - research_date, research_sources and notes are
#                           copied into ghcid.location_resolution ("method"
#                           is not copied)
VIRTUAL_INSTITUTIONS = [
    {
        "file": "AR-XX-XXX-A-AI.yaml",
        "wikidata_id": "Q125055212",
        "location_type": "VIRTUAL",
        "location_type_reason": "Digital archive project documenting the 2003 Santa Fe flood. Born-digital archival platform with no physical location. Based on community-contributed digital materials.",
        "updates": {
            "website": "https://archivoinundacion.ar/",
        },
        "resolution": {
            "method": "WEB_SEARCH",
            "research_date": "2025-12-19T00:00:00Z",
            "research_sources": [
                {"type": "web", "url": "https://archivoinundacion.ar/"},
                {"type": "web", "url": "https://commons.wikimedia.org/wiki/Commons:Archivo_Inundación_-_20_a%C3%B1os"},
                {"type": "wikidata", "id": "Q125055212", "description": "Archives digitisation project about the 2003 flood in Santa Fe, Argentina"},
            ],
            "notes": "Digital archival project commemorating 20 years of the 2003 Santa Fe flood. Community-driven digitization initiative.",
        }
    },
]
|
|
|
|
# Multinational institutions that need special handling (NOT auto-fixed).
# This Markdown text is printed verbatim at the end of main().
# NOTE(review): the closing line mentions `status: MULTINATIONAL_REVIEW_NEEDED`,
# while mark_multinational_for_review() stores the value under
# review_status.status - confirm which wording is intended.
MULTINATIONAL_NOTES = """
## Multinational Art Dealers - Manual Review Required

### Le Passé Ltd (Q135997285) and La Passe, Ltd (Q136003694)

**Finding**: These appear to be the SAME entity with variant spellings.
- Both owned by Paula de Koenigsberg (Q135891878) and Nicolas de Koenigsberg (Q135997213)
- Wikidata lists countries: United States, Mexico, Argentina
- Q136003694 has more properties (Getty ULAN ID: 12916, has collection at J. Paul Getty Museum)

**Recommendation**:
1. Merge Wikidata entries (Q135997285 → Q136003694 as primary)
2. Create separate custodian files for each country of operation:
- US-XX-XXX-G-LPL.yaml (New York office)
- MX-XX-XXX-G-LPL.yaml (Mexico office)
- AR-C-BUE-G-LPL.yaml (Buenos Aires office)
3. Link them via `related_organizations` field

**For now**: Mark both files with `status: MULTINATIONAL_REVIEW_NEEDED`
"""
|
|
|
|
|
|
def _find_unresolved_file_by_wikidata_id(wikidata_id: str):
    """Return the first AR-XX-XXX-*.yaml file in DATA_DIR with this Wikidata ID.

    Returns ``None`` when no placeholder file carries the ID under
    ``wikidata_enrichment.wikidata_entity_id``.
    """
    for candidate in DATA_DIR.glob("AR-XX-XXX-*.yaml"):
        with open(candidate, 'r', encoding='utf-8') as fp:
            record = yaml.safe_load(fp)
        # yaml.safe_load returns None for an empty document, and the
        # enrichment section may be absent or null; skip such files instead
        # of aborting the whole scan with an AttributeError.
        if not isinstance(record, dict):
            continue
        enrichment = record.get('wikidata_enrichment')
        if isinstance(enrichment, dict) and enrichment.get('wikidata_entity_id') == wikidata_id:
            return candidate
    return None


def fix_resolved_institution(config: dict, dry_run: bool = True) -> bool:
    """Re-key one placeholder custodian record to its researched location.

    The record is loaded from ``config["old_file"]`` (or, when that file does
    not exist, from the placeholder file whose Wikidata ID matches). Its
    current GHCID identifiers are appended to ``ghcid.ghcid_history``, new
    identifiers are generated from ``config["new_ghcid"]``, the location and
    location-resolution sections are replaced, and any ``config["updates"]``
    are applied. Unless ``dry_run`` is set, the result is written to
    ``config["new_file"]`` and the old file is removed.

    Args:
        config: One entry of ``RESOLVED_INSTITUTIONS``. Required keys:
            ``old_file``, ``wikidata_id``, ``new_ghcid``, ``new_file``,
            ``location`` and ``resolution``; ``updates`` is optional.
        dry_run: When true, report what would happen without writing or
            deleting any file.

    Returns:
        True when the record was processed (or would be, in a dry run).
        False when the source file cannot be found, is not a YAML mapping,
        or the target file already exists.

    Raises:
        KeyError: If the record lacks the ``ghcid`` fields read here
            (``ghcid_current``, ``ghcid_uuid``, ``ghcid_uuid_sha256``,
            ``ghcid_numeric``).
    """
    # Find the actual file (handle M vs G type mismatch): the configured name
    # may carry a different type letter than the file on disk.
    old_path = DATA_DIR / config["old_file"]
    if not old_path.exists():
        located = _find_unresolved_file_by_wikidata_id(config['wikidata_id'])
        if located is not None:
            old_path = located

    if not old_path.exists():
        print(f"❌ File not found: {config['old_file']} (wikidata: {config['wikidata_id']})")
        return False

    new_path = DATA_DIR / config["new_file"]

    # Never clobber a different, already existing custodian file: a GHCID
    # collision would otherwise silently destroy that record.
    if new_path != old_path and new_path.exists():
        print(f"❌ Target already exists, refusing to overwrite: {new_path.name}")
        return False

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Processing: {old_path.name}")
    print(f" → New GHCID: {config['new_ghcid']}")
    print(f" → New file: {config['new_file']}")

    # Load existing data
    with open(old_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        # Empty or non-mapping document: nothing to re-key.
        print(f"❌ Not a YAML mapping: {old_path.name}")
        return False

    old_ghcid = data['ghcid']['ghcid_current']
    timestamp = datetime.now(timezone.utc).isoformat()

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(config['new_ghcid'])
    new_uuid_sha256 = generate_ghcid_uuid_sha256(config['new_ghcid'])
    new_numeric = generate_ghcid_numeric(config['new_ghcid'])

    # Snapshot the outgoing identifiers before they are overwritten below.
    old_ghcid_entry = {
        'ghcid': old_ghcid,
        'ghcid_uuid': data['ghcid']['ghcid_uuid'],
        'ghcid_uuid_sha256': data['ghcid']['ghcid_uuid_sha256'],
        'ghcid_numeric': data['ghcid']['ghcid_numeric'],
        'valid_from': data['ghcid'].get('generation_timestamp', data.get('processing_timestamp')),
        'valid_to': timestamp,
        'reason': f"Location resolved via web research. New location: {config['location']['city']}, {config['location']['region_name']}"
    }

    # Initialize history if needed
    if 'ghcid_history' not in data['ghcid']:
        data['ghcid']['ghcid_history'] = []
    data['ghcid']['ghcid_history'].append(old_ghcid_entry)

    # Update current GHCID
    data['ghcid']['ghcid_current'] = config['new_ghcid']
    data['ghcid']['ghcid_uuid'] = new_uuid
    data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
    data['ghcid']['ghcid_numeric'] = new_numeric
    data['ghcid']['generation_timestamp'] = timestamp

    # Update location resolution (replaces any previous resolution block)
    loc = config['location']
    data['ghcid']['location_resolution'] = {
        'method': config['resolution']['method'],
        'country_code': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city_code': loc['city_code'],
        'city_name': loc['city'],
        'research_date': config['resolution']['research_date'],
        'research_sources': config['resolution']['research_sources'],
        'notes': config['resolution']['notes'],
    }

    # Update location block (replaces the previous block entirely)
    data['location'] = {
        'country': loc['country'],
        'region_code': loc['region_code'],
        'region_name': loc['region_name'],
        'city': loc['city'],
    }
    if 'street_address' in loc:
        data['location']['street_address'] = loc['street_address']
    # Coordinates are only stored as a complete pair.
    if 'latitude' in loc and 'longitude' in loc:
        data['location']['latitude'] = loc['latitude']
        data['location']['longitude'] = loc['longitude']

    # Apply additional updates
    if 'updates' in config:
        for key, value in config['updates'].items():
            if key == 'custodian_name' and isinstance(value, dict):
                # Merge into the existing name mapping so other name fields
                # survive; create the mapping when the record has none.
                current_name = data.get('custodian_name')
                if isinstance(current_name, dict):
                    current_name.update(value)
                else:
                    data['custodian_name'] = dict(value)
            else:
                data[key] = value

    # Update processing timestamp
    data['processing_timestamp'] = timestamp

    if dry_run:
        print(f" Would rename: {old_path.name} → {new_path.name}")
        print(f" New UUID: {new_uuid}")
        return True

    # Write updated data
    with open(new_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Remove old file (only after the new one was written successfully)
    if old_path != new_path:
        old_path.unlink()

    print(f" ✅ Created: {new_path.name}")
    return True
|
|
|
|
|
|
def mark_virtual_institution(config: dict, dry_run: bool = True) -> bool:
    """Mark an institution as VIRTUAL while keeping its XX-XXX placeholder.

    The file named by ``config["file"]`` is annotated in place: the location
    type and its reason go into ``location``, the research provenance and an
    ``intentional_xx_xxx`` flag go into ``ghcid.location_resolution``, and any
    ``config["updates"]`` are set as top-level fields.

    Args:
        config: One entry of ``VIRTUAL_INSTITUTIONS``. Required keys:
            ``file``, ``location_type``, ``location_type_reason`` and
            ``resolution`` (with ``research_date``, ``research_sources`` and
            ``notes``); ``updates`` is optional.
        dry_run: When true, report what would happen without writing.

    Returns:
        True when the record was marked (or would be, in a dry run); False
        when the file does not exist or is not a YAML mapping.

    Raises:
        KeyError: If the record has no ``ghcid`` section.
    """
    file_path = DATA_DIR / config["file"]

    if not file_path.exists():
        print(f"❌ File not found: {config['file']}")
        return False

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Marking as VIRTUAL: {file_path.name}")

    # Load existing data
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        # yaml.safe_load returns None for an empty document.
        print(f"❌ Not a YAML mapping: {file_path.name}")
        return False

    timestamp = datetime.now(timezone.utc).isoformat()

    # Add location_type. An unresolved record may have no usable location
    # mapping yet, so create one rather than failing with KeyError/TypeError.
    location = data.get('location')
    if not isinstance(location, dict):
        location = {}
        data['location'] = location
    location['location_type'] = config['location_type']
    location['location_type_reason'] = config['location_type_reason']

    # Mark as intentional XX-XXX, creating the resolution block when absent.
    resolution = data['ghcid'].get('location_resolution')
    if not isinstance(resolution, dict):
        resolution = {}
        data['ghcid']['location_resolution'] = resolution
    resolution['intentional_xx_xxx'] = True
    resolution['research_date'] = config['resolution']['research_date']
    resolution['research_sources'] = config['resolution']['research_sources']
    resolution['notes'] = config['resolution']['notes']

    # Apply additional updates
    if 'updates' in config:
        for key, value in config['updates'].items():
            data[key] = value

    # Update processing timestamp
    data['processing_timestamp'] = timestamp

    if dry_run:
        print(f" Would mark as {config['location_type']}")
        return True

    # Write updated data
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f" ✅ Marked as {config['location_type']}")
    return True
|
|
|
|
|
|
def mark_multinational_for_review(dry_run: bool = True) -> None:
    """Flag the two multinational dealer records for manual review.

    For each of the hard-coded placeholder files that exists in DATA_DIR, a
    ``review_status`` mapping (status, review date, notes) is stored on the
    record and its ``processing_timestamp`` is refreshed. Missing files are
    reported and skipped. In a dry run nothing is read or written.

    Args:
        dry_run: When true, only print what would be changed.
    """
    targets = (
        "AR-XX-XXX-G-LPL.yaml",  # Le Passé Ltd
        "AR-XX-XXX-G-PL.yaml",  # La Passe, Ltd
    )

    # One timestamp is shared by every file touched in this run.
    timestamp = datetime.now(timezone.utc).isoformat()
    mode_prefix = '[DRY RUN] ' if dry_run else ''
    notes = (
        "This art dealer operated in multiple countries (US, Mexico, Argentina). "
        "Wikidata entries Q135997285 and Q136003694 appear to be the same entity "
        "with variant spelling (Le Passé Ltd vs La Passe, Ltd). "
        "Owned by Paula and Nicolas de Koenigsberg. "
        "Needs: 1) Wikidata merge, 2) Separate files per country, 3) Cross-linking."
    )

    for filename in targets:
        file_path = DATA_DIR / filename
        if not file_path.exists():
            print(f"❌ File not found: {filename}")
            continue

        print(f"\n{mode_prefix}Marking for review: {filename}")

        if dry_run:
            print(" Would add status: MULTINATIONAL_REVIEW_NEEDED")
            continue

        # Read the current record
        with open(file_path, 'r', encoding='utf-8') as source:
            data = yaml.safe_load(source)

        # Attach the review marker and refresh the processing timestamp
        data['review_status'] = {
            'status': 'MULTINATIONAL_REVIEW_NEEDED',
            'review_date': timestamp,
            'review_notes': notes,
        }
        data['processing_timestamp'] = timestamp

        # Persist the record back to the same file
        with open(file_path, 'w', encoding='utf-8') as sink:
            yaml.dump(data, sink, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(" ✅ Marked for review")
|
|
|
|
|
|
def main():
    """Command-line entry point: run all three fix-up phases in order.

    Parses the ``--dry-run`` flag, resolves the located institutions, marks
    the virtual ones, flags the multinational ones for review, and finally
    prints the multinational handling notes and a completion message.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Fix remaining AR-XX-XXX institution files")
    cli.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    dry_run = cli.parse_args().dry_run

    rule = "=" * 60

    print(rule)
    print("Fix Remaining AR-XX-XXX Institution Files")
    print(rule)

    if dry_run:
        print("\n⚠️ DRY RUN MODE - No changes will be made\n")

    # Phase 1: institutions whose location was resolved
    print("\n--- Resolving Locations ---")
    for entry in RESOLVED_INSTITUTIONS:
        fix_resolved_institution(entry, dry_run)

    # Phase 2: virtual/digital institutions
    print("\n--- Marking Virtual/Digital Institutions ---")
    for entry in VIRTUAL_INSTITUTIONS:
        mark_virtual_institution(entry, dry_run)

    # Phase 3: multinational institutions that need manual review
    print("\n--- Marking Multinational for Review ---")
    mark_multinational_for_review(dry_run)

    # Notes about multinational handling
    print("\n" + rule)
    print(MULTINATIONAL_NOTES)

    print("\n" + rule)
    closing_message = (
        "DRY RUN COMPLETE - Run without --dry-run to apply changes"
        if dry_run
        else "PROCESSING COMPLETE"
    )
    print(closing_message)
|
|
|
|
|
|
# Run the fixes only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|