330 lines
9.6 KiB
Python
330 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Add CIDOC-CRM TimeSpan fields to heritage custodian YAML files.
|
|
|
|
This script enriches custodian records with temporal data from:
|
|
1. conflict_status/time_of_destruction - For destroyed/damaged institutions
|
|
2. wikidata_inception - For founding dates from Wikidata
|
|
3. wikidata_claims - For inception and dissolution dates
|
|
|
|
TimeSpan follows CIDOC-CRM E52_Time-Span pattern:
|
|
- begin_of_the_begin: Earliest possible start (P82a)
|
|
- end_of_the_begin: Latest possible start (P81a)
|
|
- begin_of_the_end: Earliest possible end (P81b)
|
|
- end_of_the_end: Latest possible end (P82b)
|
|
|
|
Usage:
|
|
python scripts/add_timespan_to_custodians.py [--dry-run] [--verbose] [--dir DIR]
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any
|
|
|
|
import yaml
|
|
|
|
# Preserve YAML formatting
|
|
class PreserveQuotesLoader(yaml.SafeLoader):
    """Script-local subclass of ``yaml.SafeLoader``.

    Declares no overrides of its own; it only gives this script a dedicated
    loader type.
    """
|
|
|
|
class PreserveQuotesDumper(yaml.SafeDumper):
    """Script-local subclass of ``yaml.SafeDumper``.

    Declares no overrides of its own; representers are registered on this
    class rather than on ``yaml.SafeDumper`` itself.
    """
|
|
|
|
def str_representer(dumper, data):
    """Represent a string as a YAML scalar node.

    Strings containing a newline are emitted in literal block style (``|``);
    all other strings use the dumper's default scalar style.
    """
    str_tag = 'tag:yaml.org,2002:str'
    is_multiline = '\n' in data
    if not is_multiline:
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style='|')
|
|
|
|
# Dump every ``str`` through str_representer when PreserveQuotesDumper is used.
PreserveQuotesDumper.add_representer(str, str_representer)
|
|
|
|
|
|
def parse_date(date_str: str) -> Optional[str]:
    """Normalise a date value to an ISO 8601 timestamp string.

    Args:
        date_str: The raw value. Usually a string, but any object is accepted
            and converted with ``str``. ``datetime`` instances are formatted
            directly so their month, day and time of day are kept.

    Returns:
        * ``None`` for a falsy input or a value that cannot be interpreted.
        * The (stripped) input unchanged when it already looks like an ISO
          timestamp, i.e. a ``T`` preceded by a numeric date part.
        * ``YYYY-MM-DDT00:00:00Z`` for ``YYYY-MM-DD``, ``YYYY-MM``, ``YYYY``,
          ``DD/MM/YYYY`` and ``MM/DD/YYYY`` inputs (tried in that order, so an
          ambiguous slash date is read as day/month).
        * ``YYYY-01-01T00:00:00Z`` when only the first four characters form a
          year between 1000 and 2100.
    """
    if not date_str:
        return None

    # A datetime stringifies as "YYYY-MM-DD HH:MM:SS", which would only match
    # the year fallback below and silently lose the month and day.
    if isinstance(date_str, datetime):
        offset = date_str.utcoffset()
        if offset is not None:
            # Aware value: shift to UTC so the trailing "Z" is truthful.
            date_str = (date_str - offset).replace(tzinfo=None)
        return date_str.strftime('%Y-%m-%dT%H:%M:%SZ')

    text = str(date_str).strip()

    # Already ISO format: require a numeric date part before the "T" so that
    # free text such as "TBD" or "Tuesday" is not passed through as a date.
    if 'T' in text:
        date_part = text.split('T', 1)[0].lstrip('+-')
        looks_numeric = (
            bool(date_part)
            and date_part[0].isdigit()
            and all(ch.isdigit() or ch == '-' for ch in date_part)
        )
        if looks_numeric:
            return text

    # Try the supported formats in priority order.
    formats = (
        '%Y-%m-%d',
        '%Y-%m',
        '%Y',
        '%d/%m/%Y',
        '%m/%d/%Y',
    )
    for fmt in formats:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%dT00:00:00Z')

    # Last resort: a plausible year at the start of the text.
    try:
        year = int(text[:4])
    except ValueError:
        return None
    if 1000 <= year <= 2100:
        return f"{year}-01-01T00:00:00Z"

    return None
|
|
|
|
|
|
def create_timespan_from_destruction(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Create a TimeSpan from ``conflict_status`` or ``time_of_destruction``.

    For ``date``, ``description`` and ``sources`` the value under
    ``conflict_status`` is preferred, falling back to the one under
    ``time_of_destruction``.

    Args:
        data: The loaded custodian record.

    Returns:
        A TimeSpan dict whose two ``*_of_the_end`` fields hold the destruction
        date and whose two ``*_of_the_begin`` fields are ``None``, plus
        ``notes`` and (when present) ``sources``. ``None`` when neither
        section provides a date or the date cannot be parsed.
    """

    def section(key: str) -> Dict[str, Any]:
        # A key that is present but null or scalar (e.g. ``conflict_status:``
        # with no children) must not crash the ``.get`` lookups below.
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    conflict_status = section('conflict_status')
    time_of_destruction = section('time_of_destruction')

    destruction_date = (
        conflict_status.get('date')
        or time_of_destruction.get('date')
    )
    if not destruction_date:
        return None

    destruction_iso = parse_date(destruction_date)
    if not destruction_iso:
        return None

    description = (
        conflict_status.get('description')
        or time_of_destruction.get('description')
        or 'Institution destroyed or severely damaged.'
    )
    sources = (
        conflict_status.get('sources')
        or time_of_destruction.get('sources')
        or []
    )

    # Only the end of the span is known; the founding date is left open.
    timespan: Dict[str, Any] = {
        'begin_of_the_begin': None,  # Unknown founding
        'end_of_the_begin': None,  # Unknown founding
        'begin_of_the_end': destruction_iso,
        'end_of_the_end': destruction_iso,
        'notes': description,
    }
    if sources:
        timespan['sources'] = sources

    return timespan
|
|
|
|
|
|
def _claim_value(claims: Dict[str, Any], key: str) -> Any:
    """Return the value of a claim stored either as ``{'value': ...}`` or as a plain string."""
    claim = claims.get(key, {})
    if isinstance(claim, dict):
        return claim.get('value')
    if isinstance(claim, str):
        return claim
    return None


def create_timespan_from_wikidata(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Create a TimeSpan from Wikidata inception/dissolution dates.

    The inception date is read from
    ``wikidata_enrichment.wikidata_inception`` and, when that is empty, from
    the ``P571_inception`` entry of ``wikidata_enrichment.wikidata_claims``.
    The dissolution date is read from the ``P576_dissolved`` claim.

    Args:
        data: The loaded custodian record.

    Returns:
        A TimeSpan dict with both ``*_of_the_begin`` fields set to the
        inception date and both ``*_of_the_end`` fields set to the dissolution
        date (or ``None`` when there is none), plus a ``sources`` entry when
        ``wikidata_entity_id`` is present. ``None`` when no inception date is
        found or it cannot be parsed.
    """
    # Sections that are present but null or non-mapping are treated as absent
    # instead of raising AttributeError on ``.get``.
    wikidata = data.get('wikidata_enrichment')
    if not isinstance(wikidata, dict):
        wikidata = {}
    claims = wikidata.get('wikidata_claims')
    if not isinstance(claims, dict):
        claims = {}

    inception = wikidata.get('wikidata_inception')
    if not inception:
        inception = _claim_value(claims, 'P571_inception')
    if not inception:
        return None

    inception_iso = parse_date(inception)
    if not inception_iso:
        return None

    dissolution = _claim_value(claims, 'P576_dissolved')
    dissolution_iso = parse_date(dissolution) if dissolution else None

    timespan: Dict[str, Any] = {
        'begin_of_the_begin': inception_iso,
        'end_of_the_begin': inception_iso,  # Precise date known
        # Both stay None when no dissolution date is recorded.
        'begin_of_the_end': dissolution_iso,
        'end_of_the_end': dissolution_iso,
    }

    wikidata_id = wikidata.get('wikidata_entity_id', '')
    if wikidata_id:
        timespan['sources'] = [f'Wikidata: {wikidata_id}']

    return timespan
|
|
|
|
|
|
def merge_timespans(existing: Optional[Dict], new: Optional[Dict]) -> Optional[Dict]:
    """Merge an existing and a new TimeSpan, keeping existing values where set.

    Args:
        existing: The TimeSpan already known, or ``None``/empty.
        new: The TimeSpan to fold in, or ``None``/empty.

    Returns:
        ``existing`` when ``new`` is falsy and ``new`` when ``existing`` is
        falsy (both returned as-is, not copied). Otherwise a new dict holding
        the four boundary fields (existing value first, then the new one),
        the combined ``notes`` joined by newlines, and the de-duplicated
        ``sources`` list; ``notes``/``sources`` are omitted when empty.
    """
    if not new:
        return existing
    if not existing:
        return new

    def as_list(value: Any) -> list:
        # ``sources: null`` must not crash, and a bare string is one source,
        # not a sequence of characters.
        if not value:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    boundary_fields = (
        'begin_of_the_begin',
        'end_of_the_begin',
        'begin_of_the_end',
        'end_of_the_end',
    )
    merged: Dict[str, Any] = {
        field: existing.get(field) or new.get(field)
        for field in boundary_fields
    }

    # Merge notes. The new note may already be one segment of previously
    # merged notes ("A\nB"); matching on newline-delimited segments keeps
    # repeated runs from appending it again.
    existing_notes = existing.get('notes')
    new_notes = new.get('notes')
    notes = [existing_notes] if existing_notes else []
    if new_notes:
        if isinstance(existing_notes, str) and isinstance(new_notes, str):
            already_present = f'\n{new_notes}\n' in f'\n{existing_notes}\n'
        else:
            already_present = new_notes == existing_notes
        if not already_present:
            notes.append(new_notes)
    if notes:
        merged['notes'] = '\n'.join(notes)

    # Merge sources, preserving order and dropping duplicates.
    sources = as_list(existing.get('sources'))
    for source in as_list(new.get('sources')):
        if source not in sources:
            sources.append(source)
    if sources:
        merged['sources'] = sources

    return merged
|
|
|
|
|
|
def process_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> bool:
    """Process a single YAML file, adding or extending its ``timespan``.

    Args:
        filepath: Path of the custodian YAML file.
        dry_run: When true, report whether the file would change but do not
            write it.
        verbose: When true, print the custodian name and the resulting
            begin/end values for files that change.

    Returns:
        ``True`` when the file was updated (or would be, in a dry run);
        ``False`` when it could not be read or written, holds no mapping,
        yields no TimeSpan, or the TimeSpan is unchanged.
    """
    # Broad catch is deliberate: one bad file must not abort a batch run.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return False

    # Empty documents and non-mapping documents (lists, scalars) have no
    # fields to enrich.
    if not data or not isinstance(data, dict):
        return False

    existing_timespan = data.get('timespan')
    if existing_timespan and not isinstance(existing_timespan, dict):
        # Merging needs a mapping; leave unexpected content untouched.
        print(f"Skipping {filepath}: existing timespan is not a mapping")
        return False

    # Fold each derived TimeSpan into the existing one; earlier values win.
    final_timespan = existing_timespan
    for candidate in (
        create_timespan_from_destruction(data),
        create_timespan_from_wikidata(data),
    ):
        if candidate:
            final_timespan = merge_timespans(final_timespan, candidate)

    if not final_timespan:
        return False

    # Nothing to do when the merge produced what is already stored.
    if final_timespan == existing_timespan:
        return False

    if verbose:
        custodian_name = data.get('custodian_name')
        if not isinstance(custodian_name, dict):
            custodian_name = {}
        name = custodian_name.get('claim_value', filepath.stem)
        print(f"Adding TimeSpan to: {name}")
        print(f"  begin: {final_timespan.get('begin_of_the_begin')}")
        print(f"  end: {final_timespan.get('begin_of_the_end')}")

    if dry_run:
        return True

    data['timespan'] = final_timespan

    try:
        # Serialise before opening the file: opening with 'w' truncates it,
        # so a dump failure after that point would destroy the record.
        serialized = yaml.dump(
            data,
            Dumper=PreserveQuotesDumper,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
        return True
    except Exception as e:
        print(f"Error writing {filepath}: {e}")
        return False
|
|
|
|
|
|
def main():
    """Command-line entry point: enrich every ``*.yaml`` file in a directory.

    Parses ``--dry-run``, ``--verbose``/``-v`` and ``--dir`` (default
    ``data/custodian``), runs ``process_file`` on each file that mentions
    destruction or Wikidata inception data, and prints summary counts.
    Exits with status 1 when the directory does not exist.
    """
    parser = argparse.ArgumentParser(description='Add TimeSpan to custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--dir', default='data/custodian', help='Directory to process')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    # is_dir() also rejects a path that exists but is a regular file.
    if not custodian_dir.is_dir():
        print(f"Directory not found: {custodian_dir}")
        sys.exit(1)

    print(f"Processing custodian files in: {custodian_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    total = 0
    updated = 0
    destroyed_count = 0
    inception_count = 0

    # Sorted so that runs (and their verbose output) are reproducible.
    for filepath in sorted(custodian_dir.glob('*.yaml')):
        total += 1

        # Cheap text pre-filter so that most files are never YAML-parsed.
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Same best-effort policy as process_file: report and move on.
            print(f"Error reading {filepath}: {e}")
            continue

        has_destruction = (
            'conflict_status:' in content
            or 'time_of_destruction:' in content
        )
        # Match the key regardless of how the value is quoted, and also the
        # P571 claim that create_timespan_from_wikidata falls back to.
        has_inception = (
            'wikidata_inception:' in content
            or 'P571_inception' in content
        )

        if not (has_destruction or has_inception):
            continue

        if process_file(filepath, args.dry_run, args.verbose):
            updated += 1
            # A file with both kinds of data is counted as destruction.
            if has_destruction:
                destroyed_count += 1
            else:
                inception_count += 1

    print()
    print(f"Total files scanned: {total}")
    print(f"Files updated: {updated}")
    print(f"  - From destruction data: {destroyed_count}")
    print(f"  - From Wikidata inception: {inception_count}")

    if args.dry_run:
        print("\n(Dry run - no files were modified)")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|