glam/scripts/deduplicate_tunisia.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

124 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Deduplicate Tunisia heritage dataset by merging duplicate Chemtou records.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
def merge_chemtou_records(record1, record2):
    """
    Merge two duplicate Chemtou Archaeological Museum records into one.

    record2 is used as the base (it carries the better description and
    digital-platform data); 'collections' and 'change_history' entries that
    exist only in record1 are appended to it, and a provenance note records
    the merge.

    Args:
        record1: First duplicate record (dict); donor of extra entries.
        record2: Second duplicate record (dict); base of the merge.

    Returns:
        A new dict (shallow copy of record2) with merged 'collections',
        'change_history', and an updated 'provenance' notes string.
        Neither input record is mutated.
    """
    merged = record2.copy()

    # Merge collections, keyed by 'collection_name'.  Tracking seen keys in
    # a set makes each membership test O(1) and also blocks duplicates
    # *within* record1 (the original nested-loop check only compared each
    # record1 entry against record2's entries).
    all_collections = list(record2.get('collections', []))
    seen_names = {c.get('collection_name') for c in all_collections}
    for coll in record1.get('collections', []):
        name = coll.get('collection_name')
        if name not in seen_names:
            seen_names.add(name)
            all_collections.append(coll)
    merged['collections'] = all_collections

    # Merge change_history, keyed by 'event_date', same dedup strategy.
    all_history = list(record2.get('change_history', []))
    seen_dates = {h.get('event_date') for h in all_history}
    for event in record1.get('change_history', []):
        date = event.get('event_date')
        if date not in seen_dates:
            seen_dates.add(date)
            all_history.append(event)
    merged['change_history'] = all_history

    # Record the merge in provenance so downstream users can audit it.
    if 'provenance' in merged:
        # Copy before mutating: 'merged' is a shallow copy of record2, so
        # without this the note would also be written into record2's own
        # provenance dict (shared-argument mutation bug in the original).
        provenance = dict(merged['provenance'])
        notes = provenance.get('notes', '')
        merge_note = f" Merged duplicate record from index 11 on {datetime.now(timezone.utc).isoformat()}."
        provenance['notes'] = notes + merge_note
        merged['provenance'] = provenance
    return merged
def main():
    """
    Deduplicate the Tunisia heritage dataset in place.

    Loads the YAML dataset, backs up the ORIGINAL file, merges the two
    Chemtou Archaeological Museum duplicate records into one, refreshes
    the dataset metadata, and writes the result back to the same file.
    Aborts (without writing) if the number of Chemtou records is not
    exactly two.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    output_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    backup_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml')

    print("Tunisia Heritage Dataset Deduplication")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Back up the ORIGINAL file before any mutation.  (Previously the
    # backup was dumped AFTER deduplication, so it was identical to the
    # output file and useless for recovery.)  A raw byte copy also
    # preserves the original formatting exactly.
    print(f"\nCreating backup: {backup_file}")
    backup_file.write_bytes(input_file.read_bytes())

    institutions = data['institutions']
    original_count = len(institutions)
    print(f"Original count: {original_count} institutions")

    # Locate the duplicate records by substring match on the name.
    chemtou_indices = []
    for i, inst in enumerate(institutions):
        if 'Chemtou' in inst.get('name', ''):
            chemtou_indices.append(i)
            print(f" Found at index {i}: {inst['name']} (GHCID: {inst.get('ghcid')})")

    if len(chemtou_indices) != 2:
        print(f"\nError: Expected 2 Chemtou records, found {len(chemtou_indices)}")
        return

    # Merge records
    print(f"\nMerging records at indices {chemtou_indices[0]} and {chemtou_indices[1]}...")
    record1 = institutions[chemtou_indices[0]]
    record2 = institutions[chemtou_indices[1]]
    merged_record = merge_chemtou_records(record1, record2)

    # Replace the second occurrence with the merged record, then drop the
    # first.  (Previously merged_record was computed but never written
    # back, so the merge result was silently discarded.)  Assigning before
    # popping is safe because chemtou_indices[1] > chemtou_indices[0].
    institutions[chemtou_indices[1]] = merged_record
    institutions.pop(chemtou_indices[0])

    # Refresh dataset metadata to reflect the merge.
    data['_metadata']['count'] = len(institutions)
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    if 'enhancements' not in data['_metadata']:
        data['_metadata']['enhancements'] = []
    data['_metadata']['enhancements'].append('Deduplication (Chemtou)')

    # Save deduplicated data
    print(f"Writing deduplicated data: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    final_count = len(institutions)
    print(f"\nDeduplication complete:")
    print(f" Before: {original_count} institutions")
    print(f" After: {final_count} institutions")
    print(f" Removed: {original_count - final_count} duplicate(s)")
    print(f"\n✅ Dataset saved to: {output_file}")
# Script entry point: run the deduplication only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()