#!/usr/bin/env python3
"""
Deduplicate Tunisia heritage dataset by merging duplicate Chemtou records.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path


def _merge_unique(base, extra, key):
    """Return ``base`` plus every item of ``extra`` whose ``key`` value is new.

    Order is preserved: base items first, then non-duplicate extras in their
    original order. Uses a seen-set so the merge is O(n) instead of the
    quadratic nested scan.
    """
    seen = {item.get(key) for item in base}
    merged = list(base)
    for item in extra:
        marker = item.get(key)
        if marker not in seen:
            merged.append(item)
            seen.add(marker)
    return merged


def merge_chemtou_records(record1, record2, source_index=11):
    """
    Merge two Chemtou Archaeological Museum records.
    Keep the more complete information from both.

    Args:
        record1: The duplicate record whose extra data is folded in.
        record2: The base record (better description / digital platform);
            it wins on every scalar field.
        source_index: Index of the duplicate record in the institutions
            list, recorded in the provenance note (defaults to 11 for
            backward compatibility with the original dataset).

    Returns:
        A new dict; neither input record is mutated.
    """
    # Use record2 as base (it has better description and digital platform)
    merged = record2.copy()

    # Combine collections and change_history, avoiding duplicates by their
    # natural key (collection_name / event_date).
    merged['collections'] = _merge_unique(
        record2.get('collections', []),
        record1.get('collections', []),
        'collection_name',
    )
    merged['change_history'] = _merge_unique(
        record2.get('change_history', []),
        record1.get('change_history', []),
        'event_date',
    )

    # Update provenance to note the merge. Copy the nested dict first:
    # record2.copy() is shallow, so writing through merged['provenance']
    # directly would silently mutate record2 as well.
    if 'provenance' in merged:
        provenance = dict(merged['provenance'])
        notes = provenance.get('notes', '')
        merge_note = (
            f" Merged duplicate record from index {source_index} on "
            f"{datetime.now(timezone.utc).isoformat()}."
        )
        provenance['notes'] = notes + merge_note
        merged['provenance'] = provenance

    return merged


def main():
    """Load the dataset, merge the two Chemtou records, back up, and save."""
    # Imported here so merge_chemtou_records stays importable (and testable)
    # in environments without PyYAML installed.
    import yaml

    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    output_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    backup_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml')

    print("Tunisia Heritage Dataset Deduplication")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    institutions = data['institutions']
    original_count = len(institutions)
    print(f"Original count: {original_count} institutions")

    # Find Chemtou duplicates
    chemtou_indices = []
    for i, inst in enumerate(institutions):
        if 'Chemtou' in inst.get('name', ''):
            chemtou_indices.append(i)
            print(f" Found at index {i}: {inst['name']} (GHCID: {inst.get('ghcid')})")

    if len(chemtou_indices) != 2:
        print(f"\nError: Expected 2 Chemtou records, found {len(chemtou_indices)}")
        return

    # Merge records
    print(f"\nMerging records at indices {chemtou_indices[0]} and {chemtou_indices[1]}...")
    record1 = institutions[chemtou_indices[0]]
    record2 = institutions[chemtou_indices[1]]
    merged_record = merge_chemtou_records(
        record1, record2, source_index=chemtou_indices[0]
    )

    # BUG FIX: the merged record was previously computed but never stored,
    # so the merge result was silently discarded. Store it in record2's
    # slot, then remove the first (duplicate) occurrence.
    institutions[chemtou_indices[1]] = merged_record
    institutions.pop(chemtou_indices[0])

    # Update metadata (create the section if the file lacks it)
    metadata = data.setdefault('_metadata', {})
    metadata['count'] = len(institutions)
    metadata['generated'] = datetime.now(timezone.utc).isoformat()
    metadata.setdefault('enhancements', []).append('Deduplication (Chemtou)')

    # BUG FIX: back up the ORIGINAL file bytes before overwriting it.
    # The previous version dumped the already-deduplicated data, making
    # the "backup" identical to the output and losing the original.
    print(f"\nCreating backup: {backup_file}")
    shutil.copy2(input_file, backup_file)

    # Save deduplicated data (input and output are the same path)
    print(f"Writing deduplicated data: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)

    final_count = len(institutions)
    print(f"\nDeduplication complete:")
    print(f" Before: {original_count} institutions")
    print(f" After: {final_count} institutions")
    print(f" Removed: {original_count - final_count} duplicate(s)")
    print(f"\n✅ Dataset saved to: {output_file}")


if __name__ == '__main__':
    main()