#!/usr/bin/env python3
"""Deduplicate Brazilian institutions in the global dataset.

Strategy:
1. Group Brazilian institutions by name
2. For each group with duplicates:
   - Merge location data (prefer most complete)
   - Merge identifiers (combine all unique identifiers)
   - Merge digital_platforms (combine all unique platforms)
   - Merge collections (combine all unique collections)
   - Prefer record with higher confidence_score
   - Update provenance to note the merge
3. Write deduplicated dataset
"""

import yaml
from datetime import datetime, timezone
from collections import defaultdict
from typing import List, Dict, Any
import sys


def load_yaml(filepath: str) -> List[Dict[str, Any]]:
    """Load a YAML file; an empty/blank file yields an empty list."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []


def save_yaml(filepath: str, data: List[Dict[str, Any]]):
    """Save *data* as YAML, preserving key order and unicode characters."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)


def is_brazilian(inst: Dict[str, Any]) -> bool:
    """Return True if the institution has at least one location with country 'BR'."""
    locations = inst.get('locations') or []
    return any(loc.get('country') == 'BR' for loc in locations)


def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
    """Merge location lists, preferring more complete records.

    When both lists are non-empty the result collapses to the single
    location with the most non-None fields; when one side is empty the
    other is returned unchanged.
    """
    if not loc1:
        return loc2 or []
    if not loc2:
        return loc1

    # Completeness = count of populated (non-None) fields.
    def location_completeness(loc):
        return sum(1 for k, v in loc.items() if v is not None)

    best_loc = max(loc1 + loc2, key=location_completeness)
    return [best_loc]


def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
    """Merge identifier lists, deduplicating by (scheme, value); first wins."""
    all_ids = (id1 or []) + (id2 or [])
    unique_ids = {}
    for ident in all_ids:
        key = (ident.get('identifier_scheme'), ident.get('identifier_value'))
        if key not in unique_ids:
            unique_ids[key] = ident
    return list(unique_ids.values())


def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
    """Merge platform lists, deduplicating by platform_url; first wins.

    Platforms without a platform_url are dropped (nothing to key on).
    """
    all_platforms = (plat1 or []) + (plat2 or [])
    unique_platforms = {}
    for plat in all_platforms:
        url = plat.get('platform_url')
        if url and url not in unique_platforms:
            unique_platforms[url] = plat
    return list(unique_platforms.values())


def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
    """Merge collection lists, deduplicating by collection_name; first wins.

    Collections without a collection_name are dropped (nothing to key on).
    """
    all_collections = (coll1 or []) + (coll2 or [])
    unique_collections = {}
    for coll in all_collections:
        name = coll.get('collection_name')
        if name and name not in unique_collections:
            unique_collections[name] = coll
    return list(unique_collections.values())


def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge multiple institution records into one.

    The record with the highest (confidence_score, populated-field count)
    becomes the base; locations, identifiers, digital_platforms and
    collections from the remaining records are folded in, and provenance
    notes record the merge.

    Unlike the original implementation, this neither reorders the caller's
    list in place nor mutates the nested ``provenance`` dict shared with
    the base input record through a shallow copy.
    """
    if len(institutions) == 1:
        return institutions[0]

    def score_completeness(inst):
        # Rank primarily by confidence, breaking ties by field count.
        confidence = inst.get('provenance', {}).get('confidence_score', 0.5)
        field_count = sum(1 for v in inst.values() if v is not None)
        return (confidence, field_count)

    # sorted() rather than list.sort() so the caller's list is untouched.
    ranked = sorted(institutions, key=score_completeness, reverse=True)

    # Shallow copy of the best record; nested structures we rewrite below
    # (locations, provenance, ...) are replaced, never mutated in place.
    merged = dict(ranked[0])

    # Merge locations.  Base-record locations go in loc1 and ALL donor
    # locations in loc2: the old code passed all_locations[1:], which
    # silently dropped the first donor location whenever the base record
    # had no locations of its own.
    donor_locations = []
    for inst in ranked[1:]:
        donor_locations.extend(inst.get('locations') or [])
    merged['locations'] = merge_locations(
        merged.get('locations') or [], donor_locations
    )

    # Merge identifiers (dedup by scheme/value pair).
    all_identifiers = []
    for inst in ranked:
        all_identifiers.extend(inst.get('identifiers') or [])
    if all_identifiers:
        merged['identifiers'] = merge_identifiers(all_identifiers, [])

    # Merge digital platforms (dedup by URL).
    all_platforms = []
    for inst in ranked:
        all_platforms.extend(inst.get('digital_platforms') or [])
    if all_platforms:
        merged['digital_platforms'] = merge_platforms(all_platforms, [])

    # Merge collections (dedup by name).
    all_collections = []
    for inst in ranked:
        all_collections.extend(inst.get('collections') or [])
    if all_collections:
        merged['collections'] = merge_collections(all_collections, [])

    # Record the merge in provenance.  Copy the dict first so we do not
    # mutate the provenance dict still referenced by the input record.
    provenance = dict(merged.get('provenance') or {})
    merge_note = (
        f"Merged {len(institutions)} duplicate records on "
        f"{datetime.now(timezone.utc).isoformat()}"
    )
    original_notes = provenance.get('notes', '')
    provenance['notes'] = (
        f"{original_notes}; {merge_note}" if original_notes else merge_note
    )
    merged['provenance'] = provenance

    return merged


def deduplicate_brazilian_institutions(input_file: str,
                                       output_file: str) -> Dict[str, Any]:
    """Deduplicate Brazilian institutions in the dataset.

    Non-Brazilian records pass through unchanged.  Brazilian records are
    grouped by exact name; each group with more than one record is merged
    into a single record.  The deduplicated dataset is written to
    *output_file* and a summary report dict is returned.
    """
    print(f"Loading {input_file}...")
    data = load_yaml(input_file)
    print(f"Total institutions: {len(data)}")

    # Partition the dataset.
    brazilian = [inst for inst in data if is_brazilian(inst)]
    non_brazilian = [inst for inst in data if not is_brazilian(inst)]
    print(f"Brazilian institutions: {len(brazilian)}")
    print(f"Non-Brazilian institutions: {len(non_brazilian)}")

    # Group Brazilian institutions by exact name.  Records with no name
    # are intentionally excluded from deduplication (and from the output).
    by_name = defaultdict(list)
    for inst in brazilian:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)

    duplicates = {name: insts for name, insts in by_name.items()
                  if len(insts) > 1}
    print(f"\nFound {len(duplicates)} duplicate names")

    # Merge each duplicate group; singletons pass through.
    deduplicated_brazilian = []
    merged_count = 0
    for name, institutions in by_name.items():
        if len(institutions) > 1:
            print(f"  Merging {len(institutions)}x: {name}")
            deduplicated_brazilian.append(merge_institutions(institutions))
            merged_count += len(institutions) - 1
        else:
            deduplicated_brazilian.append(institutions[0])

    deduplicated_data = non_brazilian + deduplicated_brazilian

    print("\nDeduplication complete:")
    print(f"  Original Brazilian institutions: {len(brazilian)}")
    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
    print(f"  Records merged: {merged_count}")
    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")

    print(f"\nSaving to {output_file}...")
    save_yaml(output_file, deduplicated_data)

    report = {
        'total_original': len(data),
        'total_deduplicated': len(deduplicated_data),
        'brazilian_original': len(brazilian),
        'brazilian_deduplicated': len(deduplicated_brazilian),
        'records_merged': merged_count,
        'duplicates_found': len(duplicates),
        'duplicate_names': sorted(duplicates.keys()),
    }
    return report


if __name__ == '__main__':
    # In-place deduplication: input and output are the same file, so a
    # backup is taken first.
    input_file = 'data/instances/all/globalglam-20251111.yaml'
    output_file = 'data/instances/all/globalglam-20251111.yaml'
    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'

    import shutil
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)

    report = deduplicate_brazilian_institutions(input_file, output_file)

    # Write a human-readable Markdown report.
    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
    with open(report_file, 'w') as f:
        f.write("# Brazilian Institutions Deduplication Report\n\n")
        f.write(f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Original dataset: {report['total_original']} institutions\n")
        f.write(f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n")
        f.write(f"- Brazilian institutions (original): {report['brazilian_original']}\n")
        f.write(f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n")
        f.write(f"- Records merged: {report['records_merged']}\n")
        f.write(f"- Duplicate names found: {report['duplicates_found']}\n\n")
        f.write("## Duplicate Names\n\n")
        for name in report['duplicate_names']:
            f.write(f"- {name}\n")

    print(f"\n✅ Report saved to {report_file}")
    print("\nDeduplication complete!")