#!/usr/bin/env python3
"""
Deduplicate Mexican Heritage Institutions

Problem: Mexican institutions appear twice in the dataset:
- First entry: Missing GHCID (value is None or 'N/A')
- Second entry: Has proper GHCID assigned

This script:
1. Identifies duplicate Mexican institutions by name
2. Merges records, preferring the one with GHCID
3. Removes GHCID=N/A duplicates
4. Preserves 26 unique institutions that only have GHCID entries
5. Exports deduplicated dataset

Author: GLAM Data Extraction Project
Date: 2025-11-13
"""

from typing import List, Dict, Any
from collections import defaultdict

# Default dataset locations used when the script is run without overrides.
DEFAULT_INPUT_FILE = 'data/instances/all/globalglam-20251111-brazil-campaign-final.yaml'
DEFAULT_OUTPUT_FILE = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

# Placeholder string the dataset uses for "no GHCID assigned".
MISSING_GHCID = 'N/A'


def load_institutions(filepath: str) -> List[Dict[str, Any]]:
    """Load institutions from a YAML file.

    Args:
        filepath: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        The parsed document, or an empty list when the file contains no
        YAML document at all (``yaml.safe_load`` yields ``None`` then).
    """
    # Imported here so the pure merge/dedup helpers below stay usable
    # (and testable) in environments without PyYAML installed.
    import yaml

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    return data or []


def save_institutions(institutions: List[Dict[str, Any]], filepath: str) -> None:
    """Write institutions to a YAML file, keeping key order and Unicode text.

    Args:
        institutions: Records to serialize.
        filepath: Destination path (written as UTF-8, overwritten if present).
    """
    import yaml  # see load_institutions for why this import is local

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)


def is_mexican_institution(inst: Dict[str, Any]) -> bool:
    """Check if institution is in Mexico.

    An institution counts as Mexican when at least one entry of its
    ``locations`` list has ``country == 'MX'``.  A missing or null
    ``locations`` value, and non-mapping entries inside it, are treated as
    "no location" instead of raising.
    """
    # `or []` also covers an explicit `locations: null` in the YAML.
    locations = inst.get('locations') or []
    return any(
        isinstance(loc, dict) and loc.get('country') == 'MX'
        for loc in locations
    )


def _has_ghcid(inst: Dict[str, Any]) -> bool:
    """Return True when the record carries a real GHCID (not empty / 'N/A')."""
    ghcid = inst.get('ghcid')
    return bool(ghcid) and ghcid != MISSING_GHCID


def _is_blank(value: Any) -> bool:
    """Return True for values that carry no information (None or empty)."""
    return value is None or value == '' or value == [] or value == {}


def _merge_keyed_lists(primary, secondary, key_fn) -> list:
    """Return a new list: all of *primary*, then unseen items of *secondary*.

    Items are compared by ``key_fn(item)``.  Neither input is modified;
    ``None`` inputs are treated as empty lists.
    """
    merged = list(primary or [])
    seen = {key_fn(item) for item in merged}
    for item in secondary or []:
        key = key_fn(item)
        if key not in seen:
            merged.append(item)
            seen.add(key)
    return merged


def merge_institution_records(without_ghcid: Dict[str, Any], with_ghcid: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two institution records, preferring with_ghcid version.

    Strategy:
    - Use with_ghcid as base (has proper GHCID)
    - Merge lists (identifiers, collections, digital_platforms,
      alternative_names) without duplicates
    - Add any fields that are missing or blank in with_ghcid from the
      without_ghcid version (the ``ghcid`` field itself is never taken
      from without_ghcid)

    Args:
        without_ghcid: The duplicate record lacking a GHCID.
        with_ghcid: The record that has the GHCID; its values win.

    Returns:
        A new dict.  Neither input dict nor their lists are mutated.
    """
    # Shallow copy: list fields are rebuilt below rather than appended to,
    # so the caller's records stay untouched.
    merged = with_ghcid.copy()

    # (field name, function producing the identity used for de-duplication)
    keyed_list_fields = (
        ('identifiers',
         lambda i: (i.get('identifier_scheme'), i.get('identifier_value'))),
        ('collections', lambda c: c.get('collection_name')),
        ('digital_platforms', lambda p: p.get('platform_name')),
    )
    for field, key_fn in keyed_list_fields:
        combined = _merge_keyed_lists(
            with_ghcid.get(field), without_ghcid.get(field), key_fn
        )
        if combined:
            merged[field] = combined

    # Alternative names are plain values: union them and sort for stable output.
    alt_names = set(with_ghcid.get('alternative_names') or [])
    alt_names.update(without_ghcid.get('alternative_names') or [])
    if alt_names:
        merged['alternative_names'] = sorted(alt_names)

    # Fill in anything the GHCID record lacks (this includes `description`).
    for key, value in without_ghcid.items():
        if key == 'ghcid':
            continue  # never let a missing/'N/A' GHCID leak into the result
        if _is_blank(merged.get(key)) and not _is_blank(value):
            merged[key] = value

    return merged


def deduplicate_mexican_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate Mexican institutions.

    Mexican records are grouped by exact ``name``.  A group consisting of
    exactly one record with a GHCID and one without is merged into a single
    record; any other multi-record group is kept unchanged and reported for
    manual review.  Records without a name are never merged, because an
    absent name gives no evidence that two records describe the same
    institution.

    Progress and summary statistics are printed to stdout.

    Returns:
        List of deduplicated institutions (all non-Mexican first, then the
        deduplicated Mexican ones)
    """
    mexican = []
    non_mexican = []

    # Separate Mexican from non-Mexican
    for inst in institutions:
        if is_mexican_institution(inst):
            mexican.append(inst)
        else:
            non_mexican.append(inst)

    print(f"Total institutions: {len(institutions)}")
    print(f"Mexican institutions: {len(mexican)}")
    print(f"Non-Mexican institutions: {len(non_mexican)}")

    # Group Mexican institutions by name; nameless records are set aside.
    by_name = defaultdict(list)
    unnamed = []
    for inst in mexican:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)
        else:
            unnamed.append(inst)

    print(f"Unique Mexican institution names: {len(by_name)}")
    if unnamed:
        print(f"WARNING: {len(unnamed)} Mexican institution(s) without a name kept as-is")

    # Process each name group
    deduplicated_mexican = []
    duplicates_merged = 0
    unique_kept = 0
    complex_kept = 0

    for name, instances in by_name.items():
        if len(instances) == 1:
            # Single instance - keep as-is
            deduplicated_mexican.append(instances[0])
            unique_kept += 1
            continue

        # Multiple instances - identify with/without GHCID
        with_ghcid = [i for i in instances if _has_ghcid(i)]
        without_ghcid = [i for i in instances if not _has_ghcid(i)]

        if len(with_ghcid) == 1 and len(without_ghcid) == 1:
            # Standard duplicate case: merge
            deduplicated_mexican.append(
                merge_institution_records(without_ghcid[0], with_ghcid[0])
            )
            duplicates_merged += 1
        else:
            # Multiple GHCIDs or complex case - keep all (need manual review)
            print(f"WARNING: Complex duplicate for '{name}': {len(with_ghcid)} with GHCID, {len(without_ghcid)} without")
            deduplicated_mexican.extend(instances)
            complex_kept += len(instances)

    deduplicated_mexican.extend(unnamed)
    unique_kept += len(unnamed)

    print("\nDeduplication results:")
    print(f"  Duplicates merged: {duplicates_merged}")
    print(f"  Unique institutions kept: {unique_kept}")
    print(f"  Records kept for manual review: {complex_kept}")
    print(f"  Total deduplicated Mexican institutions: {len(deduplicated_mexican)}")

    # Combine deduplicated Mexican with non-Mexican
    final = non_mexican + deduplicated_mexican

    print(f"\nFinal dataset size: {len(final)} (was {len(institutions)})")
    print(f"Reduction: {len(institutions) - len(final)} duplicates removed")

    return final


def main(input_file: str = DEFAULT_INPUT_FILE, output_file: str = DEFAULT_OUTPUT_FILE) -> None:
    """Main execution: load the dataset, deduplicate it, and write the result.

    Args:
        input_file: YAML dataset to read.  Defaults to the campaign file.
        output_file: Path the deduplicated YAML is written to.
    """
    print("="*80)
    print("Mexican Heritage Institutions Deduplication")
    print("="*80)
    print()

    # Load data
    print(f"Loading: {input_file}")
    institutions = load_institutions(input_file)

    # Deduplicate
    deduplicated = deduplicate_mexican_institutions(institutions)

    # Save
    print(f"\nSaving deduplicated dataset: {output_file}")
    save_institutions(deduplicated, output_file)

    print("\n✓ Deduplication complete!")
    print(f"  Output: {output_file}")


if __name__ == '__main__':
    main()