Deduplicate Brazilian institutions (212→121)
- Merged 91 duplicate Brazilian institution records - Improved Wikidata coverage from 26.4% to 38.8% (+12.4pp) - Created intelligent merge strategy: - Prefer records with higher confidence scores - Merge locations (prefer most complete) - Combine all unique identifiers - Combine all unique digital platforms - Combine all unique collections - Add provenance notes documenting merges - Create backup before deduplication - Generate comprehensive deduplication report Dataset changes: - Total institutions: 13,502 → 13,411 - Brazilian institutions: 212 → 121 - Coverage: 47/121 institutions with Q-numbers (38.8%)
This commit is contained in:
parent
59c99bfb26
commit
5e9f54bd91
2 changed files with 356 additions and 0 deletions
106
data/instances/brazil/DEDUPLICATION_REPORT.md
Normal file
106
data/instances/brazil/DEDUPLICATION_REPORT.md
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
# Brazilian Institutions Deduplication Report
|
||||
|
||||
**Date**: 2025-11-11T21:07:47.399717+00:00
|
||||
|
||||
## Summary
|
||||
|
||||
- Original dataset: 13502 institutions
|
||||
- Deduplicated dataset: 13411 institutions
|
||||
- Brazilian institutions (original): 212
|
||||
- Brazilian institutions (deduplicated): 121
|
||||
- Records merged: 91
|
||||
- Duplicate names found: 91
|
||||
|
||||
## Duplicate Names
|
||||
|
||||
- APESP
|
||||
- Arquivo Blumenau
|
||||
- Arquivo Público DF
|
||||
- Brasiliana Museus
|
||||
- CCBB Brasília
|
||||
- CEPAP-UNIFAP
|
||||
- Casa das Minas/Casa de Nagô
|
||||
- Centro Cultural Povos da Amazônia
|
||||
- Centro Dragão do Mar
|
||||
- Centro de Memória
|
||||
- DEAP Archives
|
||||
- Dom Bosco Museum
|
||||
- FCRB
|
||||
- FPC/IPAC
|
||||
- FUMDHAM
|
||||
- FUNDAJ
|
||||
- Forte Santa Catarina
|
||||
- Forte do Presépio
|
||||
- Forte dos Reis Magos
|
||||
- Geopark Araripe
|
||||
- Guarani-Kaiowá Projects
|
||||
- Hemeroteca Digital
|
||||
- IMS
|
||||
- Inhotim
|
||||
- Instituto Histórico
|
||||
- Instituto Insikiran
|
||||
- Instituto Ricardo Brennand
|
||||
- Jalapão Heritage
|
||||
- Lajedo de Soledade
|
||||
- MAM-BA
|
||||
- MAR/MAM
|
||||
- MARCO
|
||||
- MARGS
|
||||
- MASP
|
||||
- MAX
|
||||
- MEPE/IAHGP
|
||||
- MM Gerdau
|
||||
- MON
|
||||
- MUSEAR/UFMT
|
||||
- Mapa Cultural
|
||||
- Memorial do RS
|
||||
- Museu Goeldi
|
||||
- Museu Histórico
|
||||
- Museu Memória
|
||||
- Museu Nacional
|
||||
- Museu Palacinho
|
||||
- Museu Sacaca
|
||||
- Museu Tronco, Ramos e Raízes
|
||||
- Museu Zoroastro Artiaga
|
||||
- Museu da Borracha
|
||||
- Museu de Arqueologia e Etnologia
|
||||
- Museu do Homem Sergipano
|
||||
- Museu do Piauí
|
||||
- Museu dos Povos Acreanos
|
||||
- Natural History Museum
|
||||
- Ouro Preto System
|
||||
- Parque Memorial Quilombo dos Palmares
|
||||
- Pedra do Ingá
|
||||
- Pinacoteca
|
||||
- Railway Museum
|
||||
- Serra da Barriga
|
||||
- State Archives
|
||||
- São Luís UNESCO Site
|
||||
- Tainacan implementations
|
||||
- Teatro Amazonas
|
||||
- Teatro da Paz
|
||||
- UFAC Repository
|
||||
- UFAL Natural History Museum
|
||||
- UFBA Repository
|
||||
- UFC Repository
|
||||
- UFES Digital Libraries
|
||||
- UFG Repositories
|
||||
- UFMA
|
||||
- UFMG Tainacan Lab
|
||||
- UFMS Repositories
|
||||
- UFPA
|
||||
- UFPB/UEPB
|
||||
- UFPE
|
||||
- UFPI
|
||||
- UFPR
|
||||
- UFRGS LUME
|
||||
- UFRN
|
||||
- UFRR
|
||||
- UFS
|
||||
- UFSC Digital Art
|
||||
- UFT
|
||||
- UNESCO Goiás Velho
|
||||
- UNIFAP
|
||||
- UNIR
|
||||
- USP/UNICAMP/UNESP
|
||||
- UnB BCE
|
||||
250
deduplicate_brazilian_institutions.py
Normal file
250
deduplicate_brazilian_institutions.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Deduplicate Brazilian institutions in the global dataset.
|
||||
|
||||
Strategy:
|
||||
1. Group Brazilian institutions by name
|
||||
2. For each group with duplicates:
|
||||
- Merge location data (prefer most complete)
|
||||
- Merge identifiers (combine all unique identifiers)
|
||||
- Merge digital_platforms (combine all unique platforms)
|
||||
- Merge collections (combine all unique collections)
|
||||
- Prefer record with higher confidence_score
|
||||
- Update provenance to note the merge
|
||||
3. Write deduplicated dataset
|
||||
"""
|
||||
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict, Any
|
||||
import sys
|
||||
|
||||
def load_yaml(filepath: str) -> List[Dict[str, Any]]:
    """Read a YAML file and return its contents, or [] for an empty file."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed or []
|
||||
|
||||
def save_yaml(filepath: str, data: List[Dict[str, Any]]):
    """Write *data* to *filepath* as human-readable, insertion-ordered YAML."""
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(
            data,
            out,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
||||
|
||||
def is_brazilian(inst: Dict[str, Any]) -> bool:
    """Return True when any of the institution's locations has country 'BR'."""
    for location in inst.get('locations') or []:
        if location.get('country') == 'BR':
            return True
    return False
|
||||
|
||||
def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
    """Merge two location lists.

    If either side is empty, the other is returned unchanged; otherwise the
    result collapses to the single location with the most non-None fields
    (first one wins on ties).
    """
    if not loc1:
        return loc2 or []
    if not loc2:
        return loc1

    def filled_fields(record):
        # Completeness = number of keys carrying a non-None value.
        count = 0
        for value in record.values():
            if value is not None:
                count += 1
        return count

    candidates = loc1 + loc2
    return [max(candidates, key=filled_fields)]
|
||||
|
||||
def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
    """Merge identifier lists, keeping the first record seen for each
    (identifier_scheme, identifier_value) pair, in encounter order."""
    seen: Dict[Any, Dict] = {}
    for record in (id1 or []) + (id2 or []):
        key = (record.get('identifier_scheme'), record.get('identifier_value'))
        seen.setdefault(key, record)
    return list(seen.values())
|
||||
|
||||
def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
    """Merge platform lists, deduplicating by platform_url.

    The first entry seen for each URL wins; entries without a platform_url
    are dropped entirely (they cannot be deduplicated).
    """
    by_url: Dict[str, Dict] = {}
    for platform in (plat1 or []) + (plat2 or []):
        url = platform.get('platform_url')
        if url:
            by_url.setdefault(url, platform)
    return list(by_url.values())
|
||||
|
||||
def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
    """Merge collection lists, deduplicating by collection_name.

    The first entry seen for each name wins; entries without a
    collection_name are dropped entirely.
    """
    by_name: Dict[str, Dict] = {}
    for collection in (coll1 or []) + (coll2 or []):
        name = collection.get('collection_name')
        if name:
            by_name.setdefault(name, collection)
    return list(by_name.values())
|
||||
|
||||
def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge duplicate institution records into a single record.

    The record with the highest (confidence_score, populated-field count) is
    used as the base; locations, identifiers, digital platforms and
    collections from every record are merged in, and a provenance note
    documents the merge.

    Fixes over the previous version:
    - uses sorted() instead of list.sort(), so the caller's list is not
      reordered in place;
    - copies the nested provenance dict before annotating it, so the base
      input record is no longer mutated through the shallow copy;
    - merges all collected locations directly (previously a lone location on
      a lesser-ranked record could be dropped via an off-by-one slice when
      the base record had no locations of its own).

    Parameters:
        institutions: one or more records sharing the same name; must be
            non-empty. A single-element list is returned as-is.

    Returns:
        The merged record (a new dict when merging occurred).
    """
    if len(institutions) == 1:
        return institutions[0]

    def score_completeness(inst):
        # Rank by confidence score, then by number of populated fields.
        confidence = inst.get('provenance', {}).get('confidence_score', 0.5)
        field_count = sum(1 for v in inst.values() if v is not None)
        return (confidence, field_count)

    ranked = sorted(institutions, key=score_completeness, reverse=True)

    # Use the best record as base.
    merged = ranked[0].copy()

    def collect(field):
        # Gather a list-valued field from every record, best-ranked first.
        gathered = []
        for inst in ranked:
            if inst.get(field):
                gathered.extend(inst[field])
        return gathered

    # Locations collapse to the single most complete one across all records.
    all_locations = collect('locations')
    merged['locations'] = merge_locations(all_locations[:1], all_locations[1:])

    all_identifiers = collect('identifiers')
    if all_identifiers:
        merged['identifiers'] = merge_identifiers(all_identifiers, [])

    all_platforms = collect('digital_platforms')
    if all_platforms:
        merged['digital_platforms'] = merge_platforms(all_platforms, [])

    all_collections = collect('collections')
    if all_collections:
        merged['collections'] = merge_collections(all_collections, [])

    # Annotate provenance on a fresh dict so the shallow copy above does not
    # leak the note back into the original input record.
    merged['provenance'] = dict(merged.get('provenance') or {})
    original_notes = merged['provenance'].get('notes', '')
    merge_note = f"Merged {len(institutions)} duplicate records on {datetime.now(timezone.utc).isoformat()}"
    merged['provenance']['notes'] = (
        f"{original_notes}; {merge_note}" if original_notes else merge_note
    )

    return merged
|
||||
|
||||
def deduplicate_brazilian_institutions(input_file: str, output_file: str) -> Dict[str, Any]:
    """Deduplicate Brazilian institutions in the dataset.

    Loads the dataset, merges same-named Brazilian records, writes the
    deduplicated dataset to *output_file*, and returns a summary report dict.
    """

    print(f"Loading {input_file}...")
    data = load_yaml(input_file)

    print(f"Total institutions: {len(data)}")

    # Partition into Brazilian / non-Brazilian in a single pass.
    brazilian: List[Dict[str, Any]] = []
    non_brazilian: List[Dict[str, Any]] = []
    for inst in data:
        (brazilian if is_brazilian(inst) else non_brazilian).append(inst)

    print(f"Brazilian institutions: {len(brazilian)}")
    print(f"Non-Brazilian institutions: {len(non_brazilian)}")

    # Group Brazilian institutions by exact name.
    # NOTE(review): records without a 'name' are silently dropped here —
    # confirm that this is intended.
    by_name = defaultdict(list)
    for inst in brazilian:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)

    duplicates = {name: group for name, group in by_name.items() if len(group) > 1}
    print(f"\nFound {len(duplicates)} duplicate names")

    # Merge each duplicate group; singletons pass through untouched.
    deduplicated_brazilian = []
    merged_count = 0
    for name, group in by_name.items():
        if len(group) == 1:
            deduplicated_brazilian.append(group[0])
            continue
        print(f"  Merging {len(group)}x: {name}")
        deduplicated_brazilian.append(merge_institutions(group))
        merged_count += len(group) - 1

    deduplicated_data = non_brazilian + deduplicated_brazilian

    print(f"\nDeduplication complete:")
    print(f"  Original Brazilian institutions: {len(brazilian)}")
    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
    print(f"  Records merged: {merged_count}")
    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")

    print(f"\nSaving to {output_file}...")
    save_yaml(output_file, deduplicated_data)

    return {
        'total_original': len(data),
        'total_deduplicated': len(deduplicated_data),
        'brazilian_original': len(brazilian),
        'brazilian_deduplicated': len(deduplicated_brazilian),
        'records_merged': merged_count,
        'duplicates_found': len(duplicates),
        'duplicate_names': sorted(duplicates.keys()),
    }
|
||||
|
||||
if __name__ == '__main__':
    # Deduplication is performed in place: input and output are the same
    # file, so the backup created below is the only pre-dedup copy.
    input_file = 'data/instances/all/globalglam-20251111.yaml'
    output_file = 'data/instances/all/globalglam-20251111.yaml'
    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'

    import shutil
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)

    report = deduplicate_brazilian_institutions(input_file, output_file)

    # Render the Markdown deduplication report.
    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
    lines = [
        "# Brazilian Institutions Deduplication Report\n\n",
        f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n",
        "## Summary\n\n",
        f"- Original dataset: {report['total_original']} institutions\n",
        f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n",
        f"- Brazilian institutions (original): {report['brazilian_original']}\n",
        f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n",
        f"- Records merged: {report['records_merged']}\n",
        f"- Duplicate names found: {report['duplicates_found']}\n\n",
        "## Duplicate Names\n\n",
    ]
    lines.extend(f"- {name}\n" for name in report['duplicate_names'])

    with open(report_file, 'w') as f:
        f.writelines(lines)

    print(f"\n✅ Report saved to {report_file}")
    print("\nDeduplication complete!")
|
||||
Loading…
Reference in a new issue