diff --git a/data/instances/brazil/DEDUPLICATION_REPORT.md b/data/instances/brazil/DEDUPLICATION_REPORT.md
new file mode 100644
index 0000000000..3d7263f0be
--- /dev/null
+++ b/data/instances/brazil/DEDUPLICATION_REPORT.md
@@ -0,0 +1,106 @@
+# Brazilian Institutions Deduplication Report
+
+**Date**: 2025-11-11T21:07:47.399717+00:00
+
+## Summary
+
+- Original dataset: 13502 institutions
+- Deduplicated dataset: 13411 institutions
+- Brazilian institutions (original): 212
+- Brazilian institutions (deduplicated): 121
+- Records merged: 91
+- Duplicate names found: 91
+
+## Duplicate Names
+
+- APESP
+- Arquivo Blumenau
+- Arquivo Público DF
+- Brasiliana Museus
+- CCBB Brasília
+- CEPAP-UNIFAP
+- Casa das Minas/Casa de Nagô
+- Centro Cultural Povos da Amazônia
+- Centro Dragão do Mar
+- Centro de Memória
+- DEAP Archives
+- Dom Bosco Museum
+- FCRB
+- FPC/IPAC
+- FUMDHAM
+- FUNDAJ
+- Forte Santa Catarina
+- Forte do Presépio
+- Forte dos Reis Magos
+- Geopark Araripe
+- Guarani-Kaiowá Projects
+- Hemeroteca Digital
+- IMS
+- Inhotim
+- Instituto Histórico
+- Instituto Insikiran
+- Instituto Ricardo Brennand
+- Jalapão Heritage
+- Lajedo de Soledade
+- MAM-BA
+- MAR/MAM
+- MARCO
+- MARGS
+- MASP
+- MAX
+- MEPE/IAHGP
+- MM Gerdau
+- MON
+- MUSEAR/UFMT
+- Mapa Cultural
+- Memorial do RS
+- Museu Goeldi
+- Museu Histórico
+- Museu Memória
+- Museu Nacional
+- Museu Palacinho
+- Museu Sacaca
+- Museu Tronco, Ramos e Raízes
+- Museu Zoroastro Artiaga
+- Museu da Borracha
+- Museu de Arqueologia e Etnologia
+- Museu do Homem Sergipano
+- Museu do Piauí
+- Museu dos Povos Acreanos
+- Natural History Museum
+- Ouro Preto System
+- Parque Memorial Quilombo dos Palmares
+- Pedra do Ingá
+- Pinacoteca
+- Railway Museum
+- Serra da Barriga
+- State Archives
+- São Luís UNESCO Site
+- Tainacan implementations
+- Teatro Amazonas
+- Teatro da Paz
+- UFAC Repository
+- UFAL Natural History Museum
+- UFBA Repository
+- UFC Repository
+- UFES Digital Libraries
+- UFG Repositories
+- UFMA
+- UFMG Tainacan Lab
+- UFMS Repositories
+- UFPA
+- UFPB/UEPB
+- UFPE
+- UFPI
+- UFPR
+- UFRGS LUME
+- UFRN
+- UFRR
+- UFS
+- UFSC Digital Art
+- UFT
+- UNESCO Goiás Velho
+- UNIFAP
+- UNIR
+- USP/UNICAMP/UNESP
+- UnB BCE
diff --git a/deduplicate_brazilian_institutions.py b/deduplicate_brazilian_institutions.py
new file mode 100644
index 0000000000..1f49d616ab
--- /dev/null
+++ b/deduplicate_brazilian_institutions.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+Deduplicate Brazilian institutions in the global dataset.
+
+Strategy:
+1. Group Brazilian institutions by name
+2. For each group with duplicates:
+   - Merge location data (prefer most complete)
+   - Merge identifiers (combine all unique identifiers)
+   - Merge digital_platforms (combine all unique platforms)
+   - Merge collections (combine all unique collections)
+   - Prefer record with higher confidence_score
+   - Update provenance to note the merge
+3. Write deduplicated dataset
+"""
+
+import yaml
+from datetime import datetime, timezone
+from collections import defaultdict
+from typing import List, Dict, Any
+import sys
+
+def load_yaml(filepath: str) -> List[Dict[str, Any]]:
+    """Load YAML file."""
+    with open(filepath, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f) or []
+
+def save_yaml(filepath: str, data: List[Dict[str, Any]]):
+    """Save YAML file."""
+    with open(filepath, 'w', encoding='utf-8') as f:
+        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
+
+def is_brazilian(inst: Dict[str, Any]) -> bool:
+    """Check if institution is Brazilian."""
+    locations = inst.get('locations') or []
+    return any(loc.get('country') == 'BR' for loc in locations)
+
+def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
+    """Merge location lists, preferring more complete records."""
+    if not loc1:
+        return loc2 or []
+    if not loc2:
+        return loc1
+
+    # Use location with more fields
+    def location_completeness(loc):
+        return sum(1 for k, v in loc.items() if v is not None)
+
+    best_loc = max(loc1 + loc2, key=location_completeness)
+    return [best_loc]
+
+def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
+    """Merge identifier lists, removing duplicates."""
+    all_ids = (id1 or []) + (id2 or [])
+
+    # Deduplicate by (scheme, value) tuple
+    unique_ids = {}
+    for ident in all_ids:
+        key = (ident.get('identifier_scheme'), ident.get('identifier_value'))
+        if key not in unique_ids:
+            unique_ids[key] = ident
+
+    return list(unique_ids.values())
+
+def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
+    """Merge platform lists, removing duplicates."""
+    all_platforms = (plat1 or []) + (plat2 or [])
+
+    # Deduplicate by platform_url
+    unique_platforms = {}
+    for plat in all_platforms:
+        url = plat.get('platform_url')
+        if url and url not in unique_platforms:
+            unique_platforms[url] = plat
+
+    return list(unique_platforms.values())
+
+def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
+    """Merge collection lists, removing duplicates."""
+    all_collections = (coll1 or []) + (coll2 or [])
+
+    # Deduplicate by collection_name
+    unique_collections = {}
+    for coll in all_collections:
+        name = coll.get('collection_name')
+        if name and name not in unique_collections:
+            unique_collections[name] = coll
+
+    return list(unique_collections.values())
+
+def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Merge multiple institution records into one."""
+    if len(institutions) == 1:
+        return institutions[0]
+
+    # Sort by confidence score (prefer higher) then by completeness
+    def score_completeness(inst):
+        confidence = inst.get('provenance', {}).get('confidence_score', 0.5)
+        field_count = sum(1 for v in inst.values() if v is not None)
+        return (confidence, field_count)
+
+    institutions.sort(key=score_completeness, reverse=True)
+
+    # Use best record as base
+    merged = institutions[0].copy()
+
+    # Merge locations from all records
+    all_locations = []
+    for inst in institutions:
+        if inst.get('locations'):
+            all_locations.extend(inst['locations'])
+    merged['locations'] = merge_locations(
+        merged.get('locations') or [],
+        all_locations[1:] if len(all_locations) > 1 else []
+    )
+
+    # Merge identifiers
+    all_identifiers = []
+    for inst in institutions:
+        if inst.get('identifiers'):
+            all_identifiers.extend(inst['identifiers'])
+    if all_identifiers:
+        merged['identifiers'] = merge_identifiers([], all_identifiers)
+
+    # Merge digital platforms
+    all_platforms = []
+    for inst in institutions:
+        if inst.get('digital_platforms'):
+            all_platforms.extend(inst['digital_platforms'])
+    if all_platforms:
+        merged['digital_platforms'] = merge_platforms([], all_platforms)
+
+    # Merge collections
+    all_collections = []
+    for inst in institutions:
+        if inst.get('collections'):
+            all_collections.extend(inst['collections'])
+    if all_collections:
+        merged['collections'] = merge_collections([], all_collections)
+
+    # Update provenance to note merge
+    if 'provenance' not in merged:
+        merged['provenance'] = {}
+
+    original_notes = merged['provenance'].get('notes', '')
+    merge_note = f"Merged {len(institutions)} duplicate records on {datetime.now(timezone.utc).isoformat()}"
+
+    if original_notes:
+        merged['provenance']['notes'] = f"{original_notes}; {merge_note}"
+    else:
+        merged['provenance']['notes'] = merge_note
+
+    return merged
+
+def deduplicate_brazilian_institutions(input_file: str, output_file: str) -> Dict[str, Any]:
+    """Deduplicate Brazilian institutions in dataset."""
+
+    print(f"Loading {input_file}...")
+    data = load_yaml(input_file)
+
+    print(f"Total institutions: {len(data)}")
+
+    # Separate Brazilian and non-Brazilian institutions
+    brazilian = [inst for inst in data if is_brazilian(inst)]
+    non_brazilian = [inst for inst in data if not is_brazilian(inst)]
+
+    print(f"Brazilian institutions: {len(brazilian)}")
+    print(f"Non-Brazilian institutions: {len(non_brazilian)}")
+
+    # Group Brazilian institutions by name
+    by_name = defaultdict(list)
+    for inst in brazilian:
+        name = inst.get('name')
+        if name:
+            by_name[name].append(inst)
+
+    # Find duplicates
+    duplicates = {name: insts for name, insts in by_name.items() if len(insts) > 1}
+    print(f"\nFound {len(duplicates)} duplicate names")
+
+    # Merge duplicates
+    deduplicated_brazilian = []
+    merged_count = 0
+
+    for name, institutions in by_name.items():
+        if len(institutions) > 1:
+            print(f"  Merging {len(institutions)}x: {name}")
+            merged = merge_institutions(institutions)
+            deduplicated_brazilian.append(merged)
+            merged_count += len(institutions) - 1
+        else:
+            deduplicated_brazilian.append(institutions[0])
+
+    # Combine with non-Brazilian institutions
+    deduplicated_data = non_brazilian + deduplicated_brazilian
+
+    print(f"\nDeduplication complete:")
+    print(f"  Original Brazilian institutions: {len(brazilian)}")
+    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
+    print(f"  Records merged: {merged_count}")
+    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")
+
+    # Save deduplicated dataset
+    print(f"\nSaving to {output_file}...")
+    save_yaml(output_file, deduplicated_data)
+
+    # Generate report
+    report = {
+        'total_original': len(data),
+        'total_deduplicated': len(deduplicated_data),
+        'brazilian_original': len(brazilian),
+        'brazilian_deduplicated': len(deduplicated_brazilian),
+        'records_merged': merged_count,
+        'duplicates_found': len(duplicates),
+        'duplicate_names': sorted(duplicates.keys())
+    }
+
+    return report
+
+if __name__ == '__main__':
+    input_file = 'data/instances/all/globalglam-20251111.yaml'
+    output_file = 'data/instances/all/globalglam-20251111.yaml'
+    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'
+
+    # Create backup
+    import shutil
+    print(f"Creating backup: {backup_file}")
+    shutil.copy(input_file, backup_file)
+
+    # Run deduplication
+    report = deduplicate_brazilian_institutions(input_file, output_file)
+
+    # Save report
+    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
+    with open(report_file, 'w') as f:
+        f.write("# Brazilian Institutions Deduplication Report\n\n")
+        f.write(f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n")
+        f.write("## Summary\n\n")
+        f.write(f"- Original dataset: {report['total_original']} institutions\n")
+        f.write(f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n")
+        f.write(f"- Brazilian institutions (original): {report['brazilian_original']}\n")
+        f.write(f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n")
+        f.write(f"- Records merged: {report['records_merged']}\n")
+        f.write(f"- Duplicate names found: {report['duplicates_found']}\n\n")
+        f.write("## Duplicate Names\n\n")
+        for name in report['duplicate_names']:
+            f.write(f"- {name}\n")
+
+    print(f"\n✅ Report saved to {report_file}")
+    print("\nDeduplication complete!")
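
To illustrate the merge strategy described in the script's docstring, here is a minimal sketch of `merge_institutions()` applied to two toy records, assuming the script is importable from the repository root; the record fields and values below are invented for illustration and are not taken from the dataset.

```python
# Illustration only: toy records exercising merge_institutions() from the
# script above. All field values here are made up.
from deduplicate_brazilian_institutions import merge_institutions

record_a = {
    'name': 'Museu Nacional',
    'locations': [{'country': 'BR', 'city': 'Rio de Janeiro'}],
    'identifiers': [{'identifier_scheme': 'example', 'identifier_value': 'MN-001'}],
    'provenance': {'confidence_score': 0.9},
}
record_b = {
    'name': 'Museu Nacional',
    'locations': [{'country': 'BR'}],  # less complete location
    'digital_platforms': [{'platform_url': 'https://example.org/acervo'}],
    'provenance': {'confidence_score': 0.6},
}

merged = merge_institutions([record_a, record_b])

# The higher-confidence record is used as the base, the more complete location
# is kept, platforms/identifiers from both records are combined, and the
# provenance notes record the merge.
print(merged['locations'])             # [{'country': 'BR', 'city': 'Rio de Janeiro'}]
print(merged['digital_platforms'])     # [{'platform_url': 'https://example.org/acervo'}]
print(merged['provenance']['notes'])   # "Merged 2 duplicate records on <timestamp>"
```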