Deduplicate Brazilian institutions (212→121)
- Merged 91 duplicate Brazilian institution records - Improved Wikidata coverage from 26.4% to 38.8% (+12.4pp) - Created intelligent merge strategy: - Prefer records with higher confidence scores - Merge locations (prefer most complete) - Combine all unique identifiers - Combine all unique digital platforms - Combine all unique collections - Add provenance notes documenting merges - Create backup before deduplication - Generate comprehensive deduplication report Dataset changes: - Total institutions: 13,502 → 13,411 - Brazilian institutions: 212 → 121 - Coverage: 47/121 institutions with Q-numbers (38.8%)
This commit is contained in:
parent
59c99bfb26
commit
5e9f54bd91
2 changed files with 356 additions and 0 deletions
106
data/instances/brazil/DEDUPLICATION_REPORT.md
Normal file
106
data/instances/brazil/DEDUPLICATION_REPORT.md
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
# Brazilian Institutions Deduplication Report
|
||||
|
||||
**Date**: 2025-11-11T21:07:47.399717+00:00
|
||||
|
||||
## Summary
|
||||
|
||||
- Original dataset: 13502 institutions
|
||||
- Deduplicated dataset: 13411 institutions
|
||||
- Brazilian institutions (original): 212
|
||||
- Brazilian institutions (deduplicated): 121
|
||||
- Records merged: 91
|
||||
- Duplicate names found: 91
|
||||
|
||||
## Duplicate Names
|
||||
|
||||
- APESP
|
||||
- Arquivo Blumenau
|
||||
- Arquivo Público DF
|
||||
- Brasiliana Museus
|
||||
- CCBB Brasília
|
||||
- CEPAP-UNIFAP
|
||||
- Casa das Minas/Casa de Nagô
|
||||
- Centro Cultural Povos da Amazônia
|
||||
- Centro Dragão do Mar
|
||||
- Centro de Memória
|
||||
- DEAP Archives
|
||||
- Dom Bosco Museum
|
||||
- FCRB
|
||||
- FPC/IPAC
|
||||
- FUMDHAM
|
||||
- FUNDAJ
|
||||
- Forte Santa Catarina
|
||||
- Forte do Presépio
|
||||
- Forte dos Reis Magos
|
||||
- Geopark Araripe
|
||||
- Guarani-Kaiowá Projects
|
||||
- Hemeroteca Digital
|
||||
- IMS
|
||||
- Inhotim
|
||||
- Instituto Histórico
|
||||
- Instituto Insikiran
|
||||
- Instituto Ricardo Brennand
|
||||
- Jalapão Heritage
|
||||
- Lajedo de Soledade
|
||||
- MAM-BA
|
||||
- MAR/MAM
|
||||
- MARCO
|
||||
- MARGS
|
||||
- MASP
|
||||
- MAX
|
||||
- MEPE/IAHGP
|
||||
- MM Gerdau
|
||||
- MON
|
||||
- MUSEAR/UFMT
|
||||
- Mapa Cultural
|
||||
- Memorial do RS
|
||||
- Museu Goeldi
|
||||
- Museu Histórico
|
||||
- Museu Memória
|
||||
- Museu Nacional
|
||||
- Museu Palacinho
|
||||
- Museu Sacaca
|
||||
- Museu Tronco, Ramos e Raízes
|
||||
- Museu Zoroastro Artiaga
|
||||
- Museu da Borracha
|
||||
- Museu de Arqueologia e Etnologia
|
||||
- Museu do Homem Sergipano
|
||||
- Museu do Piauí
|
||||
- Museu dos Povos Acreanos
|
||||
- Natural History Museum
|
||||
- Ouro Preto System
|
||||
- Parque Memorial Quilombo dos Palmares
|
||||
- Pedra do Ingá
|
||||
- Pinacoteca
|
||||
- Railway Museum
|
||||
- Serra da Barriga
|
||||
- State Archives
|
||||
- São Luís UNESCO Site
|
||||
- Tainacan implementations
|
||||
- Teatro Amazonas
|
||||
- Teatro da Paz
|
||||
- UFAC Repository
|
||||
- UFAL Natural History Museum
|
||||
- UFBA Repository
|
||||
- UFC Repository
|
||||
- UFES Digital Libraries
|
||||
- UFG Repositories
|
||||
- UFMA
|
||||
- UFMG Tainacan Lab
|
||||
- UFMS Repositories
|
||||
- UFPA
|
||||
- UFPB/UEPB
|
||||
- UFPE
|
||||
- UFPI
|
||||
- UFPR
|
||||
- UFRGS LUME
|
||||
- UFRN
|
||||
- UFRR
|
||||
- UFS
|
||||
- UFSC Digital Art
|
||||
- UFT
|
||||
- UNESCO Goiás Velho
|
||||
- UNIFAP
|
||||
- UNIR
|
||||
- USP/UNICAMP/UNESP
|
||||
- UnB BCE
|
||||
250
deduplicate_brazilian_institutions.py
Normal file
250
deduplicate_brazilian_institutions.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Deduplicate Brazilian institutions in the global dataset.
|
||||
|
||||
Strategy:
|
||||
1. Group Brazilian institutions by name
|
||||
2. For each group with duplicates:
|
||||
- Merge location data (prefer most complete)
|
||||
- Merge identifiers (combine all unique identifiers)
|
||||
- Merge digital_platforms (combine all unique platforms)
|
||||
- Merge collections (combine all unique collections)
|
||||
- Prefer record with higher confidence_score
|
||||
- Update provenance to note the merge
|
||||
3. Write deduplicated dataset
|
||||
"""
|
||||
|
||||
import yaml
|
||||
from datetime import datetime, timezone
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict, Any
|
||||
import sys
|
||||
|
||||
def load_yaml(filepath: str) -> List[Dict[str, Any]]:
    """Read a YAML file and return its contents, or [] for an empty file."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed or []
|
||||
|
||||
def save_yaml(filepath: str, data: List[Dict[str, Any]]):
    """Write *data* to *filepath* as human-readable, insertion-ordered YAML."""
    with open(filepath, 'w', encoding='utf-8') as out:
        yaml.dump(
            data,
            out,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
||||
|
||||
def is_brazilian(inst: Dict[str, Any]) -> bool:
    """Return True when any of the institution's locations has country 'BR'."""
    for location in inst.get('locations') or []:
        if location.get('country') == 'BR':
            return True
    return False
|
||||
|
||||
def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
    """Merge two location lists.

    If either side is empty, the other is returned unchanged; otherwise the
    result collapses to the single location with the most non-None fields
    (first one wins on ties).
    """
    if not loc1:
        return loc2 or []
    if not loc2:
        return loc1

    def filled_fields(record):
        # Completeness = number of keys carrying a non-None value.
        count = 0
        for value in record.values():
            if value is not None:
                count += 1
        return count

    candidates = loc1 + loc2
    return [max(candidates, key=filled_fields)]
|
||||
|
||||
def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
    """Merge identifier lists, keeping the first record seen for each
    (identifier_scheme, identifier_value) pair, in encounter order."""
    seen: Dict[Any, Dict] = {}
    for record in (id1 or []) + (id2 or []):
        key = (record.get('identifier_scheme'), record.get('identifier_value'))
        seen.setdefault(key, record)
    return list(seen.values())
|
||||
|
||||
def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
    """Merge platform lists, deduplicating by platform_url.

    The first entry seen for each URL wins; entries without a platform_url
    are dropped entirely (they cannot be deduplicated).
    """
    by_url: Dict[str, Dict] = {}
    for platform in (plat1 or []) + (plat2 or []):
        url = platform.get('platform_url')
        if url:
            by_url.setdefault(url, platform)
    return list(by_url.values())
|
||||
|
||||
def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
    """Merge collection lists, deduplicating by collection_name.

    The first entry seen for each name wins; entries without a
    collection_name are dropped entirely.
    """
    by_name: Dict[str, Dict] = {}
    for collection in (coll1 or []) + (coll2 or []):
        name = collection.get('collection_name')
        if name:
            by_name.setdefault(name, collection)
    return list(by_name.values())
|
||||
|
||||
def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge duplicate institution records into a single record.

    The record with the highest (confidence_score, populated-field count) is
    used as the base; locations, identifiers, digital platforms and
    collections from every record are merged in, and a provenance note
    documents the merge.

    Fixes over the previous version:
    - uses sorted() instead of list.sort(), so the caller's list is not
      reordered in place;
    - copies the nested provenance dict before annotating it, so the base
      input record is no longer mutated through the shallow copy;
    - merges all collected locations directly (previously a lone location on
      a lesser-ranked record could be dropped via an off-by-one slice when
      the base record had no locations of its own).

    Parameters:
        institutions: one or more records sharing the same name; must be
            non-empty. A single-element list is returned as-is.

    Returns:
        The merged record (a new dict when merging occurred).
    """
    if len(institutions) == 1:
        return institutions[0]

    def score_completeness(inst):
        # Rank by confidence score, then by number of populated fields.
        confidence = inst.get('provenance', {}).get('confidence_score', 0.5)
        field_count = sum(1 for v in inst.values() if v is not None)
        return (confidence, field_count)

    ranked = sorted(institutions, key=score_completeness, reverse=True)

    # Use the best record as base.
    merged = ranked[0].copy()

    def collect(field):
        # Gather a list-valued field from every record, best-ranked first.
        gathered = []
        for inst in ranked:
            if inst.get(field):
                gathered.extend(inst[field])
        return gathered

    # Locations collapse to the single most complete one across all records.
    all_locations = collect('locations')
    merged['locations'] = merge_locations(all_locations[:1], all_locations[1:])

    all_identifiers = collect('identifiers')
    if all_identifiers:
        merged['identifiers'] = merge_identifiers(all_identifiers, [])

    all_platforms = collect('digital_platforms')
    if all_platforms:
        merged['digital_platforms'] = merge_platforms(all_platforms, [])

    all_collections = collect('collections')
    if all_collections:
        merged['collections'] = merge_collections(all_collections, [])

    # Annotate provenance on a fresh dict so the shallow copy above does not
    # leak the note back into the original input record.
    merged['provenance'] = dict(merged.get('provenance') or {})
    original_notes = merged['provenance'].get('notes', '')
    merge_note = f"Merged {len(institutions)} duplicate records on {datetime.now(timezone.utc).isoformat()}"
    merged['provenance']['notes'] = (
        f"{original_notes}; {merge_note}" if original_notes else merge_note
    )

    return merged
|
||||
|
||||
def deduplicate_brazilian_institutions(input_file: str, output_file: str) -> Dict[str, Any]:
    """Deduplicate Brazilian institutions in the dataset.

    Loads the dataset, merges same-named Brazilian records, writes the
    deduplicated dataset to *output_file*, and returns a summary report dict.
    """

    print(f"Loading {input_file}...")
    data = load_yaml(input_file)

    print(f"Total institutions: {len(data)}")

    # Partition into Brazilian / non-Brazilian in a single pass.
    brazilian: List[Dict[str, Any]] = []
    non_brazilian: List[Dict[str, Any]] = []
    for inst in data:
        (brazilian if is_brazilian(inst) else non_brazilian).append(inst)

    print(f"Brazilian institutions: {len(brazilian)}")
    print(f"Non-Brazilian institutions: {len(non_brazilian)}")

    # Group Brazilian institutions by exact name.
    # NOTE(review): records without a 'name' are silently dropped here —
    # confirm that this is intended.
    by_name = defaultdict(list)
    for inst in brazilian:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)

    duplicates = {name: group for name, group in by_name.items() if len(group) > 1}
    print(f"\nFound {len(duplicates)} duplicate names")

    # Merge each duplicate group; singletons pass through untouched.
    deduplicated_brazilian = []
    merged_count = 0
    for name, group in by_name.items():
        if len(group) == 1:
            deduplicated_brazilian.append(group[0])
            continue
        print(f"  Merging {len(group)}x: {name}")
        deduplicated_brazilian.append(merge_institutions(group))
        merged_count += len(group) - 1

    deduplicated_data = non_brazilian + deduplicated_brazilian

    print(f"\nDeduplication complete:")
    print(f"  Original Brazilian institutions: {len(brazilian)}")
    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
    print(f"  Records merged: {merged_count}")
    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")

    print(f"\nSaving to {output_file}...")
    save_yaml(output_file, deduplicated_data)

    return {
        'total_original': len(data),
        'total_deduplicated': len(deduplicated_data),
        'brazilian_original': len(brazilian),
        'brazilian_deduplicated': len(deduplicated_brazilian),
        'records_merged': merged_count,
        'duplicates_found': len(duplicates),
        'duplicate_names': sorted(duplicates.keys()),
    }
|
||||
|
||||
if __name__ == '__main__':
    # Deduplication is performed in place: input and output are the same
    # file, so the backup created below is the only pre-dedup copy.
    input_file = 'data/instances/all/globalglam-20251111.yaml'
    output_file = 'data/instances/all/globalglam-20251111.yaml'
    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'

    import shutil
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)

    report = deduplicate_brazilian_institutions(input_file, output_file)

    # Render the Markdown deduplication report.
    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
    lines = [
        "# Brazilian Institutions Deduplication Report\n\n",
        f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n",
        "## Summary\n\n",
        f"- Original dataset: {report['total_original']} institutions\n",
        f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n",
        f"- Brazilian institutions (original): {report['brazilian_original']}\n",
        f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n",
        f"- Records merged: {report['records_merged']}\n",
        f"- Duplicate names found: {report['duplicates_found']}\n\n",
        "## Duplicate Names\n\n",
    ]
    lines.extend(f"- {name}\n" for name in report['duplicate_names'])

    with open(report_file, 'w') as f:
        f.writelines(lines)

    print(f"\n✅ Report saved to {report_file}")
    print("\nDeduplication complete!")
|
||||
Loading…
Reference in a new issue