glam/deduplicate_brazilian_institutions.py
kempersc 5e9f54bd91 Deduplicate Brazilian institutions (212→121)
- Merged 91 duplicate Brazilian institution records
- Improved Wikidata coverage from 26.4% to 38.8% (+12.4pp)
- Created intelligent merge strategy:
  - Prefer records with higher confidence scores
  - Merge locations (prefer most complete)
  - Combine all unique identifiers
  - Combine all unique digital platforms
  - Combine all unique collections
- Added provenance notes documenting merges
- Created backup before deduplication
- Generated comprehensive deduplication report

Dataset changes:
- Total institutions: 13,502 → 13,411
- Brazilian institutions: 212 → 121
- Coverage: 47/121 institutions with Q-numbers (38.8%)
2025-11-11 22:08:34 +01:00

#!/usr/bin/env python3
"""
Deduplicate Brazilian institutions in the global dataset.

Strategy:
1. Group Brazilian institutions by name.
2. For each group with duplicates:
   - Merge location data (prefer the most complete record)
   - Merge identifiers (combine all unique identifiers)
   - Merge digital_platforms (combine all unique platforms)
   - Merge collections (combine all unique collections)
   - Prefer the record with the higher confidence_score
   - Update provenance to note the merge
3. Write the deduplicated dataset.
"""
import shutil
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List

import yaml
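
# The dataset is a YAML list of institution records. The fields this script
# reads look roughly like the sketch below (hypothetical example values, not
# taken from the dataset):
#
#   - name: Museu Exemplo
#     locations:
#       - country: BR
#         city: São Paulo
#     identifiers:
#       - identifier_scheme: wikidata
#         identifier_value: Q0000000
#     digital_platforms:
#       - platform_url: https://example.org/acervo
#     collections:
#       - collection_name: Example collection
#     provenance:
#       confidence_score: 0.9
#       notes: ...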

def load_yaml(filepath: str) -> List[Dict[str, Any]]:
    """Load a YAML file, returning [] if the file is empty."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []

def save_yaml(filepath: str, data: List[Dict[str, Any]]) -> None:
    """Save data to a YAML file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)

def is_brazilian(inst: Dict[str, Any]) -> bool:
    """Check whether an institution has at least one location in Brazil."""
    locations = inst.get('locations') or []
    return any(loc.get('country') == 'BR' for loc in locations)
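
# For example, with toy records:
#   is_brazilian({'locations': [{'country': 'BR'}]})   -> True
#   is_brazilian({'locations': [{'country': 'PT'}]})   -> False
#   is_brazilian({'locations': None})                  -> False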

def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
    """Merge location lists, keeping the single most complete location."""
    if not loc1:
        return loc2 or []
    if not loc2:
        return loc1

    # Completeness = number of non-null fields
    def location_completeness(loc):
        return sum(1 for v in loc.values() if v is not None)

    best_loc = max(loc1 + loc2, key=location_completeness)
    return [best_loc]
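
# Illustration with toy locations -- the side with more non-null fields wins:
#   merge_locations([{'country': 'BR', 'city': None}],
#                   [{'country': 'BR', 'city': 'Recife'}])
#   -> [{'country': 'BR', 'city': 'Recife'}]
# Once both sides are non-empty, the result is always a single location.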

def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
    """Merge identifier lists, removing duplicates."""
    all_ids = (id1 or []) + (id2 or [])
    # Deduplicate by (scheme, value), keeping the first occurrence
    unique_ids = {}
    for ident in all_ids:
        key = (ident.get('identifier_scheme'), ident.get('identifier_value'))
        if key not in unique_ids:
            unique_ids[key] = ident
    return list(unique_ids.values())
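
# Illustration with toy identifiers -- duplicates are keyed on (scheme, value):
#   merge_identifiers(
#       [{'identifier_scheme': 'wikidata', 'identifier_value': 'Q1'}],
#       [{'identifier_scheme': 'wikidata', 'identifier_value': 'Q1'},
#        {'identifier_scheme': 'isil', 'identifier_value': 'BR-1'}])
#   -> the 'wikidata' entry once, plus the 'isil' entry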

def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
    """Merge platform lists, removing duplicates."""
    all_platforms = (plat1 or []) + (plat2 or [])
    # Deduplicate by platform_url, keeping the first occurrence;
    # entries without a platform_url are dropped
    unique_platforms = {}
    for plat in all_platforms:
        url = plat.get('platform_url')
        if url and url not in unique_platforms:
            unique_platforms[url] = plat
    return list(unique_platforms.values())

def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
    """Merge collection lists, removing duplicates."""
    all_collections = (coll1 or []) + (coll2 or [])
    # Deduplicate by collection_name, keeping the first occurrence;
    # entries without a collection_name are dropped
    unique_collections = {}
    for coll in all_collections:
        name = coll.get('collection_name')
        if name and name not in unique_collections:
            unique_collections[name] = coll
    return list(unique_collections.values())
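
# merge_platforms and merge_collections follow the same first-wins pattern,
# keyed on platform_url and collection_name respectively. E.g. with toy data:
#   merge_collections([{'collection_name': 'Fotografias'}],
#                     [{'collection_name': 'Fotografias', 'size': 100}])
#   -> [{'collection_name': 'Fotografias'}]   (first occurrence kept)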

def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge multiple institution records into one."""
    if len(institutions) == 1:
        return institutions[0]

    # Rank by confidence score (higher first), then by field completeness
    def score_completeness(inst):
        confidence = (inst.get('provenance') or {}).get('confidence_score', 0.5)
        field_count = sum(1 for v in inst.values() if v is not None)
        return (confidence, field_count)

    institutions = sorted(institutions, key=score_completeness, reverse=True)

    # Use the best record as the base; copy its provenance dict so the
    # merge note below does not mutate the source record
    merged = institutions[0].copy()
    merged['provenance'] = dict(merged.get('provenance') or {})

    # Merge locations: keep the most complete location across all records
    other_locations = []
    for inst in institutions[1:]:
        if inst.get('locations'):
            other_locations.extend(inst['locations'])
    merged['locations'] = merge_locations(merged.get('locations') or [],
                                          other_locations)

    # Merge identifiers from all records
    all_identifiers = []
    for inst in institutions:
        if inst.get('identifiers'):
            all_identifiers.extend(inst['identifiers'])
    if all_identifiers:
        merged['identifiers'] = merge_identifiers([], all_identifiers)

    # Merge digital platforms from all records
    all_platforms = []
    for inst in institutions:
        if inst.get('digital_platforms'):
            all_platforms.extend(inst['digital_platforms'])
    if all_platforms:
        merged['digital_platforms'] = merge_platforms([], all_platforms)

    # Merge collections from all records
    all_collections = []
    for inst in institutions:
        if inst.get('collections'):
            all_collections.extend(inst['collections'])
    if all_collections:
        merged['collections'] = merge_collections([], all_collections)

    # Record the merge in provenance
    original_notes = merged['provenance'].get('notes', '')
    merge_note = (f"Merged {len(institutions)} duplicate records on "
                  f"{datetime.now(timezone.utc).isoformat()}")
    if original_notes:
        merged['provenance']['notes'] = f"{original_notes}; {merge_note}"
    else:
        merged['provenance']['notes'] = merge_note
    return merged
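
# End-to-end sketch with two toy duplicates: the higher-confidence record
# serves as the base, identifiers/platforms/collections are unioned, the most
# complete location is kept, and a note like
#   "Merged 2 duplicate records on 2025-11-11T21:08:34+00:00"
# is appended to the surviving record's provenance notes.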

def deduplicate_brazilian_institutions(input_file: str, output_file: str) -> Dict[str, Any]:
    """Deduplicate Brazilian institutions in the dataset; return a summary report."""
    print(f"Loading {input_file}...")
    data = load_yaml(input_file)
    print(f"Total institutions: {len(data)}")

    # Separate Brazilian and non-Brazilian institutions
    brazilian = [inst for inst in data if is_brazilian(inst)]
    non_brazilian = [inst for inst in data if not is_brazilian(inst)]
    print(f"Brazilian institutions: {len(brazilian)}")
    print(f"Non-Brazilian institutions: {len(non_brazilian)}")

    # Group Brazilian institutions by name; keep unnamed records as-is
    # so that nothing is dropped from the dataset
    by_name = defaultdict(list)
    unnamed = []
    for inst in brazilian:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)
        else:
            unnamed.append(inst)

    # Find duplicates
    duplicates = {name: insts for name, insts in by_name.items() if len(insts) > 1}
    print(f"\nFound {len(duplicates)} duplicate names")

    # Merge duplicates
    deduplicated_brazilian = list(unnamed)
    merged_count = 0
    for name, institutions in by_name.items():
        if len(institutions) > 1:
            print(f"  Merging {len(institutions)}x: {name}")
            deduplicated_brazilian.append(merge_institutions(institutions))
            merged_count += len(institutions) - 1
        else:
            deduplicated_brazilian.append(institutions[0])

    # Recombine with the untouched non-Brazilian institutions
    deduplicated_data = non_brazilian + deduplicated_brazilian
    print("\nDeduplication complete:")
    print(f"  Original Brazilian institutions: {len(brazilian)}")
    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
    print(f"  Records merged: {merged_count}")
    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")

    # Save the deduplicated dataset
    print(f"\nSaving to {output_file}...")
    save_yaml(output_file, deduplicated_data)

    # Summary report for the caller
    return {
        'total_original': len(data),
        'total_deduplicated': len(deduplicated_data),
        'brazilian_original': len(brazilian),
        'brazilian_deduplicated': len(deduplicated_brazilian),
        'records_merged': merged_count,
        'duplicates_found': len(duplicates),
        'duplicate_names': sorted(duplicates.keys()),
    }

if __name__ == '__main__':
    input_file = 'data/instances/all/globalglam-20251111.yaml'
    output_file = 'data/instances/all/globalglam-20251111.yaml'
    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'

    # Back up the dataset first, since it is rewritten in place
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)

    # Run deduplication
    report = deduplicate_brazilian_institutions(input_file, output_file)

    # Write the Markdown report
    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Brazilian Institutions Deduplication Report\n\n")
        f.write(f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- Original dataset: {report['total_original']} institutions\n")
        f.write(f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n")
        f.write(f"- Brazilian institutions (original): {report['brazilian_original']}\n")
        f.write(f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n")
        f.write(f"- Records merged: {report['records_merged']}\n")
        f.write(f"- Duplicate names found: {report['duplicates_found']}\n\n")
        f.write("## Duplicate Names\n\n")
        for name in report['duplicate_names']:
            f.write(f"- {name}\n")

    print(f"\n✅ Report saved to {report_file}")
    print("\nDeduplication complete!")