- Merged 91 duplicate Brazilian institution records
- Improved Wikidata coverage from 26.4% to 38.8% (+12.4pp)
- Created intelligent merge strategy:
  - Prefer records with higher confidence scores
  - Merge locations (prefer most complete)
  - Combine all unique identifiers
  - Combine all unique digital platforms
  - Combine all unique collections
  - Add provenance notes documenting merges
- Create backup before deduplication
- Generate comprehensive deduplication report

Dataset changes:
- Total institutions: 13,502 → 13,411
- Brazilian institutions: 212 → 121
- Coverage: 47/121 institutions with Q-numbers (38.8%)
250 lines
9 KiB
Python
250 lines
9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Deduplicate Brazilian institutions in the global dataset.
|
|
|
|
Strategy:
|
|
1. Group Brazilian institutions by name
|
|
2. For each group with duplicates:
|
|
- Merge location data (prefer most complete)
|
|
- Merge identifiers (combine all unique identifiers)
|
|
- Merge digital_platforms (combine all unique platforms)
|
|
- Merge collections (combine all unique collections)
|
|
- Prefer record with higher confidence_score
|
|
- Update provenance to note the merge
|
|
3. Write deduplicated dataset
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
from typing import List, Dict, Any
|
|
import sys
|
|
|
|
def load_yaml(filepath: str) -> List[Dict[str, Any]]:
    """Parse a YAML file; an empty or null document yields an empty list."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = yaml.safe_load(f)
    return content or []
|
|
|
|
def save_yaml(filepath: str, data: List[Dict[str, Any]]):
    """Serialize *data* to a YAML file, keeping unicode and insertion order."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(
            data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
|
|
|
def is_brazilian(inst: Dict[str, Any]) -> bool:
    """Return True when any of the institution's locations has country 'BR'."""
    for location in inst.get('locations') or []:
        if location.get('country') == 'BR':
            return True
    return False
|
|
|
|
def merge_locations(loc1: List[Dict], loc2: List[Dict]) -> List[Dict]:
    """Reduce two location lists to the single most complete location.

    If either list is empty the other is returned unchanged; otherwise the
    location with the most non-None fields wins (ties keep the earlier one).
    """
    if not loc1:
        return loc2 or []
    if not loc2:
        return loc1

    def filled_fields(location: Dict) -> int:
        # Completeness = how many fields actually carry a value.
        return sum(value is not None for value in location.values())

    return [max(loc1 + loc2, key=filled_fields)]
|
|
|
|
def merge_identifiers(id1: List[Dict], id2: List[Dict]) -> List[Dict]:
    """Concatenate identifier lists, keeping the first entry per (scheme, value)."""
    seen: Dict[tuple, Dict] = {}
    for ident in (id1 or []) + (id2 or []):
        key = (ident.get('identifier_scheme'), ident.get('identifier_value'))
        # setdefault keeps the earliest occurrence, matching first-wins dedup.
        seen.setdefault(key, ident)
    return list(seen.values())
|
|
|
|
def merge_platforms(plat1: List[Dict], plat2: List[Dict]) -> List[Dict]:
    """Concatenate platform lists, keeping the first entry per platform_url.

    NOTE(review): entries with a missing/empty platform_url are dropped
    entirely — confirm that is intended.
    """
    by_url: Dict[str, Dict] = {}
    for platform in (plat1 or []) + (plat2 or []):
        url = platform.get('platform_url')
        if url:
            # First occurrence wins; later duplicates of the same URL are skipped.
            by_url.setdefault(url, platform)
    return list(by_url.values())
|
|
|
|
def merge_collections(coll1: List[Dict], coll2: List[Dict]) -> List[Dict]:
    """Concatenate collection lists, keeping the first entry per collection_name.

    NOTE(review): entries with a missing/empty collection_name are dropped —
    confirm that is intended.
    """
    by_name: Dict[str, Dict] = {}
    for collection in (coll1 or []) + (coll2 or []):
        name = collection.get('collection_name')
        if name:
            # First occurrence wins; later duplicates of the same name are skipped.
            by_name.setdefault(name, collection)
    return list(by_name.values())
|
|
|
|
def merge_institutions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge duplicate institution records into a single record.

    The record with the highest (confidence_score, count of non-None fields)
    is used as the base; locations, identifiers, digital platforms and
    collections from every record are merged in, and a provenance note
    documents the merge.  The input list and its records are not mutated.

    Args:
        institutions: Non-empty list of institution dicts sharing a name.

    Returns:
        A single merged institution dict (the sole element if only one given).
    """
    if len(institutions) == 1:
        return institutions[0]

    def score_completeness(inst: Dict[str, Any]) -> tuple:
        # Prefer higher confidence, then the record with more populated fields.
        # `or {}` also guards against provenance being explicitly None
        # (the old `inst.get('provenance', {})` crashed in that case).
        confidence = (inst.get('provenance') or {}).get('confidence_score', 0.5)
        field_count = sum(1 for v in inst.values() if v is not None)
        return (confidence, field_count)

    # sorted() instead of list.sort() so the caller's list is left untouched.
    ranked = sorted(institutions, key=score_completeness, reverse=True)

    # Use the best record as the base for the merge.
    merged = ranked[0].copy()

    # Gather list-valued fields from every record for deduplicated merging.
    all_locations: List[Dict] = []
    all_identifiers: List[Dict] = []
    all_platforms: List[Dict] = []
    all_collections: List[Dict] = []
    for inst in ranked:
        all_locations.extend(inst.get('locations') or [])
        all_identifiers.extend(inst.get('identifiers') or [])
        all_platforms.extend(inst.get('digital_platforms') or [])
        all_collections.extend(inst.get('collections') or [])

    # Pass the full gathered list: the previous code sliced off the first
    # element, which silently dropped a location whenever the base record
    # had no locations of its own.
    merged['locations'] = merge_locations(
        merged.get('locations') or [],
        all_locations,
    )
    if all_identifiers:
        merged['identifiers'] = merge_identifiers([], all_identifiers)
    if all_platforms:
        merged['digital_platforms'] = merge_platforms([], all_platforms)
    if all_collections:
        merged['collections'] = merge_collections([], all_collections)

    # Copy provenance before editing: `merged` is a shallow copy, so writing
    # through the original provenance dict would mutate the input record.
    provenance = dict(merged.get('provenance') or {})
    merge_note = f"Merged {len(institutions)} duplicate records on {datetime.now(timezone.utc).isoformat()}"
    original_notes = provenance.get('notes', '')
    if original_notes:
        provenance['notes'] = f"{original_notes}; {merge_note}"
    else:
        provenance['notes'] = merge_note
    merged['provenance'] = provenance

    return merged
|
|
|
|
def deduplicate_brazilian_institutions(input_file: str, output_file: str) -> Dict[str, Any]:
    """Deduplicate Brazilian institutions (grouped by exact name) in a dataset.

    Loads the YAML dataset, merges Brazilian records sharing the same name,
    writes the deduplicated dataset to *output_file*, and returns a summary
    report dict (counts plus the sorted list of duplicated names).
    """
    print(f"Loading {input_file}...")
    data = load_yaml(input_file)
    print(f"Total institutions: {len(data)}")

    # Split the dataset; only Brazilian records take part in deduplication.
    brazilian = [inst for inst in data if is_brazilian(inst)]
    non_brazilian = [inst for inst in data if not is_brazilian(inst)]
    print(f"Brazilian institutions: {len(brazilian)}")
    print(f"Non-Brazilian institutions: {len(non_brazilian)}")

    # Group by exact name.  Records without a name cannot be matched and are
    # kept as-is (previously they were silently dropped from the output).
    by_name = defaultdict(list)
    unnamed = []
    for inst in brazilian:
        name = inst.get('name')
        if name:
            by_name[name].append(inst)
        else:
            unnamed.append(inst)

    # Names that occur more than once are the merge candidates.
    duplicates = {name: insts for name, insts in by_name.items() if len(insts) > 1}
    print(f"\nFound {len(duplicates)} duplicate names")

    # Merge each duplicate group; singletons and unnamed records pass through.
    deduplicated_brazilian = list(unnamed)
    merged_count = 0
    for name, institutions in by_name.items():
        if len(institutions) > 1:
            print(f"  Merging {len(institutions)}x: {name}")
            deduplicated_brazilian.append(merge_institutions(institutions))
            merged_count += len(institutions) - 1
        else:
            deduplicated_brazilian.append(institutions[0])

    deduplicated_data = non_brazilian + deduplicated_brazilian

    print(f"\nDeduplication complete:")
    print(f"  Original Brazilian institutions: {len(brazilian)}")
    print(f"  Deduplicated Brazilian institutions: {len(deduplicated_brazilian)}")
    print(f"  Records merged: {merged_count}")
    print(f"  Total dataset size: {len(deduplicated_data)} (was {len(data)})")

    print(f"\nSaving to {output_file}...")
    save_yaml(output_file, deduplicated_data)

    return {
        'total_original': len(data),
        'total_deduplicated': len(deduplicated_data),
        'brazilian_original': len(brazilian),
        'brazilian_deduplicated': len(deduplicated_brazilian),
        'records_merged': merged_count,
        'duplicates_found': len(duplicates),
        'duplicate_names': sorted(duplicates.keys()),
    }
|
|
|
|
if __name__ == '__main__':
    # The dataset is rewritten in place, so take a backup copy first.
    input_file = 'data/instances/all/globalglam-20251111.yaml'
    output_file = 'data/instances/all/globalglam-20251111.yaml'
    backup_file = 'data/instances/all/globalglam-20251111.yaml.pre_dedup_backup'

    import shutil
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)

    # Run deduplication and collect the summary counts.
    report = deduplicate_brazilian_institutions(input_file, output_file)

    # Emit a human-readable Markdown summary of the run.
    report_file = 'data/instances/brazil/DEDUPLICATION_REPORT.md'
    report_lines = [
        "# Brazilian Institutions Deduplication Report\n\n",
        f"**Date**: {datetime.now(timezone.utc).isoformat()}\n\n",
        "## Summary\n\n",
        f"- Original dataset: {report['total_original']} institutions\n",
        f"- Deduplicated dataset: {report['total_deduplicated']} institutions\n",
        f"- Brazilian institutions (original): {report['brazilian_original']}\n",
        f"- Brazilian institutions (deduplicated): {report['brazilian_deduplicated']}\n",
        f"- Records merged: {report['records_merged']}\n",
        f"- Duplicate names found: {report['duplicates_found']}\n\n",
        "## Duplicate Names\n\n",
    ]
    report_lines.extend(f"- {name}\n" for name in report['duplicate_names'])
    with open(report_file, 'w') as f:
        f.writelines(report_lines)

    print(f"\n✅ Report saved to {report_file}")
    print("\nDeduplication complete!")