glam/scripts/deduplicate_mexican_institutions.py
2025-11-19 23:25:22 +01:00

200 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Deduplicate Mexican Heritage Institutions
Problem: Mexican institutions appear twice in the dataset:
- First entry: Missing GHCID (value is None or 'N/A')
- Second entry: Has proper GHCID assigned
This script:
1. Identifies duplicate Mexican institutions by name
2. Merges records, preferring the one with GHCID
3. Removes GHCID=N/A duplicates
4. Preserves 26 unique institutions that only have GHCID entries
5. Exports deduplicated dataset
Author: GLAM Data Extraction Project
Date: 2025-11-13
"""
import yaml
from typing import List, Dict, Any
from collections import defaultdict
def load_institutions(filepath: str) -> List[Dict[str, Any]]:
    """Load the institution list from a YAML file.

    Args:
        filepath: Path to the YAML file to read (UTF-8).

    Returns:
        The parsed list of institution records. Returns an empty list for
        an empty file: ``yaml.safe_load`` yields ``None`` there, which
        would otherwise violate the declared return type and break
        iteration in the caller.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []
def is_mexican_institution(inst: Dict[str, Any]) -> bool:
    """Return True if any of the institution's locations is in Mexico (country 'MX')."""
    for location in inst.get('locations', []):
        if location.get('country') == 'MX':
            return True
    return False
def _merge_keyed_list(base, extra, key):
    """Return a copy of ``base`` extended with items of ``extra`` whose
    ``key(item)`` is not already present. Neither input is mutated."""
    result = base.copy()
    seen = {key(item) for item in result}
    for item in extra:
        item_key = key(item)
        if item_key not in seen:
            result.append(item)
            seen.add(item_key)
    return result


def merge_institution_records(without_ghcid: Dict[str, Any],
                              with_ghcid: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two institution records, preferring with_ghcid version.

    Strategy:
    - Use with_ghcid as base (has proper GHCID)
    - Add any missing fields from without_ghcid version
    - Merge lists (identifiers, collections, digital platforms,
      alternative names) without duplicates

    Args:
        without_ghcid: Duplicate record lacking a proper GHCID.
        with_ghcid: Record carrying the proper GHCID; wins on conflicts.

    Returns:
        A new merged record (shallow copy); the inputs are not mutated.
    """
    merged = with_ghcid.copy()

    # Identifiers are unique by (scheme, value) pair.
    identifiers = _merge_keyed_list(
        with_ghcid.get('identifiers', []),
        without_ghcid.get('identifiers', []),
        lambda i: (i.get('identifier_scheme'), i.get('identifier_value')),
    )
    if identifiers:
        merged['identifiers'] = identifiers

    # Collections are unique by collection name.
    collections = _merge_keyed_list(
        with_ghcid.get('collections', []),
        without_ghcid.get('collections', []),
        lambda c: c.get('collection_name'),
    )
    if collections:
        merged['collections'] = collections

    # Digital platforms are unique by platform name.
    platforms = _merge_keyed_list(
        with_ghcid.get('digital_platforms', []),
        without_ghcid.get('digital_platforms', []),
        lambda p: p.get('platform_name'),
    )
    if platforms:
        merged['digital_platforms'] = platforms

    # Alternative names: set union, sorted for deterministic output.
    alt_names = set(with_ghcid.get('alternative_names', []))
    alt_names.update(without_ghcid.get('alternative_names', []))
    if alt_names:
        merged['alternative_names'] = sorted(alt_names)

    # Use description from with_ghcid, fallback to without_ghcid if missing.
    if not merged.get('description') and without_ghcid.get('description'):
        merged['description'] = without_ghcid['description']
    return merged
def deduplicate_mexican_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate Mexican institutions.

    Mexican records sharing a name are merged (the GHCID-bearing copy
    wins); non-Mexican records pass through untouched.

    Returns:
        List of deduplicated institutions (Mexican + all non-Mexican)
    """
    mexican: List[Dict[str, Any]] = []
    others: List[Dict[str, Any]] = []
    # Partition the dataset by country.
    for record in institutions:
        (mexican if is_mexican_institution(record) else others).append(record)
    print(f"Total institutions: {len(institutions)}")
    print(f"Mexican institutions: {len(mexican)}")
    print(f"Non-Mexican institutions: {len(others)}")

    # Bucket Mexican records by institution name.
    groups = defaultdict(list)
    for record in mexican:
        groups[record.get('name', 'Unknown')].append(record)
    print(f"Unique Mexican institution names: {len(groups)}")

    result_mexican: List[Dict[str, Any]] = []
    merged_count = 0
    kept_count = 0
    for name, group in groups.items():
        if len(group) == 1:
            # No duplicate under this name — keep as-is.
            result_mexican.append(group[0])
            kept_count += 1
            continue
        # Split the group into GHCID-bearing and GHCID-less copies.
        has_id = [r for r in group if r.get('ghcid') and r.get('ghcid') != 'N/A']
        no_id = [r for r in group if not r.get('ghcid') or r.get('ghcid') == 'N/A']
        if len(has_id) == 1 and len(no_id) == 1:
            # Standard duplicate pair: merge, preferring the GHCID record.
            result_mexican.append(merge_institution_records(no_id[0], has_id[0]))
            merged_count += 1
        elif len(has_id) == 1 and not no_id:
            # Only the GHCID version exists (unexpected inside a multi-entry group).
            result_mexican.append(has_id[0])
            kept_count += 1
        else:
            # Ambiguous grouping — keep everything for manual review.
            print(f"WARNING: Complex duplicate for '{name}': {len(has_id)} with GHCID, {len(no_id)} without")
            result_mexican.extend(group)
    print(f"\nDeduplication results:")
    print(f" Duplicates merged: {merged_count}")
    print(f" Unique institutions kept: {kept_count}")
    print(f" Total deduplicated Mexican institutions: {len(result_mexican)}")

    # Recombine with the untouched non-Mexican records.
    final = others + result_mexican
    print(f"\nFinal dataset size: {len(final)} (was {len(institutions)})")
    print(f"Reduction: {len(institutions) - len(final)} duplicates removed")
    return final
def main():
    """Main execution."""
    input_file = 'data/instances/all/globalglam-20251111-brazil-campaign-final.yaml'
    output_file = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    banner = "=" * 80
    print(banner)
    print("Mexican Heritage Institutions Deduplication")
    print(banner)
    print()

    # Load the input dataset.
    print(f"Loading: {input_file}")
    institutions = load_institutions(input_file)

    # Run the deduplication pass.
    deduplicated = deduplicate_mexican_institutions(institutions)

    # Write the deduplicated dataset back out as YAML.
    print(f"\nSaving deduplicated dataset: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(deduplicated, f, allow_unicode=True, sort_keys=False, width=120)
    print("\n✓ Deduplication complete!")
    print(f" Output: {output_file}")


if __name__ == '__main__':
    main()