#!/usr/bin/env python3
"""
Deduplicate Mexican Heritage Institutions

Problem: Mexican institutions appear twice in the dataset:
- First entry: missing GHCID (value is None or 'N/A')
- Second entry: has a proper GHCID assigned

This script:
1. Identifies duplicate Mexican institutions by name
2. Merges records, preferring the one with a GHCID
3. Removes GHCID=N/A duplicates
4. Preserves 26 unique institutions that only have GHCID entries
5. Exports the deduplicated dataset

Author: GLAM Data Extraction Project
Date: 2025-11-13
"""

import yaml
from typing import List, Dict, Any
from collections import defaultdict

def load_institutions(filepath: str) -> List[Dict[str, Any]]:
|
|
"""Load institutions from YAML file."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
return yaml.safe_load(f)
|
|
|
|
|
|
def is_mexican_institution(inst: Dict[str, Any]) -> bool:
|
|
"""Check if institution is in Mexico."""
|
|
locations = inst.get('locations', [])
|
|
return any(loc.get('country') == 'MX' for loc in locations)
|
|
|
|
|
|
def merge_institution_records(without_ghcid: Dict[str, Any],
|
|
with_ghcid: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Merge two institution records, preferring with_ghcid version.
|
|
|
|
Strategy:
|
|
- Use with_ghcid as base (has proper GHCID)
|
|
- Add any missing fields from without_ghcid version
|
|
- Merge lists (identifiers, collections, etc.) without duplicates
|
|
"""
|
|
merged = with_ghcid.copy()
|
|
|
|
# Merge identifiers (avoid duplicates)
|
|
merged_identifiers = with_ghcid.get('identifiers', []).copy()
|
|
existing_ids = {
|
|
(i.get('identifier_scheme'), i.get('identifier_value'))
|
|
for i in merged_identifiers
|
|
}
|
|
|
|
for identifier in without_ghcid.get('identifiers', []):
|
|
key = (identifier.get('identifier_scheme'), identifier.get('identifier_value'))
|
|
if key not in existing_ids:
|
|
merged_identifiers.append(identifier)
|
|
existing_ids.add(key)
|
|
|
|
if merged_identifiers:
|
|
merged['identifiers'] = merged_identifiers
|
|
|
|
# Merge collections (avoid duplicates by name)
|
|
merged_collections = with_ghcid.get('collections', []).copy()
|
|
existing_collections = {c.get('collection_name') for c in merged_collections}
|
|
|
|
for collection in without_ghcid.get('collections', []):
|
|
if collection.get('collection_name') not in existing_collections:
|
|
merged_collections.append(collection)
|
|
existing_collections.add(collection.get('collection_name'))
|
|
|
|
if merged_collections:
|
|
merged['collections'] = merged_collections
|
|
|
|
# Merge digital_platforms (avoid duplicates by name)
|
|
merged_platforms = with_ghcid.get('digital_platforms', []).copy()
|
|
existing_platforms = {p.get('platform_name') for p in merged_platforms}
|
|
|
|
for platform in without_ghcid.get('digital_platforms', []):
|
|
if platform.get('platform_name') not in existing_platforms:
|
|
merged_platforms.append(platform)
|
|
existing_platforms.add(platform.get('platform_name'))
|
|
|
|
if merged_platforms:
|
|
merged['digital_platforms'] = merged_platforms
|
|
|
|
# Merge alternative_names (avoid duplicates)
|
|
merged_alt_names = set(with_ghcid.get('alternative_names', []))
|
|
merged_alt_names.update(without_ghcid.get('alternative_names', []))
|
|
if merged_alt_names:
|
|
merged['alternative_names'] = sorted(merged_alt_names)
|
|
|
|
# Use description from with_ghcid, fallback to without_ghcid if missing
|
|
if not merged.get('description') and without_ghcid.get('description'):
|
|
merged['description'] = without_ghcid['description']
|
|
|
|
return merged
|
|
|
|
|
|
def deduplicate_mexican_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Deduplicate Mexican institutions.
|
|
|
|
Returns:
|
|
List of deduplicated institutions (Mexican + all non-Mexican)
|
|
"""
|
|
mexican = []
|
|
non_mexican = []
|
|
|
|
# Separate Mexican from non-Mexican
|
|
for inst in institutions:
|
|
if is_mexican_institution(inst):
|
|
mexican.append(inst)
|
|
else:
|
|
non_mexican.append(inst)
|
|
|
|
print(f"Total institutions: {len(institutions)}")
|
|
print(f"Mexican institutions: {len(mexican)}")
|
|
print(f"Non-Mexican institutions: {len(non_mexican)}")
|
|
|
|
# Group Mexican institutions by name
|
|
by_name = defaultdict(list)
|
|
for inst in mexican:
|
|
name = inst.get('name', 'Unknown')
|
|
by_name[name].append(inst)
|
|
|
|
print(f"Unique Mexican institution names: {len(by_name)}")
|
|
|
|
# Process each name group
|
|
deduplicated_mexican = []
|
|
duplicates_merged = 0
|
|
unique_kept = 0
|
|
|
|
for name, instances in by_name.items():
|
|
if len(instances) == 1:
|
|
# Single instance - keep as-is
|
|
deduplicated_mexican.append(instances[0])
|
|
unique_kept += 1
|
|
else:
|
|
# Multiple instances - identify with/without GHCID
|
|
with_ghcid = [i for i in instances if i.get('ghcid') and i.get('ghcid') != 'N/A']
|
|
without_ghcid = [i for i in instances if not i.get('ghcid') or i.get('ghcid') == 'N/A']
|
|
|
|
if len(with_ghcid) == 1 and len(without_ghcid) == 1:
|
|
# Standard duplicate case: merge
|
|
merged = merge_institution_records(without_ghcid[0], with_ghcid[0])
|
|
deduplicated_mexican.append(merged)
|
|
duplicates_merged += 1
|
|
elif len(with_ghcid) == 1 and len(without_ghcid) == 0:
|
|
# Only has-GHCID version exists (should not happen in by_name grouping)
|
|
deduplicated_mexican.append(with_ghcid[0])
|
|
unique_kept += 1
|
|
else:
|
|
# Multiple GHCIDs or complex case - keep all (need manual review)
|
|
print(f"WARNING: Complex duplicate for '{name}': {len(with_ghcid)} with GHCID, {len(without_ghcid)} without")
|
|
deduplicated_mexican.extend(instances)
|
|
|
|
print(f"\nDeduplication results:")
|
|
print(f" Duplicates merged: {duplicates_merged}")
|
|
print(f" Unique institutions kept: {unique_kept}")
|
|
print(f" Total deduplicated Mexican institutions: {len(deduplicated_mexican)}")
|
|
|
|
# Combine deduplicated Mexican with non-Mexican
|
|
final = non_mexican + deduplicated_mexican
|
|
print(f"\nFinal dataset size: {len(final)} (was {len(institutions)})")
|
|
print(f"Reduction: {len(institutions) - len(final)} duplicates removed")
|
|
|
|
return final
|
|
|
|
|
|
def main():
|
|
"""Main execution."""
|
|
input_file = 'data/instances/all/globalglam-20251111-brazil-campaign-final.yaml'
|
|
output_file = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'
|
|
|
|
print("="*80)
|
|
print("Mexican Heritage Institutions Deduplication")
|
|
print("="*80)
|
|
print()
|
|
|
|
# Load data
|
|
print(f"Loading: {input_file}")
|
|
institutions = load_institutions(input_file)
|
|
|
|
# Deduplicate
|
|
deduplicated = deduplicate_mexican_institutions(institutions)
|
|
|
|
# Save
|
|
print(f"\nSaving deduplicated dataset: {output_file}")
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
yaml.dump(deduplicated, f, allow_unicode=True, sort_keys=False, width=120)
|
|
|
|
print("\n✓ Deduplication complete!")
|
|
print(f" Output: {output_file}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|