#!/usr/bin/env python3
"""
Analyze GHCID Collisions in the Japan Dataset

Investigates why 2,573 Japanese institutions have colliding GHCIDs.
This amounts to 21.3% data loss during deduplication and needs
investigation.

Author: GLAM Data Extraction Project
Date: 2025-11-07
"""

import yaml
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Dict, Any


def load_yaml_dataset(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML dataset from file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def first_city(inst: Dict[str, Any]) -> str:
    """Return the city of the institution's first location, or 'UNKNOWN'.

    Guards against a missing or empty 'locations' list.
    """
    locations = inst.get('locations') or [{}]
    return locations[0].get('city', 'UNKNOWN')


def first_isil(inst: Dict[str, Any]) -> str:
    """Return the institution's first ISIL identifier, or 'NO-ISIL'."""
    return next(
        (ident.get('identifier_value', 'NO-ISIL')
         for ident in inst.get('identifiers', [])
         if ident.get('identifier_scheme') == 'ISIL'),
        'NO-ISIL',
    )


def analyze_collisions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyze GHCID collision patterns."""
    # Group institutions by GHCID
    ghcid_groups = defaultdict(list)
    for inst in institutions:
        ghcid = inst.get('ghcid')
        if ghcid:
            ghcid_groups[ghcid].append(inst)

    # Find collisions: GHCIDs shared by more than one institution
    collisions = {
        ghcid: insts
        for ghcid, insts in ghcid_groups.items()
        if len(insts) > 1
    }

    total_affected = sum(len(insts) for insts in collisions.values())
    # Deduplication keeps one institution per GHCID and drops the rest
    data_loss = sum(len(insts) - 1 for insts in collisions.values())

    print(f"Total institutions: {len(institutions):,}")
    print(f"Unique GHCIDs: {len(ghcid_groups):,}")
    print(f"Colliding GHCIDs: {len(collisions):,}")
    print(f"Total institutions affected: {total_affected:,}")
    print(f"Data loss: {data_loss:,} institutions")

    # Analyze collision sizes
    collision_sizes = Counter(len(insts) for insts in collisions.values())
    print("\nCollision size distribution:")
    for size in sorted(collision_sizes, reverse=True):
        count = collision_sizes[size]
        print(f"  {size} institutions sharing the same GHCID: {count} cases")

    # Show examples of the largest collisions
    print("\nTop 20 Largest Collisions:")
    print("=" * 80)
    largest_collisions = sorted(
        collisions.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:20]

    for ghcid, insts in largest_collisions:
        print(f"\nGHCID: {ghcid} ({len(insts)} institutions)")
        print("  Common pattern:")
        # Show the first 5 institutions in each group
        for i, inst in enumerate(insts[:5], 1):
            name = inst.get('name', 'UNKNOWN')
            print(f"    {i}. {name} ({first_city(inst)}) - ISIL: {first_isil(inst)}")
        if len(insts) > 5:
            print(f"    ... and {len(insts) - 5} more")

    # Analyze by prefecture/city
    print(f"\n{'=' * 80}")
    print("Collision Hotspots (Cities with Most Collisions):")
    print("=" * 80)

    city_collisions = defaultdict(int)
    for insts in collisions.values():
        # Use the first institution's city as representative for the group
        city = first_city(insts[0])
        city_collisions[city] += len(insts) - 1  # count only the excess institutions

    hotspots = sorted(city_collisions.items(), key=lambda item: item[1], reverse=True)
    for city, loss in hotspots[:20]:
        print(f"  {city}: {loss} institutions lost to collisions")

    # Analyze GHCID structure patterns
    print(f"\n{'=' * 80}")
    print("GHCID Pattern Analysis:")
    print("=" * 80)

    # Count institution types involved in collisions
    type_counter = Counter()
    for insts in collisions.values():
        for inst in insts:
            type_counter[inst.get('institution_type', 'UNKNOWN')] += 1

    print("\nInstitution types affected by collisions:")
    for inst_type, count in type_counter.most_common():
        print(f"  {inst_type}: {count:,}")

    # Sample collision details for debugging
    print(f"\n{'=' * 80}")
    print("Sample Collision Details (First 10):")
    print("=" * 80)

    for ghcid, insts in list(collisions.items())[:10]:
        print(f"\nGHCID: {ghcid}")
        for inst in insts:
            print(f"  - Name: {inst.get('name', 'UNKNOWN')}")
            print(f"    Abbreviation: {inst.get('name_abbreviation', 'NO-ABBREV')}")
            print(f"    City: {first_city(inst)}")
            print(f"    ISIL: {first_isil(inst)}")

    return {
        'total_institutions': len(institutions),
        'unique_ghcids': len(ghcid_groups),
        'colliding_ghcids': len(collisions),
        'total_affected': total_affected,
        'data_loss': data_loss,
        'collision_sizes': dict(collision_sizes),
        'city_hotspots': dict(city_collisions),
        'type_distribution': dict(type_counter),
    }


def main():
    """Main execution."""
    base_path = Path('/Users/kempersc/apps/glam')

    # Load the Japan dataset
    japan_file = base_path / 'data/instances/japan/jp_institutions.yaml'
    print(f"Loading {japan_file.name}...\n")
    institutions = load_yaml_dataset(japan_file)

    # Analyze collision patterns
    stats = analyze_collisions(institutions)

    # Save the analysis alongside the dataset
    output_file = base_path / 'data/instances/japan/ghcid_collision_analysis.yaml'
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats, f, allow_unicode=True, default_flow_style=False)

    print(f"\n✅ Analysis saved to {output_file}")


if __name__ == '__main__':
    main()
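
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the record shape this script assumes for
# each entry in jp_institutions.yaml. The field names are taken from the
# accessors above; the values are hypothetical placeholders, not real data,
# and the actual schema may carry additional fields.
#
#   - ghcid: JP-TOKYO-LIB-0001            # hypothetical example value
#     name: Example Municipal Library     # hypothetical example value
#     name_abbreviation: EML
#     institution_type: library
#     locations:
#       - city: Tokyo
#     identifiers:
#       - identifier_scheme: ISIL
#         identifier_value: JP-1000001    # hypothetical example value
# ---------------------------------------------------------------------------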