glam/scripts/analyze_ghcid_collisions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

167 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Analyze GHCID Collisions in Japan Dataset
Investigates why 2,573 Japanese institutions have colliding GHCIDs.
This represents 21.3% data loss during deduplication - needs investigation.
Author: GLAM Data Extraction Project
Date: 2025-11-07
"""
import yaml
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Dict, Any
def load_yaml_dataset(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML dataset of institution records from *file_path*.

    Args:
        file_path: Path to a YAML file containing a list of mappings.

    Returns:
        The parsed list of records. PyYAML's ``safe_load`` yields ``None``
        for an empty (or all-comment) document, so that case is normalized
        to ``[]`` to honor the declared return type and keep callers that
        iterate the result from crashing.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or []
def _get_city(inst: Dict[str, Any]) -> str:
    """Return the city of the institution's first location, or 'UNKNOWN'.

    Robust to both a missing ``locations`` key and a present-but-empty
    locations list (``inst.get('locations', [{}])[0]`` raises IndexError
    in the empty-list case, because the default only covers a missing key).
    """
    locations = inst.get('locations') or [{}]
    return locations[0].get('city', 'UNKNOWN')


def _get_isil(inst: Dict[str, Any]) -> str:
    """Return the institution's ISIL identifier value, or 'NO-ISIL'."""
    return next(
        (ident['identifier_value']
         for ident in inst.get('identifiers', [])
         if ident.get('identifier_scheme') == 'ISIL'),
        'NO-ISIL',
    )


def analyze_collisions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyze GHCID collision patterns.

    Groups institutions by their ``ghcid`` field, prints a human-readable
    collision report to stdout (totals, size distribution, largest groups,
    per-city hotspots, type breakdown, sample details), and returns a
    machine-readable summary.

    Args:
        institutions: Institution records; entries without a ``ghcid``
            cannot collide and are skipped.

    Returns:
        Dict with totals, collision-size distribution, per-city data
        loss, and institution-type distribution across collisions
        (plain dicts, suitable for YAML serialization).
    """
    # Group by GHCID.
    ghcid_groups: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for inst in institutions:
        ghcid = inst.get('ghcid')
        if ghcid:
            ghcid_groups[ghcid].append(inst)

    # Collisions = GHCIDs claimed by more than one institution.
    collisions = {
        ghcid: insts
        for ghcid, insts in ghcid_groups.items()
        if len(insts) > 1
    }

    total_affected = sum(len(insts) for insts in collisions.values())
    # Deduplication keeps one record per GHCID, so a collision group of
    # size k loses k - 1 institutions.
    data_loss = sum(len(insts) - 1 for insts in collisions.values())

    print(f"Total institutions: {len(institutions):,}")
    print(f"Unique GHCIDs: {len(ghcid_groups):,}")
    print(f"Colliding GHCIDs: {len(collisions):,}")
    print(f"Total institutions affected: {total_affected:,}")
    print(f"Data loss: {data_loss:,} institutions")

    # Distribution of collision sizes (how many GHCIDs are shared by k records).
    collision_sizes = Counter(len(insts) for insts in collisions.values())
    print("\nCollision size distribution:")
    for size in sorted(collision_sizes.keys(), reverse=True):
        count = collision_sizes[size]
        print(f" {size} institutions sharing same GHCID: {count} cases")

    # Show the 20 GHCIDs shared by the most institutions.
    print("\nTop 20 Largest Collisions:")
    print('=' * 80)
    largest_collisions = sorted(
        collisions.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:20]
    for ghcid, insts in largest_collisions:
        print(f"\nGHCID: {ghcid} ({len(insts)} institutions)")
        print(" Common pattern:")
        # Show at most the first 5 members of each collision group.
        for i, inst in enumerate(insts[:5], 1):
            name = inst.get('name', 'UNKNOWN')
            print(f" {i}. {name} ({_get_city(inst)}) - ISIL: {_get_isil(inst)}")
        if len(insts) > 5:
            print(f" ... and {len(insts) - 5} more")

    # Aggregate data loss per city; the whole group's loss is attributed
    # to the city of the group's first record.
    print("\n" + '=' * 80)
    print("Collision Hotspots (Cities with Most Collisions):")
    print('=' * 80)
    city_collisions: Dict[str, int] = defaultdict(int)
    for insts in collisions.values():
        city_collisions[_get_city(insts[0])] += len(insts) - 1  # excess records
    for city, loss in sorted(city_collisions.items(), key=lambda item: item[1], reverse=True)[:20]:
        print(f" {city}: {loss} institutions lost to collisions")

    # Breakdown of institution types across all colliding records.
    print("\n" + '=' * 80)
    print("GHCID Pattern Analysis:")
    print('=' * 80)
    type_counter = Counter(
        inst.get('institution_type', 'UNKNOWN')
        for insts in collisions.values()
        for inst in insts
    )
    print("\nInstitution types affected by collisions:")
    for inst_type, count in type_counter.most_common():
        print(f" {inst_type}: {count:,}")

    # Full field dump for a handful of groups, to aid manual debugging.
    print("\n" + '=' * 80)
    print("Sample Collision Details (First 10):")
    print('=' * 80)
    for ghcid, insts in list(collisions.items())[:10]:
        print(f"\nGHCID: {ghcid}")
        for inst in insts:
            print(f" - Name: {inst.get('name', 'UNKNOWN')}")
            print(f" Abbreviation: {inst.get('name_abbreviation', 'NO-ABBREV')}")
            print(f" City: {_get_city(inst)}")
            print(f" ISIL: {_get_isil(inst)}")

    return {
        'total_institutions': len(institutions),
        'unique_ghcids': len(ghcid_groups),
        'colliding_ghcids': len(collisions),
        'total_affected': total_affected,
        'data_loss': data_loss,
        'collision_sizes': dict(collision_sizes),
        'city_hotspots': dict(city_collisions),
        'type_distribution': dict(type_counter),
    }
def main(base_path: Path = Path('/Users/kempersc/apps/glam')) -> None:
    """Run the GHCID collision analysis on the Japan dataset.

    Loads ``data/instances/japan/jp_institutions.yaml`` under *base_path*,
    prints the collision report, and writes the summary statistics next to
    the input as ``ghcid_collision_analysis.yaml``.

    Args:
        base_path: Root of the GLAM data tree. Defaults to the original
            hard-coded checkout location for backward compatibility, but
            callers can now point the analysis at any other checkout.
    """
    # Load Japan dataset.
    japan_file = base_path / 'data/instances/japan/jp_institutions.yaml'
    print(f"Loading {japan_file.name}...\n")
    institutions = load_yaml_dataset(japan_file)

    # Analyze collision patterns and collect summary statistics.
    stats = analyze_collisions(institutions)

    # Persist the machine-readable summary alongside the input dataset.
    output_file = base_path / 'data/instances/japan/ghcid_collision_analysis.yaml'
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats, f, allow_unicode=True, default_flow_style=False)
    print(f"\n✅ Analysis saved to {output_file}")


if __name__ == '__main__':
    main()