- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
167 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze GHCID Collisions in Japan Dataset
|
|
|
|
Investigates why 2,573 Japanese institutions have colliding GHCIDs.
|
|
This represents 21.3% data loss during deduplication - needs investigation.
|
|
|
|
Author: GLAM Data Extraction Project
|
|
Date: 2025-11-07
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import Counter, defaultdict
|
|
from typing import List, Dict, Any
|
|
|
|
|
|
def load_yaml_dataset(file_path: Path) -> List[Dict[str, Any]]:
    """Read and parse a YAML dataset file.

    Args:
        file_path: Path to the YAML file on disk.

    Returns:
        The parsed YAML content — for this project, a list of
        institution records (dicts).
    """
    # Path.open defaults to text mode; force UTF-8 regardless of locale.
    with file_path.open(encoding='utf-8') as stream:
        return yaml.safe_load(stream)
|
|
|
|
|
|
def _get_city(inst: Dict[str, Any]) -> str:
    """Return the first-location city of *inst*, or 'UNKNOWN' if unavailable."""
    # `or [{}]` also covers the case where 'locations' is present but an
    # empty list — a plain .get() default would raise IndexError on [0].
    locations = inst.get('locations') or [{}]
    return locations[0].get('city', 'UNKNOWN')


def _get_isil(inst: Dict[str, Any]) -> str:
    """Return the ISIL identifier value of *inst*, or 'NO-ISIL' if none."""
    return next(
        (ident['identifier_value']
         for ident in inst.get('identifiers', [])
         if ident.get('identifier_scheme') == 'ISIL'),
        'NO-ISIL',
    )


def analyze_collisions(institutions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyze GHCID collision patterns.

    Groups the records by their 'ghcid' field, prints a human-readable
    collision report to stdout (size distribution, largest collisions,
    per-city hotspots, affected institution types, sample details), and
    returns a summary dict suitable for YAML serialization.

    Args:
        institutions: Institution records as loaded from the YAML dataset.
            Records without a 'ghcid' are ignored.

    Returns:
        Dict with keys: total_institutions, unique_ghcids,
        colliding_ghcids, total_affected, data_loss, collision_sizes,
        city_hotspots, type_distribution.
    """
    # Group by GHCID
    ghcid_groups: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for inst in institutions:
        ghcid = inst.get('ghcid')
        if ghcid:
            ghcid_groups[ghcid].append(inst)

    # Find collisions (GHCIDs with multiple institutions)
    collisions = {
        ghcid: insts
        for ghcid, insts in ghcid_groups.items()
        if len(insts) > 1
    }

    print(f"Total institutions: {len(institutions):,}")
    print(f"Unique GHCIDs: {len(ghcid_groups):,}")
    print(f"Colliding GHCIDs: {len(collisions):,}")
    print(f"Total institutions affected: {sum(len(insts) for insts in collisions.values()):,}")
    # Each collision keeps one record; the rest are dropped during dedup.
    print(f"Data loss: {sum(len(insts) - 1 for insts in collisions.values()):,} institutions")

    # Analyze collision sizes
    collision_sizes = Counter(len(insts) for insts in collisions.values())
    print(f"\nCollision size distribution:")
    for size in sorted(collision_sizes.keys(), reverse=True):
        count = collision_sizes[size]
        print(f" {size} institutions sharing same GHCID: {count} cases")

    # Show examples of largest collisions
    print(f"\nTop 20 Largest Collisions:")
    print(f"{'='*80}")

    largest_collisions = sorted(
        collisions.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:20]

    for ghcid, insts in largest_collisions:
        print(f"\nGHCID: {ghcid} ({len(insts)} institutions)")
        print(f" Common pattern:")
        # Show first 5 institutions
        for i, inst in enumerate(insts[:5], 1):
            name = inst.get('name', 'UNKNOWN')
            city = _get_city(inst)
            isil = _get_isil(inst)
            print(f" {i}. {name} ({city}) - ISIL: {isil}")
        if len(insts) > 5:
            print(f" ... and {len(insts) - 5} more")

    # Analyze by prefecture/city
    print(f"\n{'='*80}")
    print("Collision Hotspots (Cities with Most Collisions):")
    print(f"{'='*80}")

    city_collisions: Dict[str, int] = defaultdict(int)
    for ghcid, insts in collisions.items():
        # Attribute the whole collision to the first record's city.
        city_collisions[_get_city(insts[0])] += len(insts) - 1  # Count excess institutions

    for city, loss in sorted(city_collisions.items(), key=lambda x: x[1], reverse=True)[:20]:
        print(f" {city}: {loss} institutions lost to collisions")

    # Analyze GHCID structure patterns
    print(f"\n{'='*80}")
    print("GHCID Pattern Analysis:")
    print(f"{'='*80}")

    # Count institution types in collisions
    type_counter: Counter = Counter()
    for insts in collisions.values():
        for inst in insts:
            type_counter[inst.get('institution_type', 'UNKNOWN')] += 1

    print("\nInstitution types affected by collisions:")
    for inst_type, count in type_counter.most_common():
        print(f" {inst_type}: {count:,}")

    # Sample collision details for debugging
    print(f"\n{'='*80}")
    print("Sample Collision Details (First 10):")
    print(f"{'='*80}")

    for ghcid, insts in list(collisions.items())[:10]:
        print(f"\nGHCID: {ghcid}")
        for inst in insts:
            name = inst.get('name', 'UNKNOWN')
            name_abbrev = inst.get('name_abbreviation', 'NO-ABBREV')
            print(f" - Name: {name}")
            print(f" Abbreviation: {name_abbrev}")
            print(f" City: {_get_city(inst)}")
            print(f" ISIL: {_get_isil(inst)}")

    return {
        'total_institutions': len(institutions),
        'unique_ghcids': len(ghcid_groups),
        'colliding_ghcids': len(collisions),
        'total_affected': sum(len(insts) for insts in collisions.values()),
        'data_loss': sum(len(insts) - 1 for insts in collisions.values()),
        'collision_sizes': dict(collision_sizes),
        'city_hotspots': dict(city_collisions),
        'type_distribution': dict(type_counter),
    }
|
|
|
|
|
|
def main():
    """Entry point: load the Japan dataset, run the collision analysis, and persist the stats."""
    base_path = Path('/Users/kempersc/apps/glam')
    japan_dir = base_path / 'data/instances/japan'

    # Load Japan dataset
    japan_file = japan_dir / 'jp_institutions.yaml'
    print(f"Loading {japan_file.name}...\n")

    # Analyze (prints a full report and returns a summary dict)
    stats = analyze_collisions(load_yaml_dataset(japan_file))

    # Save analysis next to the dataset
    output_file = japan_dir / 'ghcid_collision_analysis.yaml'
    with open(output_file, 'w', encoding='utf-8') as out:
        yaml.dump(stats, out, allow_unicode=True, default_flow_style=False)

    print(f"\n✅ Analysis saved to {output_file}")
|
|
|
|
|
|
# Run the analysis only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|