#!/usr/bin/env python3
"""
Apply Collision Resolution to Dutch Datasets

This script:
1. Parses Dutch ISIL registry (364 institutions)
2. Parses Dutch organizations CSV (1,351 institutions)
3. Deduplicates combined dataset to remove true duplicates
4. Generates GHCIDs for all institutions
5. Applies collision detection and resolution
6. Exports merged dataset with collision statistics
7. Generates detailed collision analysis report

Usage:
    python scripts/apply_collision_resolution_dutch_datasets.py

Output:
    - data/dutch_institutions_with_ghcids.yaml - Full dataset with resolved GHCIDs
    - data/dutch_collision_report.txt - Detailed collision analysis
    - data/dutch_collision_stats.json - Machine-readable statistics
    - data/dutch_deduplication_report.txt - Deduplication details
"""

import json
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

import yaml

from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
from glam_extractor.identifiers.ghcid import GHCIDGenerator, InstitutionType
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
from glam_extractor.models import HeritageCustodian
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.parsers.isil_registry import ISILRegistryParser


def generate_ghcid_for_institution(
    institution: HeritageCustodian, generator: GHCIDGenerator
) -> None:
    """
    Generate GHCID for a HeritageCustodian instance.

    Updates the institution in-place with ghcid, ghcid_uuid, ghcid_uuid_sha256
    and ghcid_numeric. Prints a warning and leaves the institution untouched
    when no city is available or no components can be resolved for it.

    Args:
        institution: HeritageCustodian to update
        generator: GHCIDGenerator instance
    """
    # A city is required to resolve GHCID components; skip otherwise.
    if not institution.locations or not institution.locations[0].city:
        print(f"Warning: No city for {institution.name}, skipping GHCID generation")
        return

    city = institution.locations[0].city

    # Get GHCID components for this city
    try:
        component_dict = get_ghcid_components_for_dutch_city(
            city=city,
            institution_name=institution.name,
            institution_type=InstitutionType[institution.institution_type],
        )
    except (ValueError, KeyError) as e:
        # Unknown city / institution type: report and continue with next record.
        print(
            f"Warning: Could not generate GHCID components for {institution.name} "
            f"in {city}: {e}"
        )
        return

    if not component_dict:
        print(f"Warning: No GHCID components for {institution.name} in {city}")
        return

    # Generate GHCID components
    components = generator.generate(
        institution_name=component_dict["institution_name"],
        english_name=component_dict["english_name"],
        institution_type=InstitutionType[institution.institution_type],
        country_code=component_dict["country_code"],
        region_code=component_dict["region_code"],
        city_locode=component_dict["city_locode"],
    )

    # Update institution with all identifier formats
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()


def analyze_collisions(
    institutions: List[HeritageCustodian],
) -> Dict[str, Any]:
    """
    Analyze resolved institutions and generate collision statistics.

    Args:
        institutions: List of resolved HeritageCustodian objects

    Returns:
        Dictionary of statistics
    """
    stats: Dict[str, Any] = {
        "total_institutions": len(institutions),
        "institutions_with_collisions": 0,
        "collision_groups": 0,
        "first_batch_groups": 0,
        "historical_addition_groups": 0,
        "q_numbers_added": 0,
        "wikidata_q_numbers": 0,
        "synthetic_q_numbers": 0,
        "ghcid_changes": 0,
        "collisions_by_city": defaultdict(int),
        "collisions_by_type": defaultdict(int),
        "largest_collision_group": 0,
    }

    # Group by base GHCID (without Q-number)
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid = inst.ghcid.split("-Q")[0]
            base_ghcid_groups[base_ghcid].append(inst)

    # Analyze collision groups
    for base_ghcid, group in base_ghcid_groups.items():
        if len(group) > 1:
            stats["collision_groups"] += 1
            stats["largest_collision_group"] = max(
                stats["largest_collision_group"], len(group)
            )

            # Determine if first batch or historical addition:
            # a single shared extraction date means all members collided
            # in the same batch; multiple dates mean a later addition.
            extraction_dates = [inst.provenance.extraction_date for inst in group]
            unique_dates = set(d.date() for d in extraction_dates)

            if len(unique_dates) == 1:
                stats["first_batch_groups"] += 1
            else:
                stats["historical_addition_groups"] += 1

            # Count institutions in this collision
            for inst in group:
                stats["institutions_with_collisions"] += 1

                # Track city and type
                if inst.locations and inst.locations[0].city:
                    stats["collisions_by_city"][inst.locations[0].city] += 1
                stats["collisions_by_type"][inst.institution_type] += 1

                # Check if Q-number was added
                if inst.ghcid and "-Q" in inst.ghcid:
                    stats["q_numbers_added"] += 1

                    # Check if Wikidata or synthetic
                    if any(
                        i.identifier_scheme == "Wikidata"
                        for i in (inst.identifiers or [])
                    ):
                        stats["wikidata_q_numbers"] += 1
                    else:
                        stats["synthetic_q_numbers"] += 1

                # Check if GHCID changed (has history)
                if inst.ghcid_history and len(inst.ghcid_history) > 1:
                    stats["ghcid_changes"] += 1

    return stats


def generate_collision_report(
    institutions: List[HeritageCustodian], stats: Dict[str, Any]
) -> str:
    """
    Generate human-readable collision analysis report.

    Args:
        institutions: List of resolved HeritageCustodian objects
        stats: Statistics dictionary from analyze_collisions()

    Returns:
        Multi-line string report
    """
    # Guard against division by zero when the dataset is empty.
    total = stats["total_institutions"]
    collision_pct = (
        stats["institutions_with_collisions"] / total * 100 if total else 0.0
    )

    lines = [
        "=" * 80,
        "GHCID Collision Analysis Report - Dutch Heritage Institutions",
        "=" * 80,
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "OVERVIEW",
        "-" * 80,
        f"Total institutions processed: {stats['total_institutions']:,}",
        f"Institutions with collisions: {stats['institutions_with_collisions']:,} "
        f"({collision_pct:.1f}%)",
        f"Collision groups detected: {stats['collision_groups']:,}",
        f"  - First batch collisions: {stats['first_batch_groups']:,}",
        f"  - Historical additions: {stats['historical_addition_groups']:,}",
        f"Largest collision group: {stats['largest_collision_group']} institutions",
        "",
        "Q-NUMBER ASSIGNMENT",
        "-" * 80,
        f"Q-numbers added: {stats['q_numbers_added']:,}",
        f"  - From Wikidata: {stats['wikidata_q_numbers']:,}",
        f"  - Synthetic (generated): {stats['synthetic_q_numbers']:,}",
        f"GHCID changes tracked: {stats['ghcid_changes']:,}",
        "",
        "COLLISIONS BY CITY",
        "-" * 80,
    ]

    # Sort cities by collision count
    cities_sorted = sorted(
        stats["collisions_by_city"].items(), key=lambda x: x[1], reverse=True
    )
    for city, count in cities_sorted[:20]:  # Top 20 cities
        lines.append(f"  {city:<30} {count:>4} institutions")

    lines.extend([
        "",
        "COLLISIONS BY INSTITUTION TYPE",
        "-" * 80,
    ])

    # Sort types by collision count
    types_sorted = sorted(
        stats["collisions_by_type"].items(), key=lambda x: x[1], reverse=True
    )
    for inst_type, count in types_sorted:
        lines.append(f"  {inst_type:<30} {count:>4} institutions")

    lines.extend([
        "",
        "DETAILED COLLISION GROUPS",
        "-" * 80,
        "",
    ])

    # Group by base GHCID for detailed listing
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid = inst.ghcid.split("-Q")[0]
            base_ghcid_groups[base_ghcid].append(inst)

    # Show collision groups (2+ institutions per base GHCID)
    collision_groups = [
        (base_ghcid, group)
        for base_ghcid, group in base_ghcid_groups.items()
        if len(group) > 1
    ]
    collision_groups.sort(key=lambda x: len(x[1]), reverse=True)

    for i, (base_ghcid, group) in enumerate(collision_groups[:50], 1):  # Top 50 groups
        lines.append(f"{i}. Base GHCID: {base_ghcid}")
        lines.append(f"   {len(group)} institutions:")
        for inst in group:
            q_suffix = ""
            if inst.ghcid and "-Q" in inst.ghcid:
                q_suffix = f" → {inst.ghcid}"
            city = inst.locations[0].city if inst.locations else "Unknown"
            lines.append(
                f"   - {inst.name} ({city}){q_suffix}"
            )
        lines.append("")

    lines.append("=" * 80)

    return "\n".join(lines)


def _institution_to_dict(inst: HeritageCustodian) -> Dict[str, Any]:
    """Serialize one HeritageCustodian into a plain dict for YAML export."""
    return {
        "id": inst.id,
        "name": inst.name,
        "institution_type": inst.institution_type,
        "ghcid": inst.ghcid,
        "ghcid_uuid": inst.ghcid_uuid,
        "ghcid_uuid_sha256": inst.ghcid_uuid_sha256,
        "ghcid_numeric": inst.ghcid_numeric,
        "identifiers": [
            {
                "identifier_scheme": i.identifier_scheme,
                "identifier_value": i.identifier_value,
                "identifier_url": str(i.identifier_url) if i.identifier_url else None,
            }
            for i in (inst.identifiers or [])
        ],
        "locations": [
            {
                "city": loc.city,
                "country": loc.country,
                "street_address": loc.street_address,
            }
            for loc in (inst.locations or [])
        ],
        "provenance": {
            "data_source": inst.provenance.data_source,
            "data_tier": inst.provenance.data_tier,
            "extraction_date": inst.provenance.extraction_date.isoformat(),
            "confidence_score": inst.provenance.confidence_score,
        },
        "ghcid_history": [
            {
                "ghcid": entry.ghcid,
                "ghcid_numeric": entry.ghcid_numeric,
                "valid_from": entry.valid_from.isoformat(),
                "valid_to": entry.valid_to.isoformat() if entry.valid_to else None,
                "reason": entry.reason,
            }
            for entry in (inst.ghcid_history or [])
        ] if inst.ghcid_history else [],
    }


def main():
    """Parse, deduplicate, assign GHCIDs, resolve collisions, and export reports."""
    print("GHCID Collision Resolution - Dutch Datasets")
    print("=" * 80)

    # Initialize parsers and detector
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    detector = GHCIDCollisionDetector()
    generator = GHCIDGenerator()

    # Parse ISIL registry
    print("\n1. Parsing ISIL registry...")
    isil_csv = Path("data/ISIL-codes_2025-08-01.csv")
    if not isil_csv.exists():
        print(f"ERROR: ISIL registry not found at {isil_csv}")
        return

    isil_institutions = isil_parser.parse_and_convert(isil_csv)
    print(f"   Loaded {len(isil_institutions):,} institutions from ISIL registry")

    # Parse Dutch organizations
    print("\n2. Parsing Dutch organizations CSV...")
    dutch_csv = Path(
        "data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv"
    )
    if not dutch_csv.exists():
        print(f"ERROR: Dutch organizations CSV not found at {dutch_csv}")
        return

    dutch_institutions = dutch_parser.parse_and_convert(dutch_csv)
    print(f"   Loaded {len(dutch_institutions):,} institutions from Dutch orgs CSV")

    # Combine datasets
    all_institutions = isil_institutions + dutch_institutions
    print(f"\n3. Combined dataset: {len(all_institutions):,} institutions")

    # Deduplicate institutions before GHCID generation
    print("\n4. Deduplicating institutions...")
    deduplicator = InstitutionDeduplicator()
    deduplicated_institutions = deduplicator.deduplicate(
        all_institutions, merge_metadata=True
    )
    duplicates_removed = len(all_institutions) - len(deduplicated_institutions)
    print(f"   Removed {duplicates_removed:,} duplicates")
    print(f"   {len(deduplicated_institutions):,} unique institutions remain")

    # Generate deduplication report
    dedup_report_lines = [
        "Dutch Dataset Deduplication Report",
        "=" * 80,
        f"\nGenerated: {datetime.now(timezone.utc).isoformat()}",
        f"\nTotal institutions (before deduplication): {len(all_institutions):,}",
        f"Unique institutions (after deduplication): {len(deduplicated_institutions):,}",
        f"Duplicates removed: {duplicates_removed:,}",
        f"\nDuplicate groups detected: {len(deduplicator.duplicate_groups)}",
        "\n" + "=" * 80,
        "\nDuplicate Groups:\n"
    ]
    for i, group in enumerate(deduplicator.duplicate_groups, 1):
        dedup_report_lines.append(f"\nGroup {i} ({len(group)} duplicates):")
        for inst in group:
            city = inst.locations[0].city if inst.locations else "Unknown"
            tier = inst.provenance.data_tier if inst.provenance else "Unknown"
            dedup_report_lines.append(f"  - {inst.name} ({city}) [{tier}]")
    dedup_report = "\n".join(dedup_report_lines)

    # Use deduplicated set for GHCID generation
    all_institutions = deduplicated_institutions

    # Generate GHCIDs for all institutions
    print("\n5. Generating GHCIDs...")
    successful_ghcids = 0
    for inst in all_institutions:
        if not inst.ghcid:  # Only generate if not already present
            generate_ghcid_for_institution(inst, generator)
        if inst.ghcid:
            successful_ghcids += 1

    print(f"   Generated GHCIDs for {successful_ghcids:,} institutions")

    # Filter institutions with GHCIDs for collision detection
    institutions_with_ghcids = [inst for inst in all_institutions if inst.ghcid]
    print(
        f"   {len(institutions_with_ghcids):,} institutions have GHCIDs "
        f"({len(all_institutions) - len(institutions_with_ghcids):,} without)"
    )

    # Detect and resolve collisions
    print("\n6. Resolving collisions...")
    resolved_institutions = detector.resolve_collisions(institutions_with_ghcids)
    print(f"   Resolved {len(resolved_institutions):,} institutions")

    # Analyze collisions
    print("\n7. Analyzing collision patterns...")
    stats = analyze_collisions(resolved_institutions)

    # Generate report
    print("\n8. Generating collision report...")
    report = generate_collision_report(resolved_institutions, stats)

    # Export results
    output_dir = Path("data")
    output_dir.mkdir(exist_ok=True)

    # Export merged dataset
    print("\n9. Exporting results...")

    # Convert to dict for YAML serialization
    institutions_dict = [
        _institution_to_dict(inst) for inst in resolved_institutions
    ]

    # allow_unicode keeps Dutch names readable instead of \uXXXX escapes;
    # explicit utf-8 encoding avoids platform-default codec failures.
    yaml_path = output_dir / "dutch_institutions_with_ghcids.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(
            institutions_dict,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
    print(f"   ✓ Exported dataset to {yaml_path}")

    # Export collision report (contains non-ASCII arrows/check marks)
    report_path = output_dir / "dutch_collision_report.txt"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"   ✓ Exported collision report to {report_path}")

    # Export statistics
    stats_path = output_dir / "dutch_collision_stats.json"
    # Convert defaultdict to regular dict for JSON serialization
    stats_serializable = {
        k: dict(v) if isinstance(v, defaultdict) else v for k, v in stats.items()
    }
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats_serializable, f, indent=2, default=str)
    print(f"   ✓ Exported statistics to {stats_path}")

    # Export deduplication report
    dedup_report_path = output_dir / "dutch_deduplication_report.txt"
    with open(dedup_report_path, "w", encoding="utf-8") as f:
        f.write(dedup_report)
    print(f"   ✓ Exported deduplication report to {dedup_report_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Initial institutions: {len(isil_institutions) + len(dutch_institutions):,}")
    print(f"Duplicates removed: {duplicates_removed:,}")
    print(f"Unique institutions: {stats['total_institutions']:,}")
    print(f"Collision groups: {stats['collision_groups']:,}")
    print(f"Q-numbers added: {stats['q_numbers_added']:,}")
    print(f"  - Wikidata: {stats['wikidata_q_numbers']:,}")
    print(f"  - Synthetic: {stats['synthetic_q_numbers']:,}")
    print("\nOutput files:")
    print(f"  - {yaml_path}")
    print(f"  - {report_path}")
    print(f"  - {stats_path}")
    print(f"  - {dedup_report_path}")
    print("=" * 80)


if __name__ == "__main__":
    main()