#!/usr/bin/env python3
"""
Compare ISIL registry and Dutch organizations datasets.

Identifies overlaps, unique records, and data quality insights.
"""

from collections import defaultdict
from pathlib import Path
from typing import Any, Iterable

from glam_extractor.parsers.isil_registry import ISILRegistryParser
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser


def _has_isil(custodian: Any) -> bool:
    """Return True if the custodian carries at least one ISIL identifier."""
    return any(
        identifier.identifier_scheme == "ISIL"
        for identifier in custodian.identifiers
    )


def _isil_names_by_code(custodians: Iterable[Any]) -> dict:
    """Map each ISIL identifier value to the name of the custodian carrying it.

    When the same code appears on more than one custodian, the last one
    iterated wins.
    """
    names_by_code = {}
    for custodian in custodians:
        for identifier in custodian.identifiers:
            if identifier.identifier_scheme == "ISIL":
                names_by_code[identifier.identifier_value] = custodian.name
    return names_by_code


def _collect_cities(custodians: Iterable[Any]) -> set:
    """Return the set of non-empty city values across all custodian locations."""
    cities = set()
    for custodian in custodians:
        for location in custodian.locations:
            if location.city:
                cities.add(location.city)
    return cities


def _count_institution_types(custodians: Iterable[Any]) -> dict:
    """Return a mapping of institution type to the number of custodians."""
    counts = defaultdict(int)
    for custodian in custodians:
        counts[custodian.institution_type] += 1
    return counts


def _print_type_counts(counts: dict) -> None:
    """Print one line per institution type, most frequent first."""
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for inst_type, count in ranked:
        print(f" {inst_type:20} {count:4d}")


def main() -> None:
    """Load both datasets and print a comparison report to stdout.

    The report covers ISIL code overlap, geographic coverage, institution
    type distribution, data quality insights and recommendations. Input
    paths are fixed, relative to the current working directory.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    print("=" * 70)
    print("DUTCH HERITAGE DATASETS COMPARISON")
    print("=" * 70)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()

    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)

    print(f"✅ ISIL Registry: {len(isil_custodians):4d} institutions")
    print(f"✅ Dutch Organizations: {len(dutch_custodians):4d} institutions")
    print()

    # Extract ISIL codes from both datasets
    isil_codes_from_registry = _isil_names_by_code(isil_custodians)
    isil_codes_from_orgs = _isil_names_by_code(dutch_custodians)

    print("🔍 ISIL Code Analysis:")
    print("-" * 70)
    print(f"ISIL codes in registry: {len(isil_codes_from_registry):4d}")
    print(f"ISIL codes in organizations: {len(isil_codes_from_orgs):4d}")
    print()

    # Find overlaps; dict key views support set algebra directly
    registry_codes = isil_codes_from_registry.keys()
    orgs_codes = isil_codes_from_orgs.keys()
    common_isil_codes = registry_codes & orgs_codes
    only_in_registry = registry_codes - orgs_codes
    only_in_orgs = orgs_codes - registry_codes

    print("📊 ISIL Code Overlap:")
    print("-" * 70)
    print(f"Common ISIL codes: {len(common_isil_codes):4d}")
    print(f"Only in registry: {len(only_in_registry):4d}")
    print(f"Only in organizations: {len(only_in_orgs):4d}")
    print()

    # Calculate overlap percentage (skipped for an empty side to avoid
    # dividing by zero)
    if isil_codes_from_registry:
        registry_overlap = (len(common_isil_codes) / len(isil_codes_from_registry)) * 100
        print(f"Registry overlap: {registry_overlap:.1f}%")

    if isil_codes_from_orgs:
        orgs_overlap = (len(common_isil_codes) / len(isil_codes_from_orgs)) * 100
        print(f"Organizations overlap: {orgs_overlap:.1f}%")
    print()

    # Show sample overlapping institutions (first ten codes in sorted order)
    if common_isil_codes:
        print("📝 Sample Overlapping Institutions:")
        print("-" * 70)
        for i, isil_code in enumerate(sorted(common_isil_codes)[:10], 1):
            registry_name = isil_codes_from_registry[isil_code]
            orgs_name = isil_codes_from_orgs[isil_code]
            # "✓" marks an exact name match, "≈" a same-code name difference
            match_indicator = "✓" if registry_name == orgs_name else "≈"
            print(f"{i:2d}. {isil_code}")
            print(f" Registry: {registry_name}")
            print(f" Orgs: {orgs_name} {match_indicator}")
        print()

    # Geography comparison
    print("🌍 Geographic Coverage:")
    print("-" * 70)

    isil_cities = _collect_cities(isil_custodians)
    dutch_cities = _collect_cities(dutch_custodians)

    print(f"Cities in ISIL registry: {len(isil_cities):4d}")
    print(f"Cities in organizations: {len(dutch_cities):4d}")
    print(f"Common cities: {len(isil_cities & dutch_cities):4d}")
    print()

    # Institution type comparison
    print("🏛️ Institution Types:")
    print("-" * 70)

    isil_types = _count_institution_types(isil_custodians)
    dutch_types = _count_institution_types(dutch_custodians)

    print("ISIL Registry:")
    _print_type_counts(isil_types)

    print("\nDutch Organizations:")
    _print_type_counts(dutch_types)
    print()

    # Data quality insights
    print("✨ Data Quality Insights:")
    print("-" * 70)

    # Organizations with both ISIL and platforms
    with_both = sum(
        1 for c in dutch_custodians
        if _has_isil(c) and c.digital_platforms
    )
    print(f"Organizations with ISIL + platforms: {with_both:4d}")

    # ISIL records missing from orgs dataset
    print(f"ISIL records not in orgs dataset: {len(only_in_registry):4d}")
    print(" → Potential for enrichment")

    # Orgs without ISIL codes: count the custodians themselves. Subtracting
    # the number of distinct codes from the number of organizations miscounts
    # whenever one organization has several ISIL identifiers or several
    # organizations share the same code.
    orgs_without_isil = sum(1 for c in dutch_custodians if not _has_isil(c))
    print(f"Organizations without ISIL codes: {orgs_without_isil:4d}")
    print(" → Candidates for ISIL assignment")
    print()

    # Recommendations
    print("💡 Recommendations:")
    print("-" * 70)
    print("1. Merge datasets using ISIL codes as primary key")
    print("2. Enrich ISIL registry records with platform data from organizations")
    print("3. Review name mismatches in overlapping records")
    print("4. Consider geocoding both datasets for spatial analysis")
    print(f"5. {orgs_without_isil} organizations may benefit from ISIL code assignment")
    print()

    print("✅ Comparison complete!")


if __name__ == "__main__":
    main()