glam/compare_dutch_datasets.py
2025-11-19 23:25:22 +01:00

163 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Compare ISIL registry and Dutch organizations datasets.
Identifies overlaps, unique records, and data quality insights.
"""
from pathlib import Path
from glam_extractor.parsers.isil_registry import ISILRegistryParser
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from collections import defaultdict
def _has_isil(custodian):
    """Return True if *custodian* has at least one identifier whose scheme is "ISIL"."""
    # `or ()` tolerates a custodian whose identifiers attribute is None.
    return any(
        identifier.identifier_scheme == "ISIL"
        for identifier in (custodian.identifiers or ())
    )


def _isil_codes_by_name(custodians):
    """Map every ISIL identifier value found on *custodians* to the custodian name.

    If the same ISIL code occurs on several custodians, the last one seen wins.
    """
    return {
        identifier.identifier_value: custodian.name
        for custodian in custodians
        for identifier in (custodian.identifiers or ())
        if identifier.identifier_scheme == "ISIL"
    }


def _count_without_isil(custodians):
    """Count the custodians that carry no ISIL identifier at all."""
    return sum(1 for custodian in custodians if not _has_isil(custodian))


def _cities(custodians):
    """Return the set of non-empty city values across all custodian locations."""
    return {
        location.city
        for custodian in custodians
        for location in (custodian.locations or ())
        if location.city
    }


def _type_counts(custodians):
    """Return ``(institution_type, count)`` pairs, most frequent first.

    Ties keep first-seen order because the sort is stable.
    """
    counts = defaultdict(int)
    for custodian in custodians:
        counts[custodian.institution_type] += 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def _print_type_counts(custodians):
    """Print one aligned line per institution type found on *custodians*."""
    for inst_type, count in _type_counts(custodians):
        # A non-empty format spec on None raises TypeError, so label it explicitly.
        label = "UNKNOWN" if inst_type is None else inst_type
        print(f" {label:20} {count:4d}")


def main():
    """Print a comparison report of the ISIL registry and Dutch organizations datasets.

    Both CSV files are read from hard-coded paths under ``data/`` and parsed with
    ``ISILRegistryParser`` / ``DutchOrgsParser``. The report, written to stdout,
    covers ISIL code overlap, a sample of overlapping institutions, city
    coverage, institution type counts, data quality figures and a fixed list of
    recommendations.

    Returns:
        None.

    Note:
        Errors raised by the parsers (for example for a missing input file) are
        not handled here and propagate to the caller.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")
    banner = "=" * 70
    rule = "-" * 70

    print(banner)
    print("DUTCH HERITAGE DATASETS COMPARISON")
    print(banner)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)
    print(f"✅ ISIL Registry: {len(isil_custodians):4d} institutions")
    print(f"✅ Dutch Organizations: {len(dutch_custodians):4d} institutions")
    print()

    # Extract ISIL codes from both datasets
    isil_codes_from_registry = _isil_codes_by_name(isil_custodians)
    isil_codes_from_orgs = _isil_codes_by_name(dutch_custodians)
    print("🔍 ISIL Code Analysis:")
    print(rule)
    print(f"ISIL codes in registry: {len(isil_codes_from_registry):4d}")
    print(f"ISIL codes in organizations: {len(isil_codes_from_orgs):4d}")
    print()

    # Find overlaps; dict key views support set algebra directly.
    registry_keys = isil_codes_from_registry.keys()
    orgs_keys = isil_codes_from_orgs.keys()
    common_isil_codes = registry_keys & orgs_keys
    only_in_registry = registry_keys - orgs_keys
    only_in_orgs = orgs_keys - registry_keys
    print("📊 ISIL Code Overlap:")
    print(rule)
    print(f"Common ISIL codes: {len(common_isil_codes):4d}")
    print(f"Only in registry: {len(only_in_registry):4d}")
    print(f"Only in organizations: {len(only_in_orgs):4d}")
    print()

    # Calculate overlap percentage (skipped for an empty side to avoid dividing by zero)
    if isil_codes_from_registry:
        registry_overlap = (len(common_isil_codes) / len(isil_codes_from_registry)) * 100
        print(f"Registry overlap: {registry_overlap:.1f}%")
    if isil_codes_from_orgs:
        orgs_overlap = (len(common_isil_codes) / len(isil_codes_from_orgs)) * 100
        print(f"Organizations overlap: {orgs_overlap:.1f}%")
    print()

    # Show sample overlapping institutions
    if common_isil_codes:
        print("📝 Sample Overlapping Institutions:")
        print(rule)
        for i, isil_code in enumerate(sorted(common_isil_codes)[:10], 1):
            registry_name = isil_codes_from_registry[isil_code]
            orgs_name = isil_codes_from_orgs[isil_code]
            # Both branches used to be empty strings, so mismatches were invisible.
            match_indicator = "✓" if registry_name == orgs_name else "✗"
            print(f"{i:2d}. {isil_code}")
            print(f" Registry: {registry_name}")
            print(f" Orgs: {orgs_name} {match_indicator}")
        print()

    # Geography comparison
    print("🌍 Geographic Coverage:")
    print(rule)
    isil_cities = _cities(isil_custodians)
    dutch_cities = _cities(dutch_custodians)
    print(f"Cities in ISIL registry: {len(isil_cities):4d}")
    print(f"Cities in organizations: {len(dutch_cities):4d}")
    print(f"Common cities: {len(isil_cities & dutch_cities):4d}")
    print()

    # Institution type comparison
    print("🏛️ Institution Types:")
    print(rule)
    print("ISIL Registry:")
    _print_type_counts(isil_custodians)
    print("\nDutch Organizations:")
    _print_type_counts(dutch_custodians)
    print()

    # Data quality insights
    print("✨ Data Quality Insights:")
    print(rule)

    # Organizations with both ISIL and platforms
    with_both = sum(
        1 for c in dutch_custodians
        if _has_isil(c) and c.digital_platforms
    )
    print(f"Organizations with ISIL + platforms: {with_both:4d}")

    # ISIL records missing from orgs dataset
    print(f"ISIL records not in orgs dataset: {len(only_in_registry):4d}")
    print(" → Potential for enrichment")

    # Orgs without ISIL codes: count custodians, not distinct codes. Subtracting
    # the number of distinct codes miscounts when a custodian has several codes
    # or several custodians share one.
    orgs_without_isil = _count_without_isil(dutch_custodians)
    print(f"Organizations without ISIL codes: {orgs_without_isil:4d}")
    print(" → Candidates for ISIL assignment")
    print()

    # Recommendations
    print("💡 Recommendations:")
    print(rule)
    print("1. Merge datasets using ISIL codes as primary key")
    print("2. Enrich ISIL registry records with platform data from organizations")
    print("3. Review name mismatches in overlapping records")
    print("4. Consider geocoding both datasets for spatial analysis")
    print(f"5. {orgs_without_isil} organizations may benefit from ISIL code assignment")
    print()
    print("✅ Comparison complete!")
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()