163 lines
6.1 KiB
Python
163 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compare ISIL registry and Dutch organizations datasets.
|
|
Identifies overlaps, unique records, and data quality insights.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from glam_extractor.parsers.isil_registry import ISILRegistryParser
|
|
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
|
|
from collections import defaultdict
|
|
|
|
_RULE_WIDTH = 70  # width of the "=" / "-" separator lines in the report


def _print_section(title):
    """Print a section title followed by a dashed rule."""
    print(title)
    print("-" * _RULE_WIDTH)


def _has_isil(custodian):
    """Return True if *custodian* carries at least one ISIL identifier."""
    # `or ()` tolerates a missing/None identifier list.
    # NOTE(review): whether the model can yield None here is not visible
    # in this file; the guard is a no-op when it is always a list.
    return any(
        identifier.identifier_scheme == "ISIL"
        for identifier in custodian.identifiers or ()
    )


def _isil_codes_by_name(custodians):
    """Map every ISIL code found in *custodians* to its custodian's name.

    When several custodians share a code, the last one seen wins.
    """
    codes = {}
    for custodian in custodians:
        for identifier in custodian.identifiers or ():
            if identifier.identifier_scheme == "ISIL":
                codes[identifier.identifier_value] = custodian.name
    return codes


def _cities(custodians):
    """Return the set of non-empty city values across all locations."""
    return {
        location.city
        for custodian in custodians
        for location in custodian.locations or ()
        if location.city
    }


def _type_counts(custodians):
    """Return (institution_type, count) pairs, most frequent first."""
    tally = defaultdict(int)
    for custodian in custodians:
        tally[custodian.institution_type] += 1
    return sorted(tally.items(), key=lambda item: item[1], reverse=True)


def main():
    """Compare the ISIL registry with the Dutch organizations dataset.

    Parses the two hard-coded CSV paths (relative to the current working
    directory) with their respective parsers and prints a report to
    stdout: ISIL-code overlap, up to ten sample overlapping institutions,
    city coverage, institution-type counts, data-quality counts and a
    fixed list of recommendations. Returns None.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    print("=" * _RULE_WIDTH)
    print("DUTCH HERITAGE DATASETS COMPARISON")
    print("=" * _RULE_WIDTH)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()

    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)

    print(f"✅ ISIL Registry: {len(isil_custodians):4d} institutions")
    print(f"✅ Dutch Organizations: {len(dutch_custodians):4d} institutions")
    print()

    # Extract ISIL codes from both datasets
    isil_codes_from_registry = _isil_codes_by_name(isil_custodians)
    isil_codes_from_orgs = _isil_codes_by_name(dutch_custodians)

    _print_section("🔍 ISIL Code Analysis:")
    print(f"ISIL codes in registry: {len(isil_codes_from_registry):4d}")
    print(f"ISIL codes in organizations: {len(isil_codes_from_orgs):4d}")
    print()

    # Find overlaps; dict key views support set algebra directly.
    registry_keys = isil_codes_from_registry.keys()
    orgs_keys = isil_codes_from_orgs.keys()
    common_isil_codes = registry_keys & orgs_keys
    only_in_registry = registry_keys - orgs_keys
    only_in_orgs = orgs_keys - registry_keys

    _print_section("📊 ISIL Code Overlap:")
    print(f"Common ISIL codes: {len(common_isil_codes):4d}")
    print(f"Only in registry: {len(only_in_registry):4d}")
    print(f"Only in organizations: {len(only_in_orgs):4d}")
    print()

    # Overlap percentages; skipped for an empty side to avoid dividing by 0.
    if isil_codes_from_registry:
        registry_overlap = (len(common_isil_codes) / len(isil_codes_from_registry)) * 100
        print(f"Registry overlap: {registry_overlap:.1f}%")
    if isil_codes_from_orgs:
        orgs_overlap = (len(common_isil_codes) / len(isil_codes_from_orgs)) * 100
        print(f"Organizations overlap: {orgs_overlap:.1f}%")
    print()

    # Show sample overlapping institutions (first ten codes in sorted order)
    if common_isil_codes:
        _print_section("📝 Sample Overlapping Institutions:")
        for i, isil_code in enumerate(sorted(common_isil_codes)[:10], 1):
            registry_name = isil_codes_from_registry[isil_code]
            orgs_name = isil_codes_from_orgs[isil_code]
            # "✓" = names identical in both datasets, "≈" = names differ.
            match_indicator = "✓" if registry_name == orgs_name else "≈"
            print(f"{i:2d}. {isil_code}")
            print(f" Registry: {registry_name}")
            print(f" Orgs: {orgs_name} {match_indicator}")
        # NOTE(review): nesting of this blank line was reconstructed;
        # confirm it belongs to the section rather than to each entry.
        print()

    # Geography comparison
    _print_section("🌍 Geographic Coverage:")
    isil_cities = _cities(isil_custodians)
    dutch_cities = _cities(dutch_custodians)

    print(f"Cities in ISIL registry: {len(isil_cities):4d}")
    print(f"Cities in organizations: {len(dutch_cities):4d}")
    print(f"Common cities: {len(isil_cities & dutch_cities):4d}")
    print()

    # Institution type comparison
    _print_section("🏛️ Institution Types:")
    isil_types = _type_counts(isil_custodians)
    dutch_types = _type_counts(dutch_custodians)

    print("ISIL Registry:")
    for inst_type, count in isil_types:
        print(f" {inst_type:20} {count:4d}")

    print("\nDutch Organizations:")
    for inst_type, count in dutch_types:
        print(f" {inst_type:20} {count:4d}")
    print()

    # Data quality insights
    _print_section("✨ Data Quality Insights:")

    # Organizations with both ISIL and platforms
    with_both = sum(
        1 for c in dutch_custodians
        if _has_isil(c) and c.digital_platforms
    )
    print(f"Organizations with ISIL + platforms: {with_both:4d}")

    # ISIL records missing from orgs dataset
    print(f"ISIL records not in orgs dataset: {len(only_in_registry):4d}")
    print(" → Potential for enrichment")

    # Orgs without ISIL codes: count the organizations themselves.
    # Subtracting the number of distinct codes from the number of
    # organizations miscounts when a code is shared between organizations
    # or one organization carries several ISIL identifiers.
    orgs_without_isil = sum(1 for c in dutch_custodians if not _has_isil(c))
    print(f"Organizations without ISIL codes: {orgs_without_isil:4d}")
    print(" → Candidates for ISIL assignment")
    print()

    # Recommendations
    _print_section("💡 Recommendations:")
    print("1. Merge datasets using ISIL codes as primary key")
    print("2. Enrich ISIL registry records with platform data from organizations")
    print("3. Review name mismatches in overlapping records")
    print("4. Consider geocoding both datasets for spatial analysis")
    print(f"5. {orgs_without_isil} organizations may benefit from ISIL code assignment")
    print()

    print("✅ Comparison complete!")
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|