glam/crosslink_dutch_datasets.py
2025-11-19 23:25:22 +01:00

241 lines
8.8 KiB
Python

#!/usr/bin/env python3
"""
Cross-link and merge ISIL registry with Dutch organizations dataset.
Demonstrates TIER_1 data source merging using ISIL codes as primary key.
"""
from pathlib import Path
from typing import Dict, List, Optional
from glam_extractor.parsers.isil_registry import ISILRegistryParser
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.models import HeritageCustodian, DigitalPlatform, Identifier
from dataclasses import dataclass
@dataclass
class MergedRecord:
    """Outcome of merging the records found for a single ISIL code.

    Instances are built by ``merge_custodians``; field order is part of the
    generated constructor signature and must not be rearranged.
    """
    # Record used as the merged base: the registry record when one was
    # supplied, otherwise the organizations record.
    custodian: HeritageCustodian
    # ISIL code under which the two sources were matched.
    isil_code: str
    # True when a record for this code was supplied from the ISIL registry.
    in_registry: bool
    # True when a record for this code was supplied from the organizations dataset.
    in_orgs: bool
    # Digital platforms taken from the organizations record (empty when none).
    platforms: List[DigitalPlatform]
    # Human-readable notes describing how the record was merged and any
    # name/location differences that were detected.
    enrichment_notes: List[str]
def merge_custodians(
    isil_custodian: Optional[HeritageCustodian],
    orgs_custodian: Optional[HeritageCustodian],
    isil_code: str
) -> MergedRecord:
    """
    Merge the registry and organizations records that share one ISIL code.

    The ISIL registry record is used as the base whenever it is present;
    digital platforms are always taken from the organizations record. When
    both records are present, differences in name and in the city of the
    first listed location are recorded as enrichment notes rather than
    resolved.

    Args:
        isil_custodian: Record from the ISIL registry, or None if absent.
        orgs_custodian: Record from the organizations dataset, or None if absent.
        isil_code: ISIL code both records were looked up under.

    Returns:
        A MergedRecord describing the base record, its provenance flags,
        the platforms and the merge notes.

    Raises:
        ValueError: If both custodians are None.
    """
    # Presence is decided by identity with None, not truthiness, so a record
    # object that happens to evaluate as falsy is still treated as present.
    if isil_custodian is None and orgs_custodian is None:
        raise ValueError("At least one custodian must be provided")

    if orgs_custodian is None:
        # Registry-only: there is no organizations record to take platforms from.
        return MergedRecord(
            custodian=isil_custodian,
            isil_code=isil_code,
            in_registry=True,
            in_orgs=False,
            platforms=[],
            enrichment_notes=["Only in ISIL registry (no org data available)"],
        )

    # digital_platforms may be None/empty on the organizations record.
    platforms = orgs_custodian.digital_platforms or []

    if isil_custodian is None:
        return MergedRecord(
            custodian=orgs_custodian,
            isil_code=isil_code,
            in_registry=False,
            in_orgs=True,
            platforms=platforms,
            enrichment_notes=["Only in organizations dataset (not in ISIL registry)"],
        )

    # Both sources present: registry is the base, organizations data enriches it.
    enrichment_notes = [
        "Merged ISIL registry + organizations data",
        f"Added {len(platforms)} digital platforms from orgs dataset",
    ]

    # Check for name differences
    if isil_custodian.name != orgs_custodian.name:
        enrichment_notes.append(
            f"Name variation: Registry='{isil_custodian.name}' vs "
            f"Orgs='{orgs_custodian.name}'"
        )

    # Check for location differences (only the first location of each record
    # is compared, and only when both records list at least one location)
    if isil_custodian.locations and orgs_custodian.locations:
        registry_city = isil_custodian.locations[0].city
        orgs_city = orgs_custodian.locations[0].city
        if registry_city != orgs_city:
            enrichment_notes.append(
                f"Location difference: Registry={registry_city} vs "
                f"Orgs={orgs_city}"
            )

    return MergedRecord(
        custodian=isil_custodian,
        isil_code=isil_code,
        in_registry=True,
        in_orgs=True,
        platforms=platforms,
        enrichment_notes=enrichment_notes,
    )
def _index_by_isil(custodians: "List[HeritageCustodian]") -> "Dict[str, HeritageCustodian]":
    """Map the first ISIL identifier value of each custodian to that custodian.

    Custodians without an ISIL identifier are skipped. If two custodians
    carry the same ISIL code, the later one replaces the earlier one.
    """
    index = {}
    for custodian in custodians:
        # Tolerate a missing identifier list, like digital_platforms elsewhere.
        for identifier in custodian.identifiers or ():
            if identifier.identifier_scheme == "ISIL":
                index[identifier.identifier_value] = custodian
                break
    return index


def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return part / total * 100 if total else 0.0


def main():
    """Cross-link the ISIL registry with the Dutch organizations dataset.

    Parses both CSV files (paths are relative to the current working
    directory), indexes the resulting custodians by ISIL code, merges the
    records code by code with ``merge_custodians`` and prints a report to
    stdout: merge statistics, enrichment examples, name conflicts, and
    organizations that have no ISIL code. Nothing is returned or written
    to disk.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    print("=" * 70)
    print("DUTCH HERITAGE DATASETS CROSS-LINKING")
    print("=" * 70)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)
    print(f"✅ Loaded {len(isil_custodians)} ISIL records")
    print(f"✅ Loaded {len(dutch_custodians)} organization records")
    print()

    # Build lookup dictionaries by ISIL code
    print("🔗 Building ISIL code indexes...")
    isil_by_code = _index_by_isil(isil_custodians)
    orgs_by_code = _index_by_isil(dutch_custodians)
    print(f"✅ Indexed {len(isil_by_code)} ISIL codes from registry")
    print(f"✅ Indexed {len(orgs_by_code)} ISIL codes from organizations")
    print()

    # Merge records over the union of codes, in sorted order for stable output
    print("🔄 Cross-linking records by ISIL code...")
    all_isil_codes = set(isil_by_code) | set(orgs_by_code)
    merged_records: List[MergedRecord] = [
        merge_custodians(isil_by_code.get(code), orgs_by_code.get(code), code)
        for code in sorted(all_isil_codes)
    ]
    total = len(merged_records)
    print(f"✅ Created {total} merged records")
    print()

    # Statistics
    print("📊 Merge Statistics:")
    print("-" * 70)
    both = sum(1 for r in merged_records if r.in_registry and r.in_orgs)
    only_registry = sum(1 for r in merged_records if r.in_registry and not r.in_orgs)
    only_orgs = sum(1 for r in merged_records if not r.in_registry and r.in_orgs)
    # _percent guards against ZeroDivisionError when no ISIL codes were found.
    print(f"In both datasets: {both:4d} ({_percent(both, total):.1f}%)")
    print(f"Only in ISIL registry: {only_registry:4d}")
    print(f"Only in organizations: {only_orgs:4d}")
    print()
    enriched = [r for r in merged_records if r.platforms]
    with_platforms = len(enriched)
    print(f"Records with platforms: {with_platforms:4d} ({_percent(with_platforms, total):.1f}%)")
    print()

    # Show enrichment examples
    print("✨ Enrichment Examples (First 10 with platforms):")
    print("-" * 70)
    for position, record in enumerate(enriched[:10], 1):
        print(f"{position}. {record.custodian.name}")
        print(f" ISIL: {record.isil_code}")
        # A record with platforms always has organizations data, so the only
        # alternatives are "in both" and "organizations only".
        print(f" Status: {'Registry+Orgs' if record.in_registry and record.in_orgs else 'Orgs only'}")
        if record.custodian.locations:
            print(f" Location: {record.custodian.locations[0].city}")
        print(f" Platforms: {len(record.platforms)}")
        for platform in record.platforms[:3]:  # Show up to 3 platforms
            print(f" - {platform.platform_type}: {platform.platform_name}")
        if record.enrichment_notes:
            print(f" Notes: {record.enrichment_notes[0]}")
        print()

    # Show conflict examples (name mismatches); conflicts are recognised by
    # the "Name variation" note that merge_custodians attaches.
    print("⚠️ Name Conflict Examples:")
    print("-" * 70)
    conflicts = [r for r in merged_records
                 if any("Name variation" in note for note in r.enrichment_notes)]
    for i, record in enumerate(conflicts[:5], 1):
        print(f"{i}. {record.isil_code}")
        note = next(n for n in record.enrichment_notes if "Name variation" in n)
        print(f" {note}")
        print()
    if len(conflicts) > 5:
        print(f" ... and {len(conflicts) - 5} more name conflicts")
        print()

    # Organizations without ISIL codes (candidates for assignment)
    orgs_without_isil = [
        c for c in dutch_custodians
        if not any(i.identifier_scheme == "ISIL" for i in c.identifiers or ())
    ]
    print("💡 ISIL Assignment Candidates:")
    print("-" * 70)
    print(f"Organizations without ISIL codes: {len(orgs_without_isil)}")
    print()
    print("Sample candidates (first 10):")
    for i, custodian in enumerate(orgs_without_isil[:10], 1):
        location = custodian.locations[0].city if custodian.locations else "Unknown"
        print(f"{i:2d}. {custodian.name}")
        print(f" Type: {custodian.institution_type}, Location: {location}")
    print()

    # Summary recommendations
    print("🎯 Cross-linking Summary:")
    print("-" * 70)
    print(f"✅ Successfully linked {both} institutions via ISIL codes")
    print(f"✅ Enriched {with_platforms} records with digital platform data")
    print(f"⚠️ Found {len(conflicts)} name conflicts requiring review")
    print(f"💡 Identified {len(orgs_without_isil)} candidates for ISIL code assignment")
    print()
    print("Next steps:")
    print("1. Export merged records to JSON-LD/RDF for SPARQL queries")
    print("2. Review and resolve name conflicts")
    print("3. Geocode all locations for spatial analysis")
    print("4. Create provenance graph showing data lineage")
    print("5. Submit ISIL code applications for qualified organizations")
    print()
    print("✅ Cross-linking complete!")
# Run the cross-linking report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()