241 lines
8.8 KiB
Python
241 lines
8.8 KiB
Python
#!/usr/bin/env python3
"""
Cross-link and merge ISIL registry with Dutch organizations dataset.

Demonstrates TIER_1 data source merging using ISIL codes as primary key.
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from glam_extractor.parsers.isil_registry import ISILRegistryParser
|
|
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
|
|
from glam_extractor.models import HeritageCustodian, DigitalPlatform, Identifier
|
|
from dataclasses import dataclass
|
|
|
|
@dataclass
class MergedRecord:
    """Outcome of cross-linking one ISIL code across the two source datasets."""

    # Custodian record chosen as the base for this institution.
    custodian: HeritageCustodian
    # ISIL code under which the sources were joined.
    isil_code: str
    # True when a record for this code came from the ISIL registry.
    in_registry: bool
    # True when a record for this code came from the organizations dataset.
    in_orgs: bool
    # Digital platforms attached to the merged record (may be empty).
    platforms: List[DigitalPlatform]
    # Human-readable notes describing how the record was assembled.
    enrichment_notes: List[str]
|
|
|
|
|
|
def merge_custodians(
    isil_custodian: Optional[HeritageCustodian],
    orgs_custodian: Optional[HeritageCustodian],
    isil_code: str
) -> MergedRecord:
    """
    Combine the registry and organizations views of one institution.

    The ISIL registry record is used as the base whenever it is present;
    digital platforms are always taken from the organizations record.

    Args:
        isil_custodian: Record from the ISIL registry, or None if absent.
        orgs_custodian: Record from the organizations dataset, or None if absent.
        isil_code: ISIL code shared by the supplied records.

    Returns:
        A MergedRecord carrying the base custodian, source flags, platforms
        and notes about how the merge was performed.

    Raises:
        ValueError: If neither custodian is provided.
    """
    found_in_registry = bool(isil_custodian)
    found_in_orgs = bool(orgs_custodian)

    if not found_in_registry and not found_in_orgs:
        raise ValueError("At least one custodian must be provided")

    # Registry-only: nothing to enrich with, so no platforms are attached.
    if not found_in_orgs:
        return MergedRecord(
            custodian=isil_custodian,
            isil_code=isil_code,
            in_registry=True,
            in_orgs=False,
            platforms=[],
            enrichment_notes=["Only in ISIL registry (no org data available)"],
        )

    org_platforms = orgs_custodian.digital_platforms or []

    # Orgs-only: the organizations record doubles as the base record.
    if not found_in_registry:
        return MergedRecord(
            custodian=orgs_custodian,
            isil_code=isil_code,
            in_registry=False,
            in_orgs=True,
            platforms=org_platforms,
            enrichment_notes=["Only in organizations dataset (not in ISIL registry)"],
        )

    # Both sources present: registry is the base, orgs contributes platforms.
    notes = [
        "Merged ISIL registry + organizations data",
        f"Added {len(org_platforms)} digital platforms from orgs dataset",
    ]

    # Record disagreements between the sources instead of resolving them.
    if isil_custodian.name != orgs_custodian.name:
        notes.append(
            f"Name variation: Registry='{isil_custodian.name}' vs "
            f"Orgs='{orgs_custodian.name}'"
        )

    # Only the first location of each record is compared.
    if isil_custodian.locations and orgs_custodian.locations:
        registry_city = isil_custodian.locations[0].city
        orgs_city = orgs_custodian.locations[0].city
        if registry_city != orgs_city:
            notes.append(
                f"Location difference: Registry={registry_city} vs "
                f"Orgs={orgs_city}"
            )

    return MergedRecord(
        custodian=isil_custodian,
        isil_code=isil_code,
        in_registry=True,
        in_orgs=True,
        platforms=org_platforms,
        enrichment_notes=notes,
    )
|
|
|
|
|
|
def _index_by_isil(custodians: List[HeritageCustodian]) -> Dict[str, HeritageCustodian]:
    """Map each custodian's first ISIL identifier value to that custodian.

    Custodians without an ISIL identifier are left out. When two custodians
    carry the same code, the one appearing later in ``custodians`` wins.
    """
    index: Dict[str, HeritageCustodian] = {}
    for custodian in custodians:
        for identifier in custodian.identifiers:
            if identifier.identifier_scheme == "ISIL":
                index[identifier.identifier_value] = custodian
                break  # only the first ISIL identifier is used as the key
    return index


def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def main() -> None:
    """Cross-link the ISIL registry with the organizations CSV and print a report.

    Parses both CSV files (paths are relative to the working directory),
    indexes the resulting custodians by ISIL code, merges them code by code
    with ``merge_custodians`` and prints merge statistics, enrichment
    examples, name conflicts and organizations lacking an ISIL code.
    """
    isil_path = Path("data/ISIL-codes_2025-08-01.csv")
    dutch_orgs_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    print("=" * 70)
    print("DUTCH HERITAGE DATASETS CROSS-LINKING")
    print("=" * 70)
    print()

    # Parse both datasets
    print("📄 Loading datasets...")
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()

    isil_custodians = isil_parser.parse_and_convert(isil_path)
    dutch_custodians = dutch_parser.parse_and_convert(dutch_orgs_path)

    print(f"✅ Loaded {len(isil_custodians)} ISIL records")
    print(f"✅ Loaded {len(dutch_custodians)} organization records")
    print()

    # Build lookup dictionaries by ISIL code
    print("🔗 Building ISIL code indexes...")

    isil_by_code = _index_by_isil(isil_custodians)
    orgs_by_code = _index_by_isil(dutch_custodians)

    print(f"✅ Indexed {len(isil_by_code)} ISIL codes from registry")
    print(f"✅ Indexed {len(orgs_by_code)} ISIL codes from organizations")
    print()

    # Merge records
    print("🔄 Cross-linking records by ISIL code...")
    all_isil_codes = set(isil_by_code) | set(orgs_by_code)

    # Sorted so the report order is deterministic across runs.
    merged_records: List[MergedRecord] = [
        merge_custodians(isil_by_code.get(code), orgs_by_code.get(code), code)
        for code in sorted(all_isil_codes)
    ]

    print(f"✅ Created {len(merged_records)} merged records")
    print()

    # Statistics
    print("📊 Merge Statistics:")
    print("-" * 70)

    total = len(merged_records)
    both = sum(1 for r in merged_records if r.in_registry and r.in_orgs)
    only_registry = sum(1 for r in merged_records if r.in_registry and not r.in_orgs)
    only_orgs = sum(1 for r in merged_records if not r.in_registry and r.in_orgs)

    # _percent guards against an empty merge result (no ISIL codes at all),
    # which would otherwise raise ZeroDivisionError here.
    print(f"In both datasets: {both:4d} ({_percent(both, total):.1f}%)")
    print(f"Only in ISIL registry: {only_registry:4d}")
    print(f"Only in organizations: {only_orgs:4d}")
    print()

    with_platforms = sum(1 for r in merged_records if r.platforms)
    print(f"Records with platforms: {with_platforms:4d} ({_percent(with_platforms, total):.1f}%)")
    print()

    # Show enrichment examples
    print("✨ Enrichment Examples (First 10 with platforms):")
    print("-" * 70)

    enriched_count = 0
    for record in merged_records:
        if enriched_count >= 10:
            break  # enough examples shown; no need to scan the rest
        if not record.platforms:
            continue

        enriched_count += 1
        print(f"{enriched_count}. {record.custodian.name}")
        print(f" ISIL: {record.isil_code}")
        # Records with platforms always come from the orgs dataset, so the
        # only alternative to "both" here is "Orgs only".
        print(f" Status: {'Registry+Orgs' if record.in_registry and record.in_orgs else 'Orgs only'}")

        if record.custodian.locations:
            print(f" Location: {record.custodian.locations[0].city}")

        print(f" Platforms: {len(record.platforms)}")
        for platform in record.platforms[:3]:  # Show up to 3 platforms
            print(f" - {platform.platform_type}: {platform.platform_name}")

        if record.enrichment_notes:
            print(f" Notes: {record.enrichment_notes[0]}")
        print()

    # Show conflict examples (name mismatches)
    print("⚠️ Name Conflict Examples:")
    print("-" * 70)

    conflicts = [
        r for r in merged_records
        if any("Name variation" in note for note in r.enrichment_notes)
    ]

    for i, record in enumerate(conflicts[:5], 1):
        print(f"{i}. {record.isil_code}")
        note = next(n for n in record.enrichment_notes if "Name variation" in n)
        print(f" {note}")
        print()

    if len(conflicts) > 5:
        print(f" ... and {len(conflicts) - 5} more name conflicts")
        print()

    # Organizations without ISIL codes (candidates for assignment)
    orgs_without_isil = [
        c for c in dutch_custodians
        if not any(i.identifier_scheme == "ISIL" for i in c.identifiers)
    ]

    print("💡 ISIL Assignment Candidates:")
    print("-" * 70)
    print(f"Organizations without ISIL codes: {len(orgs_without_isil)}")
    print()
    print("Sample candidates (first 10):")
    for i, custodian in enumerate(orgs_without_isil[:10], 1):
        location = custodian.locations[0].city if custodian.locations else "Unknown"
        print(f"{i:2d}. {custodian.name}")
        print(f" Type: {custodian.institution_type}, Location: {location}")
    print()

    # Summary recommendations
    print("🎯 Cross-linking Summary:")
    print("-" * 70)
    print(f"✅ Successfully linked {both} institutions via ISIL codes")
    print(f"✅ Enriched {with_platforms} records with digital platform data")
    print(f"⚠️ Found {len(conflicts)} name conflicts requiring review")
    print(f"💡 Identified {len(orgs_without_isil)} candidates for ISIL code assignment")
    print()

    print("Next steps:")
    print("1. Export merged records to JSON-LD/RDF for SPARQL queries")
    print("2. Review and resolve name conflicts")
    print("3. Geocode all locations for spatial analysis")
    print("4. Create provenance graph showing data lineage")
    print("5. Submit ISIL code applications for qualified organizations")
    print()

    print("✅ Cross-linking complete!")
|
|
|
|
|
|
# Run the cross-linking report only when this file is executed as a script.
if __name__ == "__main__":
    main()
|