#!/usr/bin/env python3
"""
Test script for parsing the real Dutch organizations CSV file.
"""

from collections import Counter
from pathlib import Path

from glam_extractor.parsers.dutch_orgs import DutchOrgsParser


def _print_statistics(custodians) -> None:
    """Print aggregate counts (types, ISIL, platforms, cities) for *custodians*."""
    print("📊 Statistics:")
    print("-" * 50)

    # Count by institution type; most_common() orders by descending count and
    # keeps first-seen order among ties, like a stable reverse sort would.
    type_counts = Counter(c.institution_type for c in custodians)
    print("Institution Types:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type:20} {count:4d}")
    print()

    # Count with ISIL codes. `or []` tolerates a custodian whose identifiers
    # attribute is None/empty, matching the guards used for sample records.
    with_isil = sum(
        1
        for c in custodians
        if any(i.identifier_scheme == "ISIL" for i in (c.identifiers or []))
    )
    print(f"With ISIL codes: {with_isil:4d}")
    print()

    # Count with digital platforms
    with_platforms = sum(1 for c in custodians if c.digital_platforms)
    print(f"With digital platforms: {with_platforms:4d}")
    print()

    # Cities represented (distinct, non-empty city values over all locations)
    cities = {
        location.city
        for c in custodians
        for location in (c.locations or [])
        if location.city
    }
    print(f"Cities represented: {len(cities):4d}")
    print()


def _print_samples(custodians, limit: int = 5) -> None:
    """Print name, type and any location/ISIL/system for the first *limit* custodians."""
    print("📝 Sample Records:")
    print("-" * 50)

    for i, custodian in enumerate(custodians[:limit], 1):
        print(f"{i}. {custodian.name}")
        print(f" Type: {custodian.institution_type}")

        # Only the first location is shown, and only when it has a city.
        if custodian.locations and custodian.locations[0].city:
            print(f" Location: {custodian.locations[0].city}")

        if custodian.identifiers:
            isil = [
                ident
                for ident in custodian.identifiers
                if ident.identifier_scheme == "ISIL"
            ]
            if isil:
                print(f" ISIL: {isil[0].identifier_value}")

        if custodian.digital_platforms:
            cms = [
                platform
                for platform in custodian.digital_platforms
                if platform.platform_type == "COLLECTION_MANAGEMENT"
            ]
            if cms:
                print(f" System: {cms[0].platform_name}")

        print()


def main() -> None:
    """Parse the example organizations CSV and print a summary report.

    Prints an error message and returns early when the CSV file does not
    exist. Otherwise parses the file into records, converts it into custodian
    models, and prints statistics followed by the first five records.
    """
    csv_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    if not csv_path.exists():
        print(f"❌ CSV file not found: {csv_path}")
        return

    print(f"📄 Parsing: {csv_path}")
    print(f"📊 File size: {csv_path.stat().st_size / 1024:.1f} KB")
    print()

    parser = DutchOrgsParser()

    # Parse to DutchOrgRecord objects
    print("🔄 Parsing CSV to DutchOrgRecord objects...")
    records = parser.parse_file(csv_path)
    print(f"✅ Parsed {len(records)} organizations")
    print()

    # Convert to HeritageCustodian objects
    # NOTE(review): the path is handed to the parser a second time here;
    # whether parse_and_convert re-reads the file is not visible from this script.
    print("🔄 Converting to HeritageCustodian models...")
    custodians = parser.parse_and_convert(csv_path)
    print(f"✅ Converted {len(custodians)} custodians")
    print()

    _print_statistics(custodians)
    _print_samples(custodians)

    print("✅ Parsing complete!")


if __name__ == "__main__":
    main()