glam/test_real_dutch_orgs.py
2025-11-19 23:25:22 +01:00

89 lines
2.9 KiB
Python

#!/usr/bin/env python3
"""
Manual smoke test: parse the real Dutch organizations CSV file and print
summary statistics and a few sample records.
"""
from pathlib import Path
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
def main():
    """Parse the real Dutch organizations CSV and print a summary report.

    The CSV path is fixed relative to the current working directory. If the
    file does not exist, an error line is printed and the function returns
    without doing anything else.

    Otherwise the function calls both ``DutchOrgsParser.parse_file`` and
    ``DutchOrgsParser.parse_and_convert`` on the same path and prints:

    * the number of parsed records and converted custodians,
    * a count per institution type, most frequent first,
    * how many custodians carry an ISIL identifier,
    * how many custodians have at least one digital platform,
    * the number of distinct cities,
    * the first five custodians as sample records.

    Returns:
        None. All output goes to stdout.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports are left as they are.
    from collections import Counter

    csv_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    if not csv_path.exists():
        print(f"❌ CSV file not found: {csv_path}")
        return

    print(f"📄 Parsing: {csv_path}")
    print(f"📊 File size: {csv_path.stat().st_size / 1024:.1f} KB")
    print()

    parser = DutchOrgsParser()

    # Parse to DutchOrgRecord objects
    print("🔄 Parsing CSV to DutchOrgRecord objects...")
    records = parser.parse_file(csv_path)
    print(f"✅ Parsed {len(records)} organizations")
    print()

    # Convert to HeritageCustodian objects
    print("🔄 Converting to HeritageCustodian models...")
    custodians = parser.parse_and_convert(csv_path)
    print(f"✅ Converted {len(custodians)} custodians")
    print()

    # Statistics
    print("📊 Statistics:")
    print("-" * 50)

    # Count by institution type. most_common() sorts by descending count and
    # keeps first-seen order among ties, matching a stable reverse sort.
    type_counts = Counter(c.institution_type for c in custodians)
    print("Institution Types:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type:20} {count:4d}")
    print()

    # Count with ISIL codes. `or ()` tolerates a missing/empty identifier
    # collection, consistent with the truthiness guards in the sample section.
    with_isil = sum(
        1
        for c in custodians
        if any(i.identifier_scheme == "ISIL" for i in c.identifiers or ())
    )
    print(f"With ISIL codes: {with_isil:4d}")
    print()

    # Count with digital platforms
    with_platforms = sum(1 for c in custodians if c.digital_platforms)
    print(f"With digital platforms: {with_platforms:4d}")
    print()

    # Cities represented (distinct, non-empty city values over all locations)
    cities = {
        location.city
        for c in custodians
        for location in c.locations or ()
        if location.city
    }
    print(f"Cities represented: {len(cities):4d}")
    print()

    # Sample records
    print("📝 Sample Records:")
    print("-" * 50)
    for i, custodian in enumerate(custodians[:5], 1):
        print(f"{i}. {custodian.name}")
        print(f" Type: {custodian.institution_type}")

        if custodian.locations and custodian.locations[0].city:
            print(f" Location: {custodian.locations[0].city}")

        if custodian.identifiers:
            # Only the first ISIL identifier is shown.
            isil = [i for i in custodian.identifiers if i.identifier_scheme == "ISIL"]
            if isil:
                print(f" ISIL: {isil[0].identifier_value}")

        if custodian.digital_platforms:
            # Only the first collection-management platform is shown.
            cms = [
                p
                for p in custodian.digital_platforms
                if p.platform_type == "COLLECTION_MANAGEMENT"
            ]
            if cms:
                print(f" System: {cms[0].platform_name}")

        print()

    print("✅ Parsing complete!")
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()