89 lines
2.9 KiB
Python
89 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script for parsing the real Dutch organizations CSV file.
|
|
"""
|
|
|
|
from collections import Counter
from pathlib import Path

from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
|
|
|
|
def _print_statistics(custodians):
    """Print aggregate counts for the converted custodians.

    Reports the number of custodians per institution type (most frequent
    first), how many carry an ISIL identifier, how many list at least one
    digital platform, and how many distinct cities occur in their locations.

    Args:
        custodians: Re-iterable collection of converted custodian objects
            (it is traversed several times).
    """
    print("📊 Statistics:")
    print("-" * 50)

    # most_common() sorts by count, descending, keeping first-seen order
    # for ties -- the same order as sorting the items by count in reverse.
    type_counts = Counter(c.institution_type for c in custodians)
    print("Institution Types:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type:20} {count:4d}")
    print()

    # `or ()` tolerates a missing (None) identifier/location list, matching
    # the truthiness checks used when printing the sample records.
    with_isil = sum(
        1
        for c in custodians
        if any(ident.identifier_scheme == "ISIL" for ident in (c.identifiers or ()))
    )
    print(f"With ISIL codes: {with_isil:4d}")
    print()

    with_platforms = sum(1 for c in custodians if c.digital_platforms)
    print(f"With digital platforms: {with_platforms:4d}")
    print()

    cities = {
        location.city
        for c in custodians
        for location in (c.locations or ())
        if location.city
    }
    print(f"Cities represented: {len(cities):4d}")
    print()


def _print_sample_records(custodians, limit=5):
    """Print a short summary of the first ``limit`` custodians.

    For each record this prints its name and institution type, plus, when
    available, the city of its first location, its first ISIL identifier and
    its first collection-management platform. A blank line follows each
    record.

    Args:
        custodians: Sliceable sequence of converted custodian objects.
        limit: Maximum number of records to print.
    """
    print("📝 Sample Records:")
    print("-" * 50)
    for index, custodian in enumerate(custodians[:limit], 1):
        print(f"{index}. {custodian.name}")
        print(f" Type: {custodian.institution_type}")

        locations = custodian.locations
        if locations and locations[0].city:
            print(f" Location: {locations[0].city}")

        first_isil = next(
            (
                ident
                for ident in (custodian.identifiers or ())
                if ident.identifier_scheme == "ISIL"
            ),
            None,
        )
        if first_isil is not None:
            print(f" ISIL: {first_isil.identifier_value}")

        first_cms = next(
            (
                platform
                for platform in (custodian.digital_platforms or ())
                if platform.platform_type == "COLLECTION_MANAGEMENT"
            ),
            None,
        )
        if first_cms is not None:
            print(f" System: {first_cms.platform_name}")

        print()


def main():
    """Parse the sample Dutch organizations CSV and print a summary report.

    The CSV is parsed into raw records, converted into custodian models, and
    then summarised (statistics plus a handful of sample records) on stdout.

    Returns:
        int: 0 when the report was produced, 1 when the CSV file does not
        exist, so the script can signal the failure through its exit status.
    """
    csv_path = Path("data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")

    if not csv_path.exists():
        print(f"❌ CSV file not found: {csv_path}")
        return 1

    print(f"📄 Parsing: {csv_path}")
    print(f"📊 File size: {csv_path.stat().st_size / 1024:.1f} KB")
    print()

    parser = DutchOrgsParser()

    # Parse to DutchOrgRecord objects
    print("🔄 Parsing CSV to DutchOrgRecord objects...")
    records = parser.parse_file(csv_path)
    print(f"✅ Parsed {len(records)} organizations")
    print()

    # Convert to HeritageCustodian objects
    # NOTE(review): parse_and_convert receives the path again, so the file is
    # presumably read a second time -- confirm against DutchOrgsParser.
    print("🔄 Converting to HeritageCustodian models...")
    custodians = parser.parse_and_convert(csv_path)
    print(f"✅ Converted {len(custodians)} custodians")
    print()

    _print_statistics(custodians)
    _print_sample_records(custodians)

    print("✅ Parsing complete!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a missing CSV yields a non-zero exit code.
    raise SystemExit(main())
|