#!/usr/bin/env python3 """ Parse ZCBS institutions list from HTML and extract structured data. ZCBS = Zijper Collectie Beheer Systeem (Zijpe Collection Management System) Used by 170+ heritage institutions in Netherlands and Belgium. Source: https://www.dezijpe.nl/cgi-bin/boerderij.pl?misc=90 """ import re import json from pathlib import Path from datetime import datetime from typing import Optional from dataclasses import dataclass, field, asdict @dataclass class ZCBSInstitution: """A heritage institution using ZCBS.""" zcbs_id: str # e.g., "#001", "#002" name: str location: Optional[str] = None # City/region country: str = "NL" # Default Netherlands # Collection URLs by type objects_url: Optional[str] = None photos_url: Optional[str] = None library_url: Optional[str] = None archaeology_url: Optional[str] = None bidprentjes_url: Optional[str] = None # Memorial cards farms_url: Optional[str] = None graves_url: Optional[str] = None clippings_url: Optional[str] = None # Newspaper clippings documents_url: Optional[str] = None periodicals_url: Optional[str] = None population_registers_url: Optional[str] = None films_url: Optional[str] = None audio_url: Optional[str] = None archives_url: Optional[str] = None buildings_url: Optional[str] = None family_notices_url: Optional[str] = None special_collections: list = field(default_factory=list) # Metadata intranet_only: bool = False notes: Optional[str] = None # Collection types this institution uses collection_types: list = field(default_factory=list) def extract_location_from_name(name: str) -> tuple[str, str | None]: """Extract location from institution name if present in parentheses.""" # Pattern: "Institution Name (Location)" match = re.search(r'^(.+?)\s*\(([^)]+)\)\s*$', name) if match: return match.group(1).strip(), match.group(2).strip() # Pattern: "Institution Name, Location" match = re.search(r'^(.+?),\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$', name) if match: return match.group(1).strip(), match.group(2).strip() return name, None def parse_zcbs_entry(text: str, url: str | None = None) -> Optional[dict]: """Parse a single ZCBS entry from the list.""" # Extract ZCBS ID (e.g., "#001", "#002") id_match = re.search(r'\(#(\d+)\)', text) if not id_match: return None zcbs_id = f"#{id_match.group(1)}" # Check if intranet only intranet_only = "voorlopig nog alleen op intranet" in text.lower() # Extract name (before the ID) name_part = text[:id_match.start()].strip() # Remove URL anchor text if present name_part = re.sub(r']*>([^<]+)', r'\1', name_part) name_part = name_part.strip(' ,;:\n\t') # Extract location name, location = extract_location_from_name(name_part) # Detect country (Belgium entries) country = "BE" if "belgi" in text.lower() else "NL" return { "zcbs_id": zcbs_id, "name": name, "location": location, "country": country, "url": url, "intranet_only": intranet_only } def parse_zcbs_html(html_content: str) -> list[dict]: """Parse the full ZCBS institutions list HTML.""" institutions = {} # keyed by zcbs_id to merge multiple entries # Collection type markers collection_sections = { "A": "objects", "B": "photos", "C": "library", "D": "archaeology", "E": "bidprentjes", "F": "farms", "G": "graves", "H": "clippings", "I": "documents", "J": "periodicals", "K": "population_registers", "L": "films", "M": "audio", "N": "archives", "O": "buildings", "P": "family_notices", "Q": "special" } # Parse each
  • entry with URL and ID pattern = r'
  • \s*]*>([^<]+)\s*(?: )?\s*\(#(\d+)\)' for match in re.finditer(pattern, html_content, re.IGNORECASE): url = match.group(1) name = match.group(2).strip() zcbs_id = f"#{match.group(3)}" # Extract location from name clean_name, location = extract_location_from_name(name) # Detect country country = "BE" if "belgi" in html_content[match.start():match.end()+100].lower() else "NL" # Create or update institution if zcbs_id not in institutions: institutions[zcbs_id] = { "zcbs_id": zcbs_id, "name": clean_name, "location": location, "country": country, "urls": [], "collection_types": [], "intranet_only": False } # Add URL if url and url not in [u["url"] for u in institutions[zcbs_id]["urls"]]: institutions[zcbs_id]["urls"].append({ "url": url, "type": classify_url_type(url) }) # Also find intranet-only entries intranet_pattern = r'\(#(\d+);?\s*voorlopig nog alleen op intranet\)' for match in re.finditer(intranet_pattern, html_content, re.IGNORECASE): zcbs_id = f"#{match.group(1)}" if zcbs_id in institutions: institutions[zcbs_id]["intranet_only"] = True # Find entries without links (intranet only, with names) intranet_name_pattern = r'
  • \s*([^<\(]+)\s*\(#(\d+);?\s*voorlopig nog alleen op intranet\)' for match in re.finditer(intranet_name_pattern, html_content, re.IGNORECASE): name = match.group(1).strip() zcbs_id = f"#{match.group(2)}" if zcbs_id not in institutions: clean_name, location = extract_location_from_name(name) institutions[zcbs_id] = { "zcbs_id": zcbs_id, "name": clean_name, "location": location, "country": "NL", "urls": [], "collection_types": [], "intranet_only": True } else: institutions[zcbs_id]["intranet_only"] = True return list(institutions.values()) def classify_url_type(url: str) -> str: """Classify URL by collection type based on CGI script name.""" url_lower = url.lower() type_mappings = { "objecten": "objects", "voorwerpen": "objects", "collectie": "objects", "museum": "objects", "beeldbank": "photos", "foto": "photos", "fotos": "photos", "library": "library", "boeken": "library", "bibliotheek": "library", "archeo": "archaeology", "bidprent": "bidprentjes", "boerderij": "farms", "graf": "graves", "graven": "graves", "knipsels": "clippings", "advertent": "clippings", "artikelen": "clippings", "kranten": "clippings", "document": "documents", "archief": "archives", "periodiek": "periodicals", "magazine": "periodicals", "kwartaalblad": "periodicals", "tijdschrift": "periodicals", "bevolking": "population_registers", "film": "films", "video": "films", "audio": "audio", "dialect": "audio", "huizen": "buildings", "gebouw": "buildings", "monument": "buildings", "familie": "family_notices", "rouw": "family_notices", "trouw": "family_notices", "geboorte": "family_notices", } for pattern, coll_type in type_mappings.items(): if pattern in url_lower: return coll_type return "unknown" def main(): """Main entry point.""" # Read HTML file html_path = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs") # We'll parse from the raw HTML content (passed via stdin or file) # For now, output the parsing logic # Sample parsing of the embedded HTML sample_html = """
  • Zijper Museum, Schagerbrug   (#001)
  • Historische Vereniging Oud Broek in Waterland   (#002) """ print("ZCBS Parser ready. Use parse_zcbs_html() with HTML content.") print(f"Collection types supported: {len(classify_url_type.__doc__ or '')} patterns") if __name__ == "__main__": main()