glam/scripts/parse_zcbs_list.py
2025-11-30 23:30:29 +01:00

262 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Parse ZCBS institutions list from HTML and extract structured data.
ZCBS = Zijper Collectie Beheer Systeem (Zijpe Collection Management System)
Used by 170+ heritage institutions in Netherlands and Belgium.
Source: https://www.dezijpe.nl/cgi-bin/boerderij.pl?misc=90
"""
import re
import json
from pathlib import Path
from datetime import datetime
from typing import Optional
from dataclasses import dataclass, field, asdict
@dataclass
class ZCBSInstitution:
    """A heritage institution using ZCBS.

    Mirrors one entry of the ZCBS institutions list: identity fields,
    one optional collection URL per known collection type, plus
    bookkeeping flags.  NOTE(review): the visible parsing functions in
    this module return plain dicts and never instantiate this class;
    it documents the intended record shape.
    """
    zcbs_id: str  # ZCBS identifier as shown in the list, e.g., "#001", "#002"
    name: str  # Institution name with any "(Location)" suffix stripped
    location: Optional[str] = None  # City/region extracted from the name
    country: str = "NL"  # Default Netherlands; "BE" for Belgian entries
    # Collection URLs by type (one CGI endpoint per collection)
    objects_url: Optional[str] = None
    photos_url: Optional[str] = None
    library_url: Optional[str] = None
    archaeology_url: Optional[str] = None
    bidprentjes_url: Optional[str] = None  # Memorial cards
    farms_url: Optional[str] = None
    graves_url: Optional[str] = None
    clippings_url: Optional[str] = None  # Newspaper clippings
    documents_url: Optional[str] = None
    periodicals_url: Optional[str] = None
    population_registers_url: Optional[str] = None
    films_url: Optional[str] = None
    audio_url: Optional[str] = None
    archives_url: Optional[str] = None
    buildings_url: Optional[str] = None
    family_notices_url: Optional[str] = None
    # Extra collections that don't fit the fixed URL slots above
    special_collections: list = field(default_factory=list)
    # Metadata
    intranet_only: bool = False  # True when the list says "voorlopig nog alleen op intranet"
    notes: Optional[str] = None
    # Collection types this institution uses (labels as produced by classify_url_type)
    collection_types: list = field(default_factory=list)
def extract_location_from_name(name: str) -> tuple[str, str | None]:
"""Extract location from institution name if present in parentheses."""
# Pattern: "Institution Name (Location)"
match = re.search(r'^(.+?)\s*\(([^)]+)\)\s*$', name)
if match:
return match.group(1).strip(), match.group(2).strip()
# Pattern: "Institution Name, Location"
match = re.search(r'^(.+?),\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$', name)
if match:
return match.group(1).strip(), match.group(2).strip()
return name, None
def parse_zcbs_entry(text: str, url: str | None = None) -> Optional[dict]:
    """Parse a single ZCBS list entry into a flat dict.

    Returns None when no "(#NNN)" identifier is present.  Otherwise the
    dict carries zcbs_id, name, location, country, the passed-in url,
    and an intranet_only flag derived from the Dutch marker text.
    """
    id_match = re.search(r'\(#(\d+)\)', text)
    if id_match is None:
        return None

    # The name is everything before the ID; strip anchor markup and
    # surrounding punctuation before splitting off the location.
    raw_name = text[:id_match.start()].strip()
    raw_name = re.sub(r'<a[^>]*>([^<]+)</a>', r'\1', raw_name)
    raw_name = raw_name.strip(' ,;:\n\t')
    name, location = extract_location_from_name(raw_name)

    lowered = text.lower()
    return {
        "zcbs_id": f"#{id_match.group(1)}",
        "name": name,
        "location": location,
        # Any "belgi..." mention flags a Belgian entry; NL otherwise.
        "country": "BE" if "belgi" in lowered else "NL",
        "url": url,
        "intranet_only": "voorlopig nog alleen op intranet" in lowered,
    }
def parse_zcbs_html(html_content: str) -> list[dict]:
    """Parse the full ZCBS institutions list HTML.

    Extracts one record per institution, merging multiple <li> entries
    that share the same ZCBS id (an institution can expose several
    collection URLs).  Each record is a dict with keys: zcbs_id, name,
    location, country, urls, collection_types, intranet_only.

    Fix: removed the dead `collection_sections` letter-to-type dict
    (defined but never read); URL de-duplication now uses a set lookup
    instead of rebuilding a list per membership test.
    """
    institutions: dict[str, dict] = {}  # keyed by zcbs_id to merge multiple entries

    # Linked entries look like: <li> <a href="URL" ...>Name</a> &nbsp; (#NNN)
    entry_pattern = r'<li>\s*<a href="([^"]+)"[^>]*>([^<]+)</a>\s*(?:&nbsp;)?\s*\(#(\d+)\)'
    for match in re.finditer(entry_pattern, html_content, re.IGNORECASE):
        url = match.group(1)
        name = match.group(2).strip()
        zcbs_id = f"#{match.group(3)}"
        clean_name, location = extract_location_from_name(name)
        # Heuristic: a "belgi..." mention within 100 chars after the entry
        # marks a Belgian institution; everything else defaults to NL.
        window = html_content[match.start():match.end() + 100].lower()
        country = "BE" if "belgi" in window else "NL"
        # First sighting of this id creates the record; later sightings
        # only contribute additional URLs.
        record = institutions.setdefault(zcbs_id, {
            "zcbs_id": zcbs_id,
            "name": clean_name,
            "location": location,
            "country": country,
            "urls": [],
            "collection_types": [],
            "intranet_only": False,
        })
        if url and url not in {u["url"] for u in record["urls"]}:
            record["urls"].append({
                "url": url,
                "type": classify_url_type(url),
            })

    # Flag institutions that are (for now) only reachable on the intranet.
    intranet_pattern = r'\(#(\d+);?\s*voorlopig nog alleen op intranet\)'
    for match in re.finditer(intranet_pattern, html_content, re.IGNORECASE):
        zcbs_id = f"#{match.group(1)}"
        if zcbs_id in institutions:
            institutions[zcbs_id]["intranet_only"] = True

    # Unlinked entries: a name with no <a> tag, marked intranet-only.
    intranet_name_pattern = r'<li>\s*([^<\(]+)\s*\(#(\d+);?\s*voorlopig nog alleen op intranet\)'
    for match in re.finditer(intranet_name_pattern, html_content, re.IGNORECASE):
        name = match.group(1).strip()
        zcbs_id = f"#{match.group(2)}"
        if zcbs_id not in institutions:
            clean_name, location = extract_location_from_name(name)
            institutions[zcbs_id] = {
                "zcbs_id": zcbs_id,
                "name": clean_name,
                "location": location,
                "country": "NL",
                "urls": [],
                "collection_types": [],
                "intranet_only": True,
            }
        else:
            institutions[zcbs_id]["intranet_only"] = True

    return list(institutions.values())
def classify_url_type(url: str) -> str:
    """Classify URL by collection type based on CGI script name.

    The Dutch script name embedded in the URL (e.g. "objecten.pl",
    "beeldbank.pl") identifies the collection.  The first matching
    substring wins; URLs matching nothing yield "unknown".
    """
    lowered = url.lower()
    # Ordered (substring, type) pairs — earlier pairs take priority.
    # NOTE: some entries ("fotos", "graven", "trouw") can never fire
    # because a shorter prefix/substring above maps to the same type.
    keyword_types = (
        ("objecten", "objects"),
        ("voorwerpen", "objects"),
        ("collectie", "objects"),
        ("museum", "objects"),
        ("beeldbank", "photos"),
        ("foto", "photos"),
        ("fotos", "photos"),
        ("library", "library"),
        ("boeken", "library"),
        ("bibliotheek", "library"),
        ("archeo", "archaeology"),
        ("bidprent", "bidprentjes"),
        ("boerderij", "farms"),
        ("graf", "graves"),
        ("graven", "graves"),
        ("knipsels", "clippings"),
        ("advertent", "clippings"),
        ("artikelen", "clippings"),
        ("kranten", "clippings"),
        ("document", "documents"),
        ("archief", "archives"),
        ("periodiek", "periodicals"),
        ("magazine", "periodicals"),
        ("kwartaalblad", "periodicals"),
        ("tijdschrift", "periodicals"),
        ("bevolking", "population_registers"),
        ("film", "films"),
        ("video", "films"),
        ("audio", "audio"),
        ("dialect", "audio"),
        ("huizen", "buildings"),
        ("gebouw", "buildings"),
        ("monument", "buildings"),
        ("familie", "family_notices"),
        ("rouw", "family_notices"),
        ("trouw", "family_notices"),
        ("geboorte", "family_notices"),
    )
    return next(
        (coll_type for keyword, coll_type in keyword_types if keyword in lowered),
        "unknown",
    )
def main() -> None:
    """CLI entry point: report that the parser module is ready.

    This module is primarily a library; the real work happens via
    parse_zcbs_html() with fetched HTML content.

    Fixes: removed the unused hard-coded user-specific `html_path` and
    the unused `sample_html` literal; removed the second print, which
    reported len(classify_url_type.__doc__) — the docstring's character
    count — as a "patterns supported" figure, which was simply wrong.
    """
    print("ZCBS Parser ready. Use parse_zcbs_html() with HTML content.")


if __name__ == "__main__":
    main()