glam/scripts/parse_zcbs_list.py
2025-11-30 23:30:29 +01:00

262 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Parse ZCBS institutions list from HTML and extract structured data.
ZCBS = Zijper Collectie Beheer Systeem (Zijpe Collection Management System)
Used by 170+ heritage institutions in Netherlands and Belgium.
Source: https://www.dezijpe.nl/cgi-bin/boerderij.pl?misc=90
"""
import re
import json
from pathlib import Path
from datetime import datetime
from typing import Optional
from dataclasses import dataclass, field, asdict
@dataclass
class ZCBSInstitution:
    """A heritage institution using ZCBS.

    Mirrors one entry of the ZCBS institutions list: identity fields,
    one optional collection URL per known collection type, plus
    bookkeeping flags.  NOTE(review): the visible parsing functions in
    this module return plain dicts and never instantiate this class;
    it documents the intended record shape.
    """
    zcbs_id: str  # ZCBS identifier as shown in the list, e.g., "#001", "#002"
    name: str  # Institution name with any "(Location)" suffix stripped
    location: Optional[str] = None  # City/region extracted from the name
    country: str = "NL"  # Default Netherlands; "BE" for Belgian entries
    # Collection URLs by type (one CGI endpoint per collection)
    objects_url: Optional[str] = None
    photos_url: Optional[str] = None
    library_url: Optional[str] = None
    archaeology_url: Optional[str] = None
    bidprentjes_url: Optional[str] = None  # Memorial cards
    farms_url: Optional[str] = None
    graves_url: Optional[str] = None
    clippings_url: Optional[str] = None  # Newspaper clippings
    documents_url: Optional[str] = None
    periodicals_url: Optional[str] = None
    population_registers_url: Optional[str] = None
    films_url: Optional[str] = None
    audio_url: Optional[str] = None
    archives_url: Optional[str] = None
    buildings_url: Optional[str] = None
    family_notices_url: Optional[str] = None
    # Extra collections that don't fit the fixed URL slots above
    special_collections: list = field(default_factory=list)
    # Metadata
    intranet_only: bool = False  # True when the list says "voorlopig nog alleen op intranet"
    notes: Optional[str] = None
    # Collection types this institution uses (labels as produced by classify_url_type)
    collection_types: list = field(default_factory=list)
def extract_location_from_name(name: str) -> tuple[str, str | None]:
"""Extract location from institution name if present in parentheses."""
# Pattern: "Institution Name (Location)"
match = re.search(r'^(.+?)\s*\(([^)]+)\)\s*$', name)
if match:
return match.group(1).strip(), match.group(2).strip()
# Pattern: "Institution Name, Location"
match = re.search(r'^(.+?),\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$', name)
if match:
return match.group(1).strip(), match.group(2).strip()
return name, None
def parse_zcbs_entry(text: str, url: str | None = None) -> Optional[dict]:
    """Parse a single ZCBS list entry into a flat dict.

    Returns None when no "(#NNN)" identifier is present.  Otherwise the
    dict carries zcbs_id, name, location, country, the passed-in url,
    and an intranet_only flag derived from the Dutch marker text.
    """
    id_match = re.search(r'\(#(\d+)\)', text)
    if id_match is None:
        return None

    # The name is everything before the ID; strip anchor markup and
    # surrounding punctuation before splitting off the location.
    raw_name = text[:id_match.start()].strip()
    raw_name = re.sub(r'<a[^>]*>([^<]+)</a>', r'\1', raw_name)
    raw_name = raw_name.strip(' ,;:\n\t')
    name, location = extract_location_from_name(raw_name)

    lowered = text.lower()
    return {
        "zcbs_id": f"#{id_match.group(1)}",
        "name": name,
        "location": location,
        # Any "belgi..." mention flags a Belgian entry; NL otherwise.
        "country": "BE" if "belgi" in lowered else "NL",
        "url": url,
        "intranet_only": "voorlopig nog alleen op intranet" in lowered,
    }
def parse_zcbs_html(html_content: str) -> list[dict]:
    """Parse the full ZCBS institutions list HTML.

    Extracts one record per institution, merging multiple <li> entries
    that share the same ZCBS id (an institution can expose several
    collection URLs).  Each record is a dict with keys: zcbs_id, name,
    location, country, urls, collection_types, intranet_only.

    Fix: removed the dead `collection_sections` letter-to-type dict
    (defined but never read); URL de-duplication now uses a set lookup
    instead of rebuilding a list per membership test.
    """
    institutions: dict[str, dict] = {}  # keyed by zcbs_id to merge multiple entries

    # Linked entries look like: <li> <a href="URL" ...>Name</a> &nbsp; (#NNN)
    entry_pattern = r'<li>\s*<a href="([^"]+)"[^>]*>([^<]+)</a>\s*(?:&nbsp;)?\s*\(#(\d+)\)'
    for match in re.finditer(entry_pattern, html_content, re.IGNORECASE):
        url = match.group(1)
        name = match.group(2).strip()
        zcbs_id = f"#{match.group(3)}"
        clean_name, location = extract_location_from_name(name)
        # Heuristic: a "belgi..." mention within 100 chars after the entry
        # marks a Belgian institution; everything else defaults to NL.
        window = html_content[match.start():match.end() + 100].lower()
        country = "BE" if "belgi" in window else "NL"
        # First sighting of this id creates the record; later sightings
        # only contribute additional URLs.
        record = institutions.setdefault(zcbs_id, {
            "zcbs_id": zcbs_id,
            "name": clean_name,
            "location": location,
            "country": country,
            "urls": [],
            "collection_types": [],
            "intranet_only": False,
        })
        if url and url not in {u["url"] for u in record["urls"]}:
            record["urls"].append({
                "url": url,
                "type": classify_url_type(url),
            })

    # Flag institutions that are (for now) only reachable on the intranet.
    intranet_pattern = r'\(#(\d+);?\s*voorlopig nog alleen op intranet\)'
    for match in re.finditer(intranet_pattern, html_content, re.IGNORECASE):
        zcbs_id = f"#{match.group(1)}"
        if zcbs_id in institutions:
            institutions[zcbs_id]["intranet_only"] = True

    # Unlinked entries: a name with no <a> tag, marked intranet-only.
    intranet_name_pattern = r'<li>\s*([^<\(]+)\s*\(#(\d+);?\s*voorlopig nog alleen op intranet\)'
    for match in re.finditer(intranet_name_pattern, html_content, re.IGNORECASE):
        name = match.group(1).strip()
        zcbs_id = f"#{match.group(2)}"
        if zcbs_id not in institutions:
            clean_name, location = extract_location_from_name(name)
            institutions[zcbs_id] = {
                "zcbs_id": zcbs_id,
                "name": clean_name,
                "location": location,
                "country": "NL",
                "urls": [],
                "collection_types": [],
                "intranet_only": True,
            }
        else:
            institutions[zcbs_id]["intranet_only"] = True

    return list(institutions.values())
def classify_url_type(url: str) -> str:
    """Classify URL by collection type based on CGI script name.

    The Dutch script name embedded in the URL (e.g. "objecten.pl",
    "beeldbank.pl") identifies the collection.  The first matching
    substring wins; URLs matching nothing yield "unknown".
    """
    lowered = url.lower()
    # Ordered (substring, type) pairs — earlier pairs take priority.
    # NOTE: some entries ("fotos", "graven", "trouw") can never fire
    # because a shorter prefix/substring above maps to the same type.
    keyword_types = (
        ("objecten", "objects"),
        ("voorwerpen", "objects"),
        ("collectie", "objects"),
        ("museum", "objects"),
        ("beeldbank", "photos"),
        ("foto", "photos"),
        ("fotos", "photos"),
        ("library", "library"),
        ("boeken", "library"),
        ("bibliotheek", "library"),
        ("archeo", "archaeology"),
        ("bidprent", "bidprentjes"),
        ("boerderij", "farms"),
        ("graf", "graves"),
        ("graven", "graves"),
        ("knipsels", "clippings"),
        ("advertent", "clippings"),
        ("artikelen", "clippings"),
        ("kranten", "clippings"),
        ("document", "documents"),
        ("archief", "archives"),
        ("periodiek", "periodicals"),
        ("magazine", "periodicals"),
        ("kwartaalblad", "periodicals"),
        ("tijdschrift", "periodicals"),
        ("bevolking", "population_registers"),
        ("film", "films"),
        ("video", "films"),
        ("audio", "audio"),
        ("dialect", "audio"),
        ("huizen", "buildings"),
        ("gebouw", "buildings"),
        ("monument", "buildings"),
        ("familie", "family_notices"),
        ("rouw", "family_notices"),
        ("trouw", "family_notices"),
        ("geboorte", "family_notices"),
    )
    return next(
        (coll_type for keyword, coll_type in keyword_types if keyword in lowered),
        "unknown",
    )
def main() -> None:
    """CLI entry point: report that the parser module is ready.

    This module is primarily a library; the real work happens via
    parse_zcbs_html() with fetched HTML content.

    Fixes: removed the unused hard-coded user-specific `html_path` and
    the unused `sample_html` literal; removed the second print, which
    reported len(classify_url_type.__doc__) — the docstring's character
    count — as a "patterns supported" figure, which was simply wrong.
    """
    print("ZCBS Parser ready. Use parse_zcbs_html() with HTML content.")


if __name__ == "__main__":
    main()