262 lines
8.4 KiB
Python
262 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Parse ZCBS institutions list from HTML and extract structured data.
|
|
|
|
ZCBS = Zijper Collectie Beheer Systeem (Zijpe Collection Management System)
|
|
Used by 170+ heritage institutions in the Netherlands and Belgium.
|
|
|
|
Source: https://www.dezijpe.nl/cgi-bin/boerderij.pl?misc=90
|
|
"""
|
|
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
from dataclasses import dataclass, field, asdict
|
|
|
|
|
|
@dataclass
|
|
class ZCBSInstitution:
|
|
"""A heritage institution using ZCBS."""
|
|
zcbs_id: str # e.g., "#001", "#002"
|
|
name: str
|
|
location: Optional[str] = None # City/region
|
|
country: str = "NL" # Default Netherlands
|
|
|
|
# Collection URLs by type
|
|
objects_url: Optional[str] = None
|
|
photos_url: Optional[str] = None
|
|
library_url: Optional[str] = None
|
|
archaeology_url: Optional[str] = None
|
|
bidprentjes_url: Optional[str] = None # Memorial cards
|
|
farms_url: Optional[str] = None
|
|
graves_url: Optional[str] = None
|
|
clippings_url: Optional[str] = None # Newspaper clippings
|
|
documents_url: Optional[str] = None
|
|
periodicals_url: Optional[str] = None
|
|
population_registers_url: Optional[str] = None
|
|
films_url: Optional[str] = None
|
|
audio_url: Optional[str] = None
|
|
archives_url: Optional[str] = None
|
|
buildings_url: Optional[str] = None
|
|
family_notices_url: Optional[str] = None
|
|
special_collections: list = field(default_factory=list)
|
|
|
|
# Metadata
|
|
intranet_only: bool = False
|
|
notes: Optional[str] = None
|
|
|
|
# Collection types this institution uses
|
|
collection_types: list = field(default_factory=list)
|
|
|
|
|
|
def extract_location_from_name(name: str) -> tuple[str, str | None]:
|
|
"""Extract location from institution name if present in parentheses."""
|
|
# Pattern: "Institution Name (Location)"
|
|
match = re.search(r'^(.+?)\s*\(([^)]+)\)\s*$', name)
|
|
if match:
|
|
return match.group(1).strip(), match.group(2).strip()
|
|
|
|
# Pattern: "Institution Name, Location"
|
|
match = re.search(r'^(.+?),\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)$', name)
|
|
if match:
|
|
return match.group(1).strip(), match.group(2).strip()
|
|
|
|
return name, None
|
|
|
|
|
|
def parse_zcbs_entry(text: str, url: str | None = None) -> Optional[dict]:
    """Parse a single ZCBS entry from the list.

    Args:
        text: One list entry, either plain text or an HTML fragment. It must
            contain the ZCBS number as ``(#001)`` or as
            ``(#001; voorlopig nog alleen op intranet)``.
        url: Collection URL belonging to the entry; passed through unchanged.

    Returns:
        A dict with the keys ``zcbs_id``, ``name``, ``location``, ``country``,
        ``url`` and ``intranet_only``, or ``None`` when ``text`` contains no
        ZCBS number.
    """
    # The ID is closed either by ")" or, for intranet-only entries, by the
    # ";" that introduces the "voorlopig nog alleen op intranet" remark.
    id_match = re.search(r'\(#(\d+)\s*[;)]', text)
    if not id_match:
        return None

    zcbs_id = f"#{id_match.group(1)}"
    lowered = text.lower()

    # "voorlopig nog alleen op intranet" = "for now only on the intranet".
    intranet_only = "voorlopig nog alleen op intranet" in lowered

    # Everything before the ID is the (possibly marked-up) institution name.
    name_part = text[:id_match.start()]

    # Drop any markup (<li>, <a ...>, </a>, ...) but keep the enclosed text,
    # and turn non-breaking-space entities into ordinary spaces.
    name_part = re.sub(r'<[^>]+>', '', name_part)
    name_part = name_part.replace('&nbsp;', ' ')
    name_part = name_part.strip().strip(' ,;:\n\t')

    name, location = extract_location_from_name(name_part)

    # Any mention of Belgium ("België", "Belgie", "Belgium") marks the entry
    # as Belgian; everything else defaults to the Netherlands.
    country = "BE" if "belgi" in lowered else "NL"

    return {
        "zcbs_id": zcbs_id,
        "name": name,
        "location": location,
        "country": country,
        "url": url,
        "intranet_only": intranet_only,
    }
|
|
|
|
|
|
def _detect_country(html_content: str, start: int, end: int) -> str:
    """Return "BE" when the entry at ``start:end`` mentions Belgium, else "NL".

    Up to 100 characters following the entry are inspected as well, but never
    beyond the start of the next ``<li`` item, so a neighbouring entry cannot
    influence the result.
    """
    tail = html_content[end:end + 100]
    next_item = tail.lower().find("<li")
    if next_item != -1:
        tail = tail[:next_item]
    context = (html_content[start:end] + tail).lower()
    return "BE" if "belgi" in context else "NL"


def _new_institution(zcbs_id: str, raw_name: str, country: str,
                     *, intranet_only: bool) -> dict:
    """Build the record for a newly seen institution from its raw list name."""
    clean_name, location = extract_location_from_name(raw_name)
    return {
        "zcbs_id": zcbs_id,
        "name": clean_name,
        "location": location,
        "country": country,
        "urls": [],
        "collection_types": [],
        "intranet_only": intranet_only,
    }


def parse_zcbs_html(html_content: str) -> list[dict]:
    """Parse the full ZCBS institutions list HTML.

    Entries that share a ZCBS number are merged into one record whose
    ``urls`` list collects every distinct link found for that number.

    Args:
        html_content: HTML of the institutions list.

    Returns:
        One dict per institution, in order of first appearance, with the keys
        ``zcbs_id``, ``name``, ``location``, ``country``, ``urls`` (a list of
        ``{"url", "type"}`` dicts), ``collection_types`` and
        ``intranet_only``. ``collection_types`` is initialised empty and is
        not filled in by this function.
    """
    institutions: dict[str, dict] = {}  # keyed by zcbs_id to merge entries

    # Linked entry: <li> <a href="URL" ...>Name</a> (#NNN). Whitespace or
    # literal &nbsp; entities may separate the link from the ID.
    linked_entry = re.compile(
        r'<li>\s*<a href="([^"]+)"[^>]*>([^<]+)</a>(?:\s|&nbsp;)*\(#(\d+)\)',
        re.IGNORECASE,
    )
    for match in linked_entry.finditer(html_content):
        url = match.group(1)
        zcbs_id = f"#{match.group(3)}"

        if zcbs_id not in institutions:
            institutions[zcbs_id] = _new_institution(
                zcbs_id,
                match.group(2).strip(),
                _detect_country(html_content, match.start(), match.end()),
                intranet_only=False,
            )

        # Record each distinct URL once per institution.
        urls = institutions[zcbs_id]["urls"]
        if all(entry["url"] != url for entry in urls):
            urls.append({"url": url, "type": classify_url_type(url)})

    # Entries without a link: "<li> Name (#NNN; voorlopig nog alleen op
    # intranet)". Create a record for those not already known.
    intranet_entry = re.compile(
        r'<li>\s*([^<\(]+)\s*\(#(\d+);?\s*voorlopig nog alleen op intranet\)',
        re.IGNORECASE,
    )
    for match in intranet_entry.finditer(html_content):
        zcbs_id = f"#{match.group(2)}"
        if zcbs_id not in institutions:
            institutions[zcbs_id] = _new_institution(
                zcbs_id,
                match.group(1).strip(),
                _detect_country(html_content, match.start(), match.end()),
                intranet_only=True,
            )

    # Flag every known institution whose ID carries the intranet remark,
    # wherever that remark occurs in the page.
    intranet_marker = re.compile(
        r'\(#(\d+);?\s*voorlopig nog alleen op intranet\)',
        re.IGNORECASE,
    )
    for match in intranet_marker.finditer(html_content):
        zcbs_id = f"#{match.group(1)}"
        if zcbs_id in institutions:
            institutions[zcbs_id]["intranet_only"] = True

    return list(institutions.values())
|
|
|
|
|
|
# Keyword -> collection type. Lookup is by substring and the first hit wins,
# so the order of the entries is significant.
_URL_TYPE_KEYWORDS = {
    "objecten": "objects",
    "voorwerpen": "objects",
    "collectie": "objects",
    "museum": "objects",
    "beeldbank": "photos",
    "foto": "photos",
    "fotos": "photos",
    "library": "library",
    "boeken": "library",
    "bibliotheek": "library",
    "archeo": "archaeology",
    "bidprent": "bidprentjes",
    "boerderij": "farms",
    "graf": "graves",
    "graven": "graves",
    "knipsels": "clippings",
    "advertent": "clippings",
    "artikelen": "clippings",
    "kranten": "clippings",
    "document": "documents",
    "archief": "archives",
    "periodiek": "periodicals",
    "magazine": "periodicals",
    "kwartaalblad": "periodicals",
    "tijdschrift": "periodicals",
    "bevolking": "population_registers",
    "film": "films",
    "video": "films",
    "audio": "audio",
    "dialect": "audio",
    "huizen": "buildings",
    "gebouw": "buildings",
    "monument": "buildings",
    "familie": "family_notices",
    "rouw": "family_notices",
    "trouw": "family_notices",
    "geboorte": "family_notices",
}


def classify_url_type(url: str) -> str:
    """Classify URL by collection type based on CGI script name.

    The script name (last path segment, without query string or fragment) is
    checked first, so a keyword that merely occurs in the host name or in a
    query parameter - e.g. "museum" in ``www.zijpermuseum.nl`` - cannot
    override the script. Only when the script name matches no keyword is the
    complete URL searched.

    Args:
        url: Collection URL; matching is case-insensitive.

    Returns:
        The collection type for the first matching keyword, or ``"unknown"``
        when no keyword occurs anywhere in the URL.
    """
    url_lower = url.lower()

    # Strip "?query" and "#fragment", then keep what follows the last "/".
    without_query = url_lower.partition('?')[0].partition('#')[0]
    script_name = without_query.rsplit('/', 1)[-1]

    for candidate in (script_name, url_lower):
        for pattern, coll_type in _URL_TYPE_KEYWORDS.items():
            if pattern in candidate:
                return coll_type

    return "unknown"
|
|
|
|
|
|
def main():
    """Main entry point: run the parser on a small embedded sample.

    Prints the readiness banner, the number of institutions parsed from the
    sample HTML, and the parsed records as JSON.
    """
    # Two sample list entries in the "<li> <a href=...>Name</a> (#NNN)" form
    # that parse_zcbs_html() matches.
    sample_html = """
    <li> <a href="http://www.zijpermuseum.nl/cgi-bin/objecten.pl" target="_ZCBS">Zijper Museum, Schagerbrug</a> (#001)
    <li> <a href="http://www.geheugenvanbroekinwaterland.nl/cgi-bin/objecten.pl" target="_ZCBS">Historische Vereniging Oud Broek in Waterland</a> (#002)
    """

    institutions = parse_zcbs_html(sample_html)

    print("ZCBS Parser ready. Use parse_zcbs_html() with HTML content.")
    # Report what was actually parsed from the sample.
    print(f"Sample entries parsed: {len(institutions)}")
    print(json.dumps(institutions, ensure_ascii=False, indent=2))
|
|
|
|
|
|
# Run the entry point only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|