#!/usr/bin/env python3
"""
Extract structured contact data from Dutch heritage institution contact pages.

This script:
1. Finds all contact pages in web archives (contact_index.html, etc.)
2. Extracts person-role-contact relationships using pattern matching
3. Adds web_contact_data section to custodian YAML files

Data extracted:
- Board members (bestuur) with roles (voorzitter, secretaris, penningmeester)
- Staff members (medewerkers) with functions
- Contact information (address, phone, email, RSIN)
- Postal and physical addresses

Usage:
    python scripts/extract_contact_page_data.py [--dry-run] [--verbose]
        [--limit N] [--file PATH] [--custodian-dir DIR]
"""

import argparse
import glob
import os
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Optional

# NOTE: PyYAML is imported inside process_custodian_file(), the only place
# that needs it, so the pure-text extraction helpers remain importable
# without the dependency.

# Default root directory holding the custodian YAML files and their web
# archives. Archive paths stored in the YAML are resolved relative to it.
CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')


# ============================================================================
# DATA CLASSES FOR STRUCTURED CONTACT DATA
# ============================================================================

@dataclass
class PersonContact:
    """A person with role and contact information."""
    name: str
    role: Optional[str] = None
    role_type: Optional[str] = None  # BOARD, STAFF, VOLUNTEER, EDITORIAL, ADMIN
    street_address: Optional[str] = None
    postal_code: Optional[str] = None
    city: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None


@dataclass
class OrganizationContact:
    """Organization-level contact information."""
    postal_address: Optional[str] = None
    postal_code: Optional[str] = None
    postal_city: Optional[str] = None
    physical_address: Optional[str] = None
    physical_postal_code: Optional[str] = None
    physical_city: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    rsin: Optional[str] = None
    kvk: Optional[str] = None
    iban: Optional[str] = None


@dataclass
class ContactPageData:
    """All data extracted from a contact page."""
    persons: list[PersonContact] = field(default_factory=list)
    organization: Optional[OrganizationContact] = None
    source_file: Optional[str] = None
    source_url: Optional[str] = None
    extraction_date: Optional[str] = None


# ============================================================================
# HTML TEXT EXTRACTION
# ============================================================================

# Tags that start a new line of text when opened / when closed.
_BLOCK_START_TAGS = frozenset(
    {'p', 'div', 'br', 'li', 'tr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
)
_BLOCK_END_TAGS = frozenset({'p', 'div', 'li', 'tr'})


class HTMLTextExtractor(HTMLParser):
    """Extract visible text from HTML.

    Text inside ``script``, ``style``, ``head`` and ``noscript`` elements is
    dropped. Nesting is tracked with a depth counter so that closing an inner
    element (e.g. ``</script>`` inside ``<head>``) does not re-enable output
    while an outer skipped element is still open.
    """

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.skip_tags = {'script', 'style', 'head', 'noscript'}
        self._skip_depth = 0

    @property
    def skipping(self) -> bool:
        """True while the parser is inside at least one skipped element."""
        return self._skip_depth > 0

    def handle_starttag(self, tag, attrs):
        if tag == 'body':
            # A <body> start tag implicitly ends <head>; resetting here keeps
            # pages that omit </head> from being skipped entirely.
            self._skip_depth = 0
        elif tag in self.skip_tags:
            self._skip_depth += 1
        # Add newline for block elements
        if tag in _BLOCK_START_TAGS:
            self.text_parts.append('\n')

    def handle_endtag(self, tag):
        if tag in self.skip_tags and self._skip_depth > 0:
            self._skip_depth -= 1
        if tag in _BLOCK_END_TAGS:
            self.text_parts.append('\n')

    def handle_data(self, data):
        if self.skipping:
            return
        text = data.strip()
        if text:
            self.text_parts.append(text)
            self.text_parts.append(' ')

    def get_text(self) -> str:
        """Return everything collected so far as a single string."""
        return ''.join(self.text_parts)


def extract_text_from_html(html_content: str) -> str:
    """Extract visible text from HTML content.

    Falls back to returning ``html_content`` unchanged if the parser raises
    (deliberate best-effort: downstream pattern matching can still find some
    data in raw markup).
    """
    parser = HTMLTextExtractor()
    try:
        parser.feed(html_content)
        parser.close()  # flush any data still buffered by the parser
        return parser.get_text()
    except Exception:
        return html_content


# ============================================================================
# DUTCH CONTACT PAGE PATTERNS
# ============================================================================

# Role keywords in Dutch and English
ROLE_PATTERNS = {
    'BOARD': [
        r'\bvoorzitter\b',
        r'\bsecretaris\b',
        r'\bpenningmeester\b',
        r'\bbestuurslid\b',
        r'\bvice[-\s]?voorzitter\b',
        r'\bchairman\b',
        r'\bpresident\b',
        r'\bsecretary\b',
        r'\btreasurer\b',
        r'\bboard\s+member\b',
    ],
    'STAFF': [
        r'\bmedewerk(?:er|ster)\b',
        r'\bcoördinator\b',
        r'\bcoordinator\b',
        r'\bbeheerder\b',
        r'\bcurator\b',
        r'\bconservator\b',
        r'\barchivaris\b',
        r'\bdirecteur\b',
        r'\bdirector\b',
        r'\bmanager\b',
    ],
    'VOLUNTEER': [
        r'\bvrijwilliger\b',
        r'\bvolunteer\b',
    ],
    'EDITORIAL': [
        r'\bredactie\b',
        r'\bredacteur\b',
        r'\beditor\b',
        r'\beditorial\b',
    ],
    'ADMIN': [
        r'\bledenadministratie\b',
        r'\badministratie\b',
        r'\badministration\b',
        r'\bmembership\b',
    ],
}

# Dutch postal code pattern: 4 digits + space + 2 letters
POSTAL_CODE_PATTERN = r'\b(\d{4}\s*[A-Z]{2})\b'

# Phone patterns (Dutch format)
PHONE_PATTERNS = [
    r'tel\.?:?\s*([\d\s\-\(\)]+)',
    r'telefoon:?\s*([\d\s\-\(\)]+)',
    r'phone:?\s*([\d\s\-\(\)]+)',
    r'(\d{4}[-\s]?\d{6})',  # 0226-451592
    r'(\d{3}[-\s]?\d{7})',  # 020-1234567
    r'(\+31[\d\s\-]+)',  # +31 format
]

# Email pattern
EMAIL_PATTERN = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

# RSIN pattern (Dutch fiscal number)
RSIN_PATTERN = r'RSIN[:\s\-]*(\d{4}[\.\s]?\d{2}[\.\s]?\d{3}|\d{9})'

# KvK pattern (Dutch Chamber of Commerce)
KVK_PATTERN = r'(?:KvK|Kvk|kvk|Kamer\s+van\s+Koophandel)[:\s\-]*(\d{8})'

# IBAN pattern
IBAN_PATTERN = r'\b(NL\d{2}\s*[A-Z]{4}\s*[\d\s]{10,18})\b'

# Section headers indicating person lists
SECTION_HEADERS = [
    r'(?:Het\s+)?(?:dagelijks[e]?\s+)?bestuur(?:\s+bestaat\s+uit)?:?',
    r'Overige?\s+bestuurslede?n:?',
    r'Overige?\s+medewerk(?:er|st)e?rs?:?',
    r'Vrijwilligers?:?',
    r'Redactie(?:\s+magazine)?:?',
    r'Ledenadministratie:?',
    r'Contactpersonen?:?',
    r'Staff\s*:?',
    r'Team\s*:?',
    r'Board(?:\s+members)?:?',
]

# --- Precompiled forms of the patterns above (built once at import) ---------

# "Name, role" matcher for every known role keyword, in ROLE_PATTERNS order.
_INLINE_ROLE_REGEXES = [
    (
        role_type,
        re.compile(
            rf'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s*({pattern})',
            re.IGNORECASE,
        ),
    )
    for role_type, patterns in ROLE_PATTERNS.items()
    for pattern in patterns
]

# Bare role keywords, used to tell "Name, City" apart from "Name, Role".
_ROLE_KEYWORD_REGEXES = [
    re.compile(pattern, re.IGNORECASE)
    for patterns in ROLE_PATTERNS.values()
    for pattern in patterns
]

_SECTION_HEADER_REGEXES = [
    re.compile(header, re.IGNORECASE) for header in SECTION_HEADERS
]

# Substring -> role type, checked in order against a lower-cased header line.
_HEADER_ROLE_KEYWORDS = (
    ('bestuur', 'BOARD'),
    ('medewerk', 'STAFF'),
    ('vrijwillig', 'VOLUNTEER'),
    ('redactie', 'EDITORIAL'),
    ('ledenadmin', 'ADMIN'),
    ('administratie', 'ADMIN'),
    ('board', 'BOARD'),
    ('staff', 'STAFF'),
)

# Capitalised words/phrases that match the simple-name shape but are not
# personal names (form labels, navigation, etc.).
_NON_NAMES = frozenset({
    # Navigation elements
    'Contact', 'Bestuur', 'Team', 'Staff', 'Redactie', 'Informatie',
    'Over', 'Home', 'Menu', 'Start', 'Links', 'Artikelen', 'Beeldbank',
    'Magazines',
    # Form field labels (common in contact forms)
    'Naam', 'Name', 'Bericht', 'Message', 'Indienen', 'Submit',
    'Verzenden', 'Send', 'Email', 'Telefoon', 'Phone', 'Adres', 'Address',
    'Onderwerp', 'Subject', 'Captcha', 'Voornaam', 'Achternaam', 'Mobiel',
    'Verstuur',
    # Other non-names
    'Copyright', 'Privacy', 'Disclaimer', 'Cookies', 'Lees', 'Meer',
    'Terug', 'Back', 'Volgende', 'Next', 'Facebook', 'Twitter',
    'Instagram', 'LinkedIn', 'YouTube',
    # Common Dutch words that might match pattern
    'Donateur', 'Worden', 'Stichting', 'Vereniging', 'Folder', 'Educatie',
    'Nieuwsbrief', 'Projecten', 'Openingstijden', 'Entreeprijzen',
    'Museumwinkel', 'Toegankelijkheid', 'Privacybeleid',
    # Section headers that aren't names
    'Beleidsplan', 'Jaarverslag', 'Werkgroepen', 'Bezoekersinformatie',
    'Collectie',
})

# Longer phrases that are clearly not names.
_NON_NAME_REGEXES = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'^Bestuur\s+',
        r'^Vrienden\s+',
        r'^Werkgroep',
        r'^Platform\s+',
        r'^Project',
        r'^Museum',
        r'^Geschiedenis',
        r'^Algemene\s+',
        r'^Inrichting',
        r'^Speciaal\s+',
        r'^Sociale\s+',
        r'^Design\s+',
        r'^Digitalisering',
        r'^Herdenkings',
        r'^Restauratie',
        r'^Monument\s+',
        r'molen\b',
        r'^Toegankelijkheid',
        r'^Privacy\b',
        r'^Disclaimer\b',
        r'Policy\b',
        r'^Copyright\b',
        r'^Terms\b',
        r'^Cookies?\b',
        r'^GDPR\b',
        r'^AVG\b',
    )
]


# ============================================================================
# PERSON EXTRACTION FROM CONTACT PAGE TEXT
# ============================================================================

def extract_person_with_role_inline(line: str) -> Optional[PersonContact]:
    """
    Extract person from lines like:
    - "Ella Molenaar, voorzitter"
    - "Koos Bijvoet, secretaris"

    Returns the first match in ROLE_PATTERNS order, with the role
    lower-cased, or None when the line is not "Name, <known role>".
    """
    for role_type, regex in _INLINE_ROLE_REGEXES:
        match = regex.search(line)
        if match:
            return PersonContact(
                name=match.group(1).strip(),
                role=match.group(2).strip().lower(),
                role_type=role_type,
            )
    return None


def extract_person_with_address(line: str) -> Optional[PersonContact]:
    """
    Extract person with full contact info from lines like:
    "Afra Oudejans-Ursem, Dorpsstraat 234, 1713 HP Obdam, tel.: 0226-451592"

    The phone part is optional. Returns None when the line does not have
    the "Name, Street Number, PostalCode City" shape.
    """
    # Pattern: Name, Street Number, PostalCode City, tel: Phone
    pattern = (
        r'^([A-Z][a-zA-Z\-]+(?:\s+[A-Za-z\-]+)*),\s*'  # Name
        r'([A-Za-z\s]+\s+\d+[a-zA-Z]?),\s*'  # Street + number
        r'(\d{4}\s*[A-Z]{2})\s+'  # Postal code
        r'([A-Za-z\s\-]+)'  # City
        r'(?:,\s*tel\.?:?\s*([\d\-\s]+))?'  # Optional phone
    )
    match = re.match(pattern, line, re.IGNORECASE)
    if not match:
        return None

    phone = match.group(5)
    return PersonContact(
        name=match.group(1).strip(),
        street_address=match.group(2).strip(),
        # Normalize spacing: collapse any whitespace run inside the postal
        # code (double spaces, non-breaking spaces) to one plain space.
        postal_code=re.sub(r'\s+', ' ', match.group(3).strip()),
        city=match.group(4).strip().rstrip(','),
        phone=phone.strip() if phone else None,
    )


def extract_person_with_location(line: str) -> Optional[PersonContact]:
    """
    Extract person from lines like:
    "Herman Tielen, Obdam"
    "Niek Wever, Obdam"

    Returns None when the part after the comma is a known role keyword or
    is shorter than three characters.
    """
    # Pattern: Name, City (where City starts with capital)
    pattern = r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s+([A-Z][a-z]+(?:[\s\-][A-Za-z]+)*)$'
    match = re.match(pattern, line.strip())
    if not match:
        return None

    name = match.group(1).strip()
    location = match.group(2).strip()

    # Verify this isn't a role (like "voorzitter")
    if any(regex.search(location) for regex in _ROLE_KEYWORD_REGEXES):
        return None
    if len(location) <= 2:
        return None
    return PersonContact(name=name, city=location)


def extract_simple_name(line: str, context_role_type: Optional[str] = None) -> Optional[PersonContact]:
    """
    Extract simple name (just a name, no other info).
    "Ed Groustra"

    The line must consist of 1-4 words starting with a capitalised word.
    Known navigation/form labels, organisation-like phrases and single
    words are rejected. ``context_role_type`` is copied onto the result.
    """
    # Pattern: Just a name (2-4 words, each capitalized)
    pattern = r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+){0,3})$'
    match = re.match(pattern, line.strip())
    if not match:
        return None

    name = match.group(1).strip()
    if name in _NON_NAMES or len(name) <= 3:
        return None
    if any(regex.search(name) for regex in _NON_NAME_REGEXES):
        return None
    # Filter out names that look like organization names
    if name.startswith(('Stichting ', 'Vereniging ')):
        return None
    # A single word could be a personal name but is far more often a menu
    # item, so it is rejected (heuristic - may need refinement).
    if len(name.split()) == 1:
        return None

    return PersonContact(name=name, role_type=context_role_type)


def extract_organization_contact(text: str) -> OrganizationContact:
    """Extract organization-level contact information.

    Fills RSIN, KvK, IBAN and the postal / physical address blocks when the
    corresponding patterns are found in ``text``.

    NOTE(review): ``phone`` and ``email`` are never populated here even
    though PHONE_PATTERNS and EMAIL_PATTERN are defined - confirm whether
    organization-level phone/email extraction is still wanted.
    """
    org = OrganizationContact()

    # Extract RSIN
    rsin_match = re.search(RSIN_PATTERN, text, re.IGNORECASE)
    if rsin_match:
        org.rsin = rsin_match.group(1).replace('.', '').replace(' ', '')

    # Extract KvK
    kvk_match = re.search(KVK_PATTERN, text, re.IGNORECASE)
    if kvk_match:
        org.kvk = kvk_match.group(1)

    # Extract IBAN
    iban_match = re.search(IBAN_PATTERN, text)
    if iban_match:
        org.iban = iban_match.group(1).replace(' ', '')

    # Extract postal address block
    # Look for "Postadres:" section
    # City should be a single word or hyphenated word, not multiple lines
    postadres_match = re.search(
        r'Postadres:?\s*\n?\s*(?:Stichting\s+[^\n]+\s*\n?)?\s*'
        r'([A-Za-z\s\']+\s+\d+[a-zA-Z]?)\s*\n?\s*'
        r'(\d{4}\s*[A-Z]{2})\s+'
        r'([A-Za-z]+(?:[\-][A-Za-z]+)?)',  # City: single word or hyphenated
        text, re.IGNORECASE
    )
    if postadres_match:
        org.postal_address = postadres_match.group(1).strip()
        org.postal_code = postadres_match.group(2).strip()
        org.postal_city = postadres_match.group(3).strip()

    # Extract building/physical address
    # Look for "Gebouw:" or physical address section
    gebouw_match = re.search(
        r'(?:Gebouw|Bezoekadres|Locatie):?\s*\n?\s*(?:[^\n]*\n)?\s*'
        r'([A-Za-z\s\']+\s+\d+[a-zA-Z]?)\s*\n?\s*'
        r'(\d{4}\s*[A-Z]{2})\s+'
        r'([A-Za-z]+(?:[\-][A-Za-z]+)?)',  # City: single word or hyphenated
        text, re.IGNORECASE
    )
    if gebouw_match:
        org.physical_address = gebouw_match.group(1).strip()
        org.physical_postal_code = gebouw_match.group(2).strip()
        org.physical_city = gebouw_match.group(3).strip()

    return org


def extract_persons_from_text(text: str) -> list[PersonContact]:
    """Extract all persons from contact page text.

    The text is processed line by line. Section headers (SECTION_HEADERS)
    set the role type applied to the persons that follow; a line that is
    nothing but a header is never itself treated as a person. Bare names
    are only accepted once a section with a role type has been seen.
    """
    persons = []
    current_role_type = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Check if this is a section header
        if any(regex.match(line) for regex in _SECTION_HEADER_REGEXES):
            lowered = line.lower()
            for keyword, role_type in _HEADER_ROLE_KEYWORDS:
                if keyword in lowered:
                    current_role_type = role_type
                    break
            # Headers without a keyword (e.g. "Team", "Contactpersonen")
            # leave the current role type unchanged.

            # A pure header line ("Overige bestuursleden", "Het bestuur")
            # has the same shape as a name; skip it so it is not emitted as
            # a person. Lines that merely start with a header still go
            # through the extractors below.
            if any(regex.fullmatch(line) for regex in _SECTION_HEADER_REGEXES):
                continue

        # Try different extraction patterns
        person = extract_person_with_role_inline(line)
        if person:
            if not person.role_type and current_role_type:
                person.role_type = current_role_type
            persons.append(person)
            continue

        person = extract_person_with_address(line)
        if person:
            person.role_type = current_role_type or 'BOARD'  # Assume board if has address
            persons.append(person)
            continue

        person = extract_person_with_location(line)
        if person:
            person.role_type = current_role_type
            persons.append(person)
            continue

        # Only extract simple names if we're in a known section
        if current_role_type:
            person = extract_simple_name(line, current_role_type)
            if person:
                persons.append(person)

    return persons


def extract_contact_page_data(html_path: str) -> Optional[ContactPageData]:
    """Extract all contact data from an HTML file.

    Returns None when the file cannot be read or when neither a person nor
    any organization field (RSIN, KvK, IBAN, addresses, ...) was found.
    """
    try:
        with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except Exception as e:
        print(f"  Error reading {html_path}: {e}")
        return None

    text = extract_text_from_html(html_content)

    persons = extract_persons_from_text(text)
    organization = extract_organization_contact(text)

    # Only return if we found something: any person or any organization
    # field, not just RSIN/KvK, so address-only pages are kept.
    if not persons and not any(asdict(organization).values()):
        return None

    return ContactPageData(
        persons=persons,
        organization=organization,
        source_file=html_path,
        extraction_date=datetime.now(timezone.utc).isoformat()
    )


# ============================================================================
# CUSTODIAN FILE PROCESSING
# ============================================================================

def find_contact_pages(web_archive_dir: str) -> list[str]:
    """Find all contact pages in a web archive directory.

    Looks in ``<web_archive_dir>/pages`` and returns a sorted list without
    duplicates (the patterns can overlap where matching is
    case-insensitive, and glob order is otherwise arbitrary).
    """
    pages_dir = os.path.join(web_archive_dir, 'pages')
    if not os.path.isdir(pages_dir):
        return []

    found = set()
    # Look for contact pages (various naming patterns)
    for pattern in ('*contact*.html', '*Contact*.html', '*kontakt*.html'):
        found.update(glob.glob(os.path.join(pages_dir, pattern)))

    return sorted(found)


def person_to_dict(person: PersonContact) -> dict[str, Any]:
    """Convert PersonContact to dictionary for YAML output.

    ``name`` is always present; other fields are included only when set.
    """
    d = {'name': person.name}
    for key in ('role', 'role_type', 'street_address', 'postal_code',
                'city', 'phone', 'email'):
        value = getattr(person, key)
        if value:
            d[key] = value
    return d


def org_to_dict(org: OrganizationContact) -> dict[str, Any]:
    """Convert OrganizationContact to dictionary for YAML output.

    Addresses become nested ``street`` / ``postal_code`` / ``city``
    mappings; empty fields are omitted.
    """
    d = {}
    if org.postal_address:
        d['postal_address'] = {
            'street': org.postal_address,
            'postal_code': org.postal_code,
            'city': org.postal_city
        }
    if org.physical_address:
        d['physical_address'] = {
            'street': org.physical_address,
            'postal_code': org.physical_postal_code,
            'city': org.physical_city
        }
    for key in ('rsin', 'kvk', 'iban', 'phone', 'email'):
        value = getattr(org, key)
        if value:
            d[key] = value
    return d


def _merge_organizations(orgs: list[OrganizationContact]) -> OrganizationContact:
    """Merge organization records, keeping the first non-empty value per field."""
    merged = OrganizationContact()
    for org in orgs:
        for key in ('rsin', 'kvk', 'iban', 'phone', 'email'):
            value = getattr(org, key)
            if value and not getattr(merged, key):
                setattr(merged, key, value)
        # Address parts travel together so street/code/city stay consistent.
        if org.postal_address and not merged.postal_address:
            merged.postal_address = org.postal_address
            merged.postal_code = org.postal_code
            merged.postal_city = org.postal_city
        if org.physical_address and not merged.physical_address:
            merged.physical_address = org.physical_address
            merged.physical_postal_code = org.physical_postal_code
            merged.physical_city = org.physical_city
    return merged


def process_custodian_file(
    custodian_path: str,
    dry_run: bool = False,
    verbose: bool = False,
    base_dir: Optional[str] = None
) -> tuple[bool, int, Optional[str]]:
    """
    Process a single custodian file and extract contact page data.

    Args:
        custodian_path: Path of the custodian YAML file to update.
        dry_run: When True, do everything except write the file back.
        verbose: When True, print each contact page as it is processed.
        base_dir: Directory that archive paths in the YAML are relative to;
            defaults to CUSTODIAN_DIR.

    Returns:
        (updated, person_count, error_message)
    """
    import yaml  # local import, see the note at the top of the module

    root = os.fspath(CUSTODIAN_DIR if base_dir is None else base_dir)

    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f)
    except Exception as e:
        return False, 0, f"Error reading file: {e}"

    if not custodian_data:
        return False, 0, "Empty file"
    if not isinstance(custodian_data, dict):
        return False, 0, "Unexpected YAML structure (top level is not a mapping)"

    # Find web archive directory. The `or` fallbacks cover keys that are
    # present but empty (YAML null), which .get(..., default) would not.
    web_enrichment = custodian_data.get('web_enrichment') or {}
    if not isinstance(web_enrichment, dict):
        return False, 0, None
    web_archives = web_enrichment.get('web_archives') or []
    if not web_archives:
        return False, 0, None  # No web archives, not an error

    all_persons = []
    all_orgs = []
    source_files = []

    for archive in web_archives:
        if not isinstance(archive, dict):
            continue
        # Support both 'directory' and 'archive_path' field names
        archive_path = archive.get('directory', '') or archive.get('archive_path', '')
        if not archive_path:
            continue

        # Convert to web archive directory path
        # archive_path is like "web/0787/oudobdam-hensbroek.nl"
        web_dir = os.path.join(root, archive_path)
        if not os.path.isdir(web_dir):
            continue

        # Find contact pages
        for contact_page in find_contact_pages(web_dir):
            if verbose:
                print(f"    Processing: {contact_page}")

            data = extract_contact_page_data(contact_page)
            if data:
                all_persons.extend(data.persons)
                if data.organization:
                    all_orgs.append(data.organization)
                source_files.append(contact_page)

    if not all_persons and not all_orgs:
        return False, 0, None  # No contact data found

    # Deduplicate persons by name (first occurrence wins)
    seen_names = set()
    unique_persons = []
    for person in all_persons:
        if person.name not in seen_names:
            seen_names.add(person.name)
            unique_persons.append(person)

    # Build web_contact_data section
    contact_data = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'contact_page_pattern_matching',
        'source_files': [os.path.relpath(f, root) for f in source_files],
    }

    if unique_persons:
        contact_data['persons'] = [person_to_dict(p) for p in unique_persons]

    # Merge organization data (take first non-empty values)
    org_dict = org_to_dict(_merge_organizations(all_orgs))
    if org_dict:
        contact_data['organization'] = org_dict

    # Update custodian data
    custodian_data['web_contact_data'] = contact_data

    if not dry_run:
        # Serialize before opening the file so a dump error cannot leave a
        # truncated custodian file behind.
        serialized = yaml.dump(
            custodian_data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
        with open(custodian_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return True, len(unique_persons), None


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Parse command-line options, process the custodian files, print a summary."""
    parser = argparse.ArgumentParser(
        description='Extract contact data from Dutch heritage institution contact pages'
    )
    parser.add_argument('--dry-run', action='store_true', help='Do not modify files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--file', type=str, help='Process a single custodian file')
    parser.add_argument(
        '--custodian-dir',
        type=str,
        default=str(CUSTODIAN_DIR),
        help='Directory with custodian YAML files and web archives',
    )
    args = parser.parse_args()

    # Find custodian files
    if args.file:
        custodian_files = [args.file]
    else:
        custodian_files = sorted(
            glob.glob(os.path.join(args.custodian_dir, 'NL-*.yaml'))
        )

    if args.limit:
        custodian_files = custodian_files[:args.limit]

    print(f"Processing {len(custodian_files)} custodian files...")
    if args.dry_run:
        print("  [DRY RUN - no files will be modified]")

    total_updated = 0
    total_persons = 0
    errors = []

    for i, custodian_path in enumerate(custodian_files, 1):
        filename = os.path.basename(custodian_path)

        if args.verbose:
            print(f"\n[{i}/{len(custodian_files)}] {filename}")

        updated, person_count, error = process_custodian_file(
            custodian_path,
            dry_run=args.dry_run,
            verbose=args.verbose,
            base_dir=args.custodian_dir,
        )

        if error:
            errors.append((filename, error))
            if args.verbose:
                print(f"  Error: {error}")
        elif updated:
            total_updated += 1
            total_persons += person_count
            if args.verbose:
                print(f"  Updated: {person_count} persons extracted")
        elif args.verbose:
            print("  Skipped: no contact data found")

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Files processed: {len(custodian_files)}")
    print(f"Files updated:   {total_updated}")
    print(f"Total persons:   {total_persons}")
    print(f"Errors:          {len(errors)}")

    if errors and args.verbose:
        print("\nErrors:")
        for filename, error in errors[:10]:
            print(f"  {filename}: {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more")


if __name__ == '__main__':
    main()