glam/scripts/extract_contact_page_data.py
2025-12-14 17:09:55 +01:00

749 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Extract structured contact data from Dutch heritage institution contact pages.
This script:
1. Finds all contact pages in web archives (contact_index.html, etc.)
2. Extracts person-role-contact relationships using pattern matching
3. Adds web_contact_data section to custodian YAML files
Data extracted:
- Board members (bestuur) with roles (voorzitter, secretaris, penningmeester)
- Staff members (medewerkers) with functions
- Contact information (address, phone, email, RSIN)
- Postal and physical addresses
Usage:
python scripts/extract_contact_page_data.py [--dry-run] [--verbose] [--limit N]
"""
import argparse
import glob
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Optional
import yaml
# ============================================================================
# DATA CLASSES FOR STRUCTURED CONTACT DATA
# ============================================================================
@dataclass
class PersonContact:
    """A person with role and contact information.

    Only ``name`` is required. Every other field defaults to ``None`` and is
    filled in by whichever line-level extractor recognised the person.
    """
    # Person's name as captured by the extractor regexes.
    name: str
    # Role keyword found next to the name (e.g. "voorzitter"); the inline-role
    # extractor stores it lower-cased.
    role: Optional[str] = None
    role_type: Optional[str] = None  # BOARD, STAFF, VOLUNTEER, EDITORIAL, ADMIN
    # Street name plus house number, when the line carried a full address.
    street_address: Optional[str] = None
    # Dutch postal code (four digits followed by two letters).
    postal_code: Optional[str] = None
    city: Optional[str] = None
    phone: Optional[str] = None
    # NOTE(review): no extractor visible in this file sets ``email`` on a person.
    email: Optional[str] = None
@dataclass
class OrganizationContact:
    """Organization-level contact information.

    All fields default to ``None``; extract_organization_contact() fills in
    the ones it can find in the page text.
    """
    # Postal ("Postadres") address: street + number, postal code, city.
    postal_address: Optional[str] = None
    postal_code: Optional[str] = None
    postal_city: Optional[str] = None
    # Visiting address (matched after "Gebouw", "Bezoekadres" or "Locatie").
    physical_address: Optional[str] = None
    physical_postal_code: Optional[str] = None
    physical_city: Optional[str] = None
    # NOTE(review): ``phone`` and ``email`` are emitted by org_to_dict() but are
    # not populated by any extractor visible in this file.
    phone: Optional[str] = None
    email: Optional[str] = None
    # Registration identifiers, stored without separators.
    rsin: Optional[str] = None
    kvk: Optional[str] = None
    iban: Optional[str] = None
@dataclass
class ContactPageData:
    """All data extracted from a contact page."""
    # Persons in the order they were found on the page.
    persons: list[PersonContact] = field(default_factory=list)
    organization: Optional[OrganizationContact] = None
    # Path of the HTML file the data came from.
    source_file: Optional[str] = None
    # NOTE(review): ``source_url`` is never assigned in the visible code.
    source_url: Optional[str] = None
    # ISO-8601 timestamp (UTC) of when the extraction ran.
    extraction_date: Optional[str] = None
# ============================================================================
# HTML TEXT EXTRACTION
# ============================================================================
class HTMLTextExtractor(HTMLParser):
    """Extract visible text from HTML.

    Text inside ``script``, ``style``, ``head`` and ``noscript`` elements is
    dropped. Block-level tags are turned into newlines so that downstream
    line-based pattern matching sees one logical line per block.

    Skipped elements are tracked with a nesting counter rather than a single
    flag: with a flag, the ``</script>`` inside ``<head>`` would re-enable
    output and leak the rest of the head (e.g. the ``<title>``) into the text.
    """

    # Tags that start a new line when opened / closed.
    _BREAK_BEFORE = frozenset(
        {'p', 'div', 'br', 'li', 'tr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
    )
    _BREAK_AFTER = frozenset({'p', 'div', 'li', 'tr'})

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.skip_tags = {'script', 'style', 'head', 'noscript'}
        # Kept as a public boolean for backward compatibility; it mirrors
        # ``_skip_depth > 0``.
        self.skipping = False
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'body':
            # <body> can never be inside a skipped element; resetting here
            # recovers from documents that omit </head>.
            self._skip_depth = 0
        elif tag in self.skip_tags:
            self._skip_depth += 1
        self.skipping = self._skip_depth > 0
        # Add newline for block elements
        if tag in self._BREAK_BEFORE:
            self.text_parts.append('\n')

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            # Stray closing tags must not drive the counter negative.
            self._skip_depth = max(0, self._skip_depth - 1)
            self.skipping = self._skip_depth > 0
        if tag in self._BREAK_AFTER:
            self.text_parts.append('\n')

    def handle_data(self, data):
        if self.skipping:
            return
        text = data.strip()
        if text:
            self.text_parts.append(text)
            # Separator so adjacent inline fragments do not run together.
            self.text_parts.append(' ')

    def get_text(self) -> str:
        """Return everything collected so far as one string."""
        return ''.join(self.text_parts)
def extract_text_from_html(html_content: str) -> str:
    """Extract visible text from HTML content.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The visible text with block elements separated by newlines. If the
        parser raises, the unmodified ``html_content`` is returned so callers
        can still attempt pattern matching on the raw markup.
    """
    parser = HTMLTextExtractor()
    try:
        parser.feed(html_content)
        # feed() may hold back trailing data (for example text that ends in
        # an unterminated "&..." reference); close() forces it through.
        parser.close()
    except Exception:
        # Deliberate best-effort fallback: never fail a page on bad markup.
        return html_content
    return parser.get_text()
# ============================================================================
# DUTCH CONTACT PAGE PATTERNS
# ============================================================================
# Role keywords in Dutch and English, grouped by role category.
# Order matters: consumers iterate categories and patterns in this order and
# stop at the first hit.
ROLE_PATTERNS = {
    'BOARD': [
        r'\bvoorzitter\b',
        r'\bsecretaris\b',
        r'\bpenningmeester\b',
        r'\bbestuurslid\b',
        r'\bvice[-\s]?voorzitter\b',
        r'\bchairman\b',
        r'\bpresident\b',
        r'\bsecretary\b',
        r'\btreasurer\b',
        r'\bboard\s+member\b',
    ],
    'STAFF': [
        r'\bmedewerk(?:er|ster)\b',
        r'\bcoördinator\b',
        r'\bcoordinator\b',
        r'\bbeheerder\b',
        r'\bcurator\b',
        r'\bconservator\b',
        r'\barchivaris\b',
        r'\bdirecteur\b',
        r'\bdirector\b',
        r'\bmanager\b',
    ],
    'VOLUNTEER': [
        r'\bvrijwilliger\b',
        r'\bvolunteer\b',
    ],
    'EDITORIAL': [
        r'\bredactie\b',
        r'\bredacteur\b',
        r'\beditor\b',
        r'\beditorial\b',
    ],
    'ADMIN': [
        r'\bledenadministratie\b',
        r'\badministratie\b',
        r'\badministration\b',
        r'\bmembership\b',
    ],
}
# Dutch postal code pattern: 4 digits + optional whitespace + 2 letters
POSTAL_CODE_PATTERN = r'\b(\d{4}\s*[A-Z]{2})\b'
# Phone patterns (Dutch format)
PHONE_PATTERNS = [
    r'tel\.?:?\s*([\d\s\-\(\)]+)',
    r'telefoon:?\s*([\d\s\-\(\)]+)',
    r'phone:?\s*([\d\s\-\(\)]+)',
    r'(\d{4}[-\s]?\d{6})',  # 0226-451592
    r'(\d{3}[-\s]?\d{7})',  # 020-1234567
    r'(\+31[\d\s\-]+)',  # +31 format
]
# Email pattern
EMAIL_PATTERN = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
# RSIN pattern (Dutch fiscal number)
RSIN_PATTERN = r'RSIN[:\s\-]*(\d{4}[\.\s]?\d{2}[\.\s]?\d{3}|\d{9})'
# KvK pattern (Dutch Chamber of Commerce)
KVK_PATTERN = r'(?:KvK|Kvk|kvk|Kamer\s+van\s+Koophandel)[:\s\-]*(\d{8})'
# IBAN pattern. A Dutch IBAN is "NL" + 2 check digits + 4-letter bank code +
# exactly 10 digits, optionally grouped with whitespace. Counting digits
# (instead of a loose digit/whitespace run) keeps the match from running on
# into a number that happens to follow the IBAN, e.g. a phone number on the
# next line.
IBAN_PATTERN = r'\b(NL\d{2}\s*[A-Z]{4}(?:\s*\d){10})\b'
# Section headers indicating person lists
SECTION_HEADERS = [
    r'(?:Het\s+)?(?:dagelijks[e]?\s+)?bestuur(?:\s+bestaat\s+uit)?:?',
    r'Overige?\s+bestuurslede?n:?',
    # Matches medewerker(s) and medewerkster(s). The earlier form
    # "medewerk(?:er|st)e?rs?" demanded an extra "r" after "er" and therefore
    # never matched the common heading "Overige medewerkers".
    r'Overige?\s+medewerk(?:st)?ers?:?',
    r'Vrijwilligers?:?',
    r'Redactie(?:\s+magazine)?:?',
    r'Ledenadministratie:?',
    r'Contactpersonen?:?',
    r'Staff\s*:?',
    r'Team\s*:?',
    r'Board(?:\s+members)?:?',
]
# ============================================================================
# PERSON EXTRACTION FROM CONTACT PAGE TEXT
# ============================================================================
def extract_person_with_role_inline(line: str) -> Optional[PersonContact]:
    """
    Recognise a "Name, role" line where the role is one of the known keywords.

    Examples of lines that are recognised:
    - "Ella Molenaar, voorzitter"
    - "Koos Bijvoet, secretaris"

    Role categories and their keywords are tried in ROLE_PATTERNS order and the
    first keyword that directly follows the comma wins. Matching is
    case-insensitive; the stored role text is lower-cased.

    Returns None when the line does not have this shape.
    """
    keyword_pairs = (
        (category, keyword)
        for category, keywords in ROLE_PATTERNS.items()
        for keyword in keywords
    )
    for category, keyword in keyword_pairs:
        # The expression is anchored with ^, so search() behaves like a match
        # at the start of the line.
        found = re.search(rf'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s*({keyword})', line, re.IGNORECASE)
        if found is None:
            continue
        person_name = found.group(1).strip()
        role_text = found.group(2).strip().lower()
        return PersonContact(name=person_name, role=role_text, role_type=category)
    return None
def extract_person_with_address(line: str) -> Optional[PersonContact]:
    """
    Extract person with full contact info from lines like:
    "Afra Oudejans-Ursem, Dorpsstraat 234, 1713 HP Obdam, tel.: 0226-451592"

    The postal code is returned in canonical "1234 AB" form regardless of how
    it was spaced or cased in the source line, so that "1713HP", "1713  HP"
    and "1713 hp" all compare equal. The phone number is optional.

    Returns None when the line does not have this shape.
    """
    # Pattern: Name, Street Number, PostalCode City, tel: Phone
    pattern = (
        r'^([A-Z][a-zA-Z\-]+(?:\s+[A-Za-z\-]+)*),\s*'  # Name
        r'([A-Za-z\s]+\s+\d+[a-zA-Z]?),\s*'  # Street + number
        r'(\d{4}\s*[A-Z]{2})\s+'  # Postal code
        r'([A-Za-z\s\-]+)'  # City
        r'(?:,\s*tel\.?:?\s*([\d\-\s]+))?'  # Optional phone
    )
    match = re.match(pattern, line, re.IGNORECASE)
    if not match:
        return None

    name, street, raw_postal_code, city, phone = match.groups()

    # Drop every kind of whitespace (including non-breaking spaces), then
    # re-insert exactly one space between the digits and the letters.
    compact = ''.join(raw_postal_code.split()).upper()
    postal_code = f'{compact[:4]} {compact[4:]}'

    return PersonContact(
        name=name.strip(),
        street_address=street.strip(),
        postal_code=postal_code,
        city=city.strip().rstrip(','),
        # The phone group may consist of whitespace only; treat that as absent.
        phone=(phone.strip() or None) if phone else None,
    )
def extract_person_with_location(line: str) -> Optional[PersonContact]:
    """
    Recognise a "Name, Place" line such as:
    "Herman Tielen, Obdam"
    "Niek Wever, Obdam"

    The part after the comma must start with a capital letter, must not contain
    any known role keyword, and must be longer than two characters.

    Returns None when the line does not have this shape.
    """
    found = re.match(
        r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s+([A-Z][a-z]+(?:[\s\-][A-Za-z]+)*)$',
        line.strip(),
    )
    if found is None:
        return None

    person_name = found.group(1).strip()
    place = found.group(2).strip()

    # A capitalised role ("Voorzitter") would otherwise pass for a place name.
    mentions_role = any(
        re.search(keyword, place, re.IGNORECASE)
        for keywords in ROLE_PATTERNS.values()
        for keyword in keywords
    )
    if mentions_role or len(place) <= 2:
        return None

    return PersonContact(name=person_name, city=place)
# Words that show up as navigation items, form labels or generic headings.
# A candidate whose FIRST word is one of these is not a personal name.
_NON_NAME_WORDS = frozenset({
    # Navigation elements
    'Contact', 'Bestuur', 'Team', 'Staff', 'Redactie',
    'Informatie', 'Over', 'Home', 'Menu', 'Start', 'Links',
    'Artikelen', 'Beeldbank', 'Magazines',
    # Form field labels (common in contact forms)
    'Naam', 'Name', 'Bericht', 'Message', 'Indienen', 'Submit',
    'Verzenden', 'Send', 'Email', 'Telefoon', 'Phone', 'Adres',
    'Address', 'Onderwerp', 'Subject', 'Captcha',
    'Voornaam', 'Achternaam', 'Mobiel', 'Verstuur',
    # Other non-names
    'Copyright', 'Privacy', 'Disclaimer', 'Cookies',
    'Lees', 'Meer', 'Terug', 'Back', 'Volgende', 'Next',
    'Facebook', 'Twitter', 'Instagram', 'LinkedIn', 'YouTube',
    # Common Dutch words that might match pattern
    'Donateur', 'Worden', 'Stichting', 'Vereniging',
    'Folder', 'Educatie', 'Nieuwsbrief', 'Projecten',
    'Openingstijden', 'Entreeprijzen', 'Museumwinkel',
    'Toegankelijkheid', 'Privacybeleid',
    # Section headers that aren't names
    'Beleidsplan', 'Jaarverslag', 'Werkgroepen',
    'Bezoekersinformatie', 'Collectie',
})

# Case-insensitive patterns for longer phrases that are clearly not names.
_NON_NAME_PATTERNS = (
    r'^Bestuur\s+', r'^Vrienden\s+', r'^Werkgroep', r'^Platform\s+',
    r'^Project', r'^Museum', r'^Geschiedenis', r'^Algemene\s+',
    r'^Inrichting', r'^Speciaal\s+', r'^Sociale\s+', r'^Design\s+',
    r'^Digitalisering', r'^Herdenkings', r'^Restauratie',
    r'^Monument\s+', r'molen\b', r'^Toegankelijkheid',
    r'^Privacy\b', r'^Disclaimer\b', r'Policy\b', r'^Copyright\b',
    r'^Terms\b', r'^Cookies?\b', r'^GDPR\b', r'^AVG\b',
)


def extract_simple_name(line: str, context_role_type: Optional[str] = None) -> Optional[PersonContact]:
    """
    Extract simple name (just a name, no other info).
    "Ed Groustra"

    A candidate is accepted when the stripped line consists of two to four
    words, the first one capitalised, and it does not look like a menu item,
    form label, heading or organisation name.

    Args:
        line: One line of page text.
        context_role_type: Role category of the section the line appears in;
            copied to the returned person unchanged.

    Returns:
        A PersonContact carrying only the name and role type, or None.
    """
    # One capitalised word followed by up to three more words.
    match = re.match(r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+){0,3})$', line.strip())
    if not match:
        return None

    name = match.group(1).strip()
    words = name.split()

    # A single word is far more likely a menu item than a person.
    if len(words) < 2:
        return None

    # Compare the first word, not the whole candidate: the whole-string check
    # could only ever hit single words, which are rejected above anyway, so
    # labels such as "Lees meer" or "Contact opnemen" slipped through.
    # "Stichting ..." / "Vereniging ..." organisation names are covered here too.
    if words[0] in _NON_NAME_WORDS:
        return None

    if any(re.search(pattern, name, re.IGNORECASE) for pattern in _NON_NAME_PATTERNS):
        return None

    return PersonContact(name=name, role_type=context_role_type)
def extract_organization_contact(text: str) -> OrganizationContact:
    """Extract organization-level contact information.

    Looks for RSIN, KvK number, IBAN, a postal address ("Postadres") and a
    visiting address ("Gebouw" / "Bezoekadres" / "Locatie") in ``text``.

    Args:
        text: Visible text of a contact page, block elements separated by
            newlines.

    Returns:
        An OrganizationContact; fields that were not found stay ``None``.
    """
    org = OrganizationContact()

    # Extract RSIN. The pattern allows a dot or ANY whitespace character
    # between the digit groups, so strip all of them, not just ' '.
    rsin_match = re.search(RSIN_PATTERN, text, re.IGNORECASE)
    if rsin_match:
        org.rsin = re.sub(r'[.\s]', '', rsin_match.group(1))

    # Extract KvK
    kvk_match = re.search(KVK_PATTERN, text, re.IGNORECASE)
    if kvk_match:
        org.kvk = kvk_match.group(1)

    # Extract IBAN. Remove every whitespace character (the match may span a
    # line break) and cut to 18 characters, the fixed length of a Dutch IBAN,
    # in case the pattern ran on into digits that follow the account number.
    iban_match = re.search(IBAN_PATTERN, text)
    if iban_match:
        org.iban = re.sub(r'\s', '', iban_match.group(1))[:18]

    # Extract postal address block
    # Look for "Postadres:" section
    # City should be a single word or hyphenated word, not multiple lines
    postadres_match = re.search(
        r'Postadres:?\s*\n?\s*(?:Stichting\s+[^\n]+\s*\n?)?\s*'
        r'([A-Za-z\s\']+\s+\d+[a-zA-Z]?)\s*\n?\s*'
        r'(\d{4}\s*[A-Z]{2})\s+'
        r'([A-Za-z]+(?:[\-][A-Za-z]+)?)',  # City: single word or hyphenated
        text, re.IGNORECASE
    )
    if postadres_match:
        org.postal_address = postadres_match.group(1).strip()
        org.postal_code = postadres_match.group(2).strip()
        org.postal_city = postadres_match.group(3).strip()

    # Extract building/physical address
    # Look for "Gebouw:" or physical address section
    gebouw_match = re.search(
        r'(?:Gebouw|Bezoekadres|Locatie):?\s*\n?\s*(?:[^\n]*\n)?\s*'
        r'([A-Za-z\s\']+\s+\d+[a-zA-Z]?)\s*\n?\s*'
        r'(\d{4}\s*[A-Z]{2})\s+'
        r'([A-Za-z]+(?:[\-][A-Za-z]+)?)',  # City: single word or hyphenated
        text, re.IGNORECASE
    )
    if gebouw_match:
        org.physical_address = gebouw_match.group(1).strip()
        org.physical_postal_code = gebouw_match.group(2).strip()
        org.physical_city = gebouw_match.group(3).strip()

    return org
def extract_persons_from_text(text: str) -> list[PersonContact]:
    """Extract all persons from contact page text.

    The text is processed line by line. Section headers (SECTION_HEADERS)
    establish the role category for the lines that follow. Each line is then
    tried against the extractors from most to least specific:
    "Name, role", "Name, street, postal code city", "Name, City" and, only
    inside a recognised section, a bare name.

    Args:
        text: Visible page text with one logical line per block element.

    Returns:
        Persons in document order (duplicates are not removed here).
    """
    # Keyword found in a header line -> role category. The first keyword that
    # occurs wins, so the Dutch keywords keep precedence over the English ones.
    header_role_keywords = (
        ('bestuur', 'BOARD'),
        ('medewerk', 'STAFF'),
        ('vrijwillig', 'VOLUNTEER'),
        ('redactie', 'EDITORIAL'),
        ('ledenadmin', 'ADMIN'),
        ('administratie', 'ADMIN'),
        # English headers are listed in SECTION_HEADERS and need a role too.
        ('board', 'BOARD'),
        ('staff', 'STAFF'),
        ('team', 'STAFF'),
    )

    persons: list[PersonContact] = []
    current_role_type: Optional[str] = None
    in_section = False

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Check if this is a section header
        is_header = any(
            re.match(header, line, re.IGNORECASE) for header in SECTION_HEADERS
        )
        if is_header:
            lowered = line.lower()
            # A header without a known role keyword (e.g. "Contactpersonen")
            # resets the role instead of inheriting the previous section's.
            current_role_type = next(
                (role for keyword, role in header_role_keywords if keyword in lowered),
                None,
            )
            in_section = True

        # Try different extraction patterns, most specific first.
        person = extract_person_with_role_inline(line)
        if person:
            if not person.role_type and current_role_type:
                person.role_type = current_role_type
            persons.append(person)
            continue

        person = extract_person_with_address(line)
        if person:
            person.role_type = current_role_type or 'BOARD'  # Assume board if has address
            persons.append(person)
            continue

        person = extract_person_with_location(line)
        if person:
            person.role_type = current_role_type
            persons.append(person)
            continue

        # Only extract simple names if we're in a known section. The header
        # line itself is excluded: a heading such as "Overige bestuursleden"
        # has the same shape as a two-word name.
        if in_section and not is_header:
            person = extract_simple_name(line, current_role_type)
            if person:
                persons.append(person)

    return persons
def extract_contact_page_data(html_path: str) -> Optional[ContactPageData]:
    """Extract all contact data from an HTML file.

    Args:
        html_path: Path of the archived HTML page. Undecodable bytes are
            replaced rather than treated as an error.

    Returns:
        A ContactPageData with the persons and organization details found, or
        None when the file cannot be read or contains nothing usable.
    """
    try:
        with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except (OSError, ValueError) as e:
        # ValueError covers malformed paths (e.g. an embedded NUL byte).
        print(f" Error reading {html_path}: {e}")
        return None

    text = extract_text_from_html(html_content)
    persons = extract_persons_from_text(text)
    organization = extract_organization_contact(text)

    # Only return if we found something. Every organization field counts:
    # a page that lists just an address or an IBAN is still worth keeping.
    found_org_data = any((
        organization.rsin,
        organization.kvk,
        organization.iban,
        organization.postal_address,
        organization.physical_address,
        organization.phone,
        organization.email,
    ))
    if not persons and not found_org_data:
        return None

    return ContactPageData(
        persons=persons,
        organization=organization,
        source_file=html_path,
        extraction_date=datetime.now(timezone.utc).isoformat()
    )
# ============================================================================
# CUSTODIAN FILE PROCESSING
# ============================================================================
def find_contact_pages(web_archive_dir: str) -> list[str]:
    """Find all contact pages in a web archive directory.

    A contact page is any ``*.html`` entry directly inside
    ``<web_archive_dir>/pages`` whose name contains "contact" or "kontakt",
    compared case-insensitively. Hidden entries (leading dot) are ignored.

    Args:
        web_archive_dir: Root directory of one archived website.

    Returns:
        Full paths sorted by file name, each page listed once. Empty when the
        ``pages`` directory is missing or unreadable.
    """
    pages_dir = os.path.join(web_archive_dir, 'pages')
    if not os.path.isdir(pages_dir):
        return []
    try:
        entries = os.listdir(pages_dir)
    except OSError:
        return []

    contact_pages = []
    # One pass over the listing instead of several overlapping glob patterns:
    # a name such as "Contact-contact.html" used to be returned twice, and
    # other casings ("CONTACT.html") were missed. Sorting makes the processing
    # order, and therefore "first value wins" merging, deterministic.
    for entry in sorted(entries):
        if entry.startswith('.'):
            continue
        lowered = entry.lower()
        if lowered.endswith('.html') and ('contact' in lowered or 'kontakt' in lowered):
            contact_pages.append(os.path.join(pages_dir, entry))
    return contact_pages
def person_to_dict(person: PersonContact) -> dict[str, Any]:
    """Turn a PersonContact into a plain dict for YAML output.

    ``name`` is always present; the remaining fields are included only when
    they hold a truthy value, in declaration order.
    """
    optional_fields = (
        'role',
        'role_type',
        'street_address',
        'postal_code',
        'city',
        'phone',
        'email',
    )
    result = {'name': person.name}
    for attr in optional_fields:
        value = getattr(person, attr)
        if value:
            result[attr] = value
    return result
def org_to_dict(org: OrganizationContact) -> dict[str, Any]:
    """Turn an OrganizationContact into a plain dict for YAML output.

    Addresses become nested ``street`` / ``postal_code`` / ``city`` mappings
    and are emitted only when the street part is set. Scalar fields are
    emitted only when truthy. An organization without data yields ``{}``.
    """
    result = {}

    address_groups = (
        ('postal_address', org.postal_address, org.postal_code, org.postal_city),
        ('physical_address', org.physical_address, org.physical_postal_code, org.physical_city),
    )
    for key, street, code, town in address_groups:
        if street:
            result[key] = {'street': street, 'postal_code': code, 'city': town}

    for attr in ('rsin', 'kvk', 'iban', 'phone', 'email'):
        value = getattr(org, attr)
        if value:
            result[attr] = value

    return result
def process_custodian_file(
    custodian_path: str,
    dry_run: bool = False,
    verbose: bool = False
) -> tuple[bool, int, Optional[str]]:
    """
    Process a single custodian file and extract contact page data.

    Reads the custodian YAML, visits every web archive listed under
    ``web_enrichment.web_archives``, extracts contact data from the contact
    pages found there and stores the result under ``web_contact_data``.

    Args:
        custodian_path: Path of the custodian YAML file.
        dry_run: When true, do everything except writing the file back.
        verbose: When true, print each contact page as it is processed.

    Returns: (updated, person_count, error_message)
        ``error_message`` is None unless reading, interpreting or writing the
        file failed; "nothing to do" is reported as ``(False, 0, None)``.
    """
    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f)
    except Exception as e:
        return False, 0, f"Error reading file: {e}"
    if not custodian_data:
        return False, 0, "Empty file"
    if not isinstance(custodian_data, dict):
        return False, 0, "Unexpected YAML structure: top level is not a mapping"

    # Find web archive directory. Keys that are present but empty load as
    # None, so ``or`` is used instead of relying on .get() defaults.
    web_enrichment = custodian_data.get('web_enrichment') or {}
    web_archives = web_enrichment.get('web_archives') if isinstance(web_enrichment, dict) else None
    if not web_archives or not isinstance(web_archives, list):
        return False, 0, None  # No web archives, not an error

    # Archive paths are relative to the custodian data directory. The
    # historical location is tried first; the directory of the custodian file
    # itself is the fallback so the script also works in other checkouts.
    default_base = '/Users/kempersc/apps/glam/data/custodian'
    local_base = os.path.dirname(os.path.abspath(custodian_path))
    base_dirs = [default_base]
    if local_base != default_base:
        base_dirs.append(local_base)

    all_persons = []
    all_orgs = []
    source_files = []
    for archive in web_archives:
        if not isinstance(archive, dict):
            continue
        # Support both 'directory' and 'archive_path' field names
        archive_path = archive.get('directory', '') or archive.get('archive_path', '')
        if not archive_path:
            continue
        # Convert to web archive directory path
        # archive_path is like "web/0787/oudobdam-hensbroek.nl"
        base_dir = next(
            (base for base in base_dirs if os.path.isdir(os.path.join(base, archive_path))),
            None,
        )
        if base_dir is None:
            continue
        web_dir = os.path.join(base_dir, archive_path)

        # Find contact pages
        for contact_page in find_contact_pages(web_dir):
            if verbose:
                print(f" Processing: {contact_page}")
            data = extract_contact_page_data(contact_page)
            if data:
                all_persons.extend(data.persons)
                if data.organization:
                    all_orgs.append(data.organization)
                source_files.append(os.path.relpath(contact_page, base_dir))

    if not all_persons and not all_orgs:
        return False, 0, None  # No contact data found

    # Deduplicate persons by name (first occurrence wins)
    seen_names = set()
    unique_persons = []
    for person in all_persons:
        if person.name not in seen_names:
            seen_names.add(person.name)
            unique_persons.append(person)

    # Build web_contact_data section
    contact_data = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'contact_page_pattern_matching',
        'source_files': source_files,
    }
    if unique_persons:
        contact_data['persons'] = [person_to_dict(p) for p in unique_persons]

    # Merge organization data (take first non-empty values)
    merged_org = OrganizationContact()
    for org in all_orgs:
        for attr in ('rsin', 'kvk', 'iban', 'phone', 'email'):
            if getattr(org, attr) and not getattr(merged_org, attr):
                setattr(merged_org, attr, getattr(org, attr))
        # Address parts are copied together so street, code and city always
        # come from the same page.
        if org.postal_address and not merged_org.postal_address:
            merged_org.postal_address = org.postal_address
            merged_org.postal_code = org.postal_code
            merged_org.postal_city = org.postal_city
        if org.physical_address and not merged_org.physical_address:
            merged_org.physical_address = org.physical_address
            merged_org.physical_postal_code = org.physical_postal_code
            merged_org.physical_city = org.physical_city
    org_dict = org_to_dict(merged_org)
    if org_dict:
        contact_data['organization'] = org_dict

    # Update custodian data
    custodian_data['web_contact_data'] = contact_data
    if not dry_run:
        try:
            # Serialize before opening the file: opening with 'w' truncates
            # it, so a serialization failure must not happen after that.
            serialized = yaml.dump(
                custodian_data,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )
            with open(custodian_path, 'w', encoding='utf-8') as f:
                f.write(serialized)
        except (OSError, yaml.YAMLError) as e:
            return False, 0, f"Error writing file: {e}"
    return True, len(unique_persons), None
# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Command-line entry point: process custodian files and print a summary.

    Options:
        --dry-run   extract but do not write anything back
        --verbose   per-file progress and the first ten error messages
        --limit N   process at most N files (0 processes none)
        --file F    process only the given custodian file
    """
    parser = argparse.ArgumentParser(
        description='Extract contact data from Dutch heritage institution contact pages'
    )
    parser.add_argument('--dry-run', action='store_true', help='Do not modify files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--file', type=str, help='Process a single custodian file')
    args = parser.parse_args()

    # A negative limit would silently slice from the end of the list.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    # Find custodian files
    if args.file:
        custodian_files = [args.file]
    else:
        custodian_files = sorted(glob.glob('/Users/kempersc/apps/glam/data/custodian/NL-*.yaml'))
    # Compare against None so that an explicit "--limit 0" is honoured
    # instead of being mistaken for "no limit".
    if args.limit is not None:
        custodian_files = custodian_files[:args.limit]

    print(f"Processing {len(custodian_files)} custodian files...")
    if args.dry_run:
        print(" [DRY RUN - no files will be modified]")

    total_updated = 0
    total_persons = 0
    errors = []
    for i, custodian_path in enumerate(custodian_files, 1):
        filename = os.path.basename(custodian_path)
        if args.verbose:
            print(f"\n[{i}/{len(custodian_files)}] {filename}")
        updated, person_count, error = process_custodian_file(
            custodian_path,
            dry_run=args.dry_run,
            verbose=args.verbose
        )
        if error:
            errors.append((filename, error))
            if args.verbose:
                print(f" Error: {error}")
        elif updated:
            total_updated += 1
            total_persons += person_count
            if args.verbose:
                print(f" Updated: {person_count} persons extracted")
        elif args.verbose:
            print(" Skipped: no contact data found")

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Files processed: {len(custodian_files)}")
    print(f"Files updated: {total_updated}")
    print(f"Total persons: {total_persons}")
    print(f"Errors: {len(errors)}")
    if errors and args.verbose:
        print("\nErrors:")
        for filename, error in errors[:10]:
            print(f" {filename}: {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")


if __name__ == '__main__':
    main()