749 lines
25 KiB
Python
749 lines
25 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract structured contact data from Dutch heritage institution contact pages.
|
|
|
|
This script:
|
|
1. Finds all contact pages in web archives (contact_index.html, etc.)
|
|
2. Extracts person-role-contact relationships using pattern matching
|
|
3. Adds web_contact_data section to custodian YAML files
|
|
|
|
Data extracted:
|
|
- Board members (bestuur) with roles (voorzitter, secretaris, penningmeester)
|
|
- Staff members (medewerkers) with functions
|
|
- Contact information (address, phone, email, RSIN)
|
|
- Postal and physical addresses
|
|
|
|
Usage:
|
|
    python scripts/extract_contact_page_data.py [--dry-run] [--verbose] [--limit N] [--file PATH]
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import yaml
|
|
|
|
|
|
# ============================================================================
|
|
# DATA CLASSES FOR STRUCTURED CONTACT DATA
|
|
# ============================================================================
|
|
|
|
@dataclass
class PersonContact:
    """A person with role and contact information.

    Only ``name`` is required. Every other field defaults to ``None`` and is
    filled in by whichever line extractor in this module recognised the
    person.
    """
    name: str
    # Role keyword as it appeared after the name (the inline-role extractor
    # stores it lower-cased).
    role: Optional[str] = None
    role_type: Optional[str] = None  # BOARD, STAFF, VOLUNTEER, EDITORIAL, ADMIN
    # Personal address parts, set when a line carries a full address.
    street_address: Optional[str] = None
    postal_code: Optional[str] = None
    city: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
|
|
|
|
|
|
@dataclass
class OrganizationContact:
    """Organization-level contact information.

    All fields are optional and default to ``None``; a freshly constructed
    instance therefore means "nothing found".
    """
    # Postal address (filled from a "Postadres" block in the page text).
    postal_address: Optional[str] = None
    postal_code: Optional[str] = None
    postal_city: Optional[str] = None
    # Visiting address (filled from a "Gebouw"/"Bezoekadres"/"Locatie" block).
    physical_address: Optional[str] = None
    physical_postal_code: Optional[str] = None
    physical_city: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    rsin: Optional[str] = None  # Dutch fiscal number (RSIN)
    kvk: Optional[str] = None  # Dutch Chamber of Commerce (KvK) number
    iban: Optional[str] = None
|
|
|
|
|
|
@dataclass
class ContactPageData:
    """All data extracted from a contact page.

    Bundles the persons and the organization-level details found on one page
    together with provenance information about where and when they were
    extracted.
    """
    # default_factory gives every instance its own list.
    persons: list[PersonContact] = field(default_factory=list)
    organization: Optional[OrganizationContact] = None
    source_file: Optional[str] = None  # path of the HTML file that was parsed
    source_url: Optional[str] = None
    extraction_date: Optional[str] = None  # ISO 8601 timestamp string
|
|
|
|
|
|
# ============================================================================
|
|
# HTML TEXT EXTRACTION
|
|
# ============================================================================
|
|
|
|
class HTMLTextExtractor(HTMLParser):
    """Extract visible text from HTML.

    Text inside ``script``, ``style``, ``head`` and ``noscript`` elements is
    dropped. Block-level tags emit newlines so that each block of the page
    ends up on its own line, which is what the line-based extractors in this
    module rely on.

    Attributes:
        text_parts: Collected text fragments and separators, in document order.
        skip_tags: Tag names whose content must not be collected.
        skipping: True while the parser is inside at least one skipped tag.
    """

    # Tags that start a new line when opened / when closed.
    _BREAK_ON_START = ('p', 'div', 'br', 'li', 'tr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _BREAK_ON_END = ('p', 'div', 'li', 'tr')

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.skip_tags = {'script', 'style', 'head', 'noscript'}
        self.skipping = False
        # Nesting depth of skipped tags. A plain boolean is not enough:
        # ``</script>`` inside ``<head>`` would switch collection back on and
        # leak the rest of the head (e.g. the <title>) into the output.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'body':
            # <body> implicitly ends an unclosed <head>; without this reset a
            # missing </head> would suppress the whole page.
            self._skip_depth = 0
            self.skipping = False
        if tag in self.skip_tags:
            self._skip_depth += 1
            self.skipping = True
        # Add newline for block elements
        if tag in self._BREAK_ON_START:
            self.text_parts.append('\n')

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            # Clamp at zero so a stray closing tag cannot unbalance the count.
            self._skip_depth = max(self._skip_depth - 1, 0)
            self.skipping = self._skip_depth > 0
        if tag in self._BREAK_ON_END:
            self.text_parts.append('\n')

    def handle_data(self, data):
        if not self.skipping:
            text = data.strip()
            if text:
                self.text_parts.append(text)
                # Separate adjacent inline fragments with a space.
                self.text_parts.append(' ')

    def get_text(self) -> str:
        """Return everything collected so far as a single string."""
        return ''.join(self.text_parts)
|
|
|
|
|
|
def extract_text_from_html(html_content: str) -> str:
    """Extract visible text from HTML content.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The visible text as produced by ``HTMLTextExtractor``. If parsing
        fails for any reason the input is returned unchanged, so callers
        always get a string to run their patterns on.
    """
    parser = HTMLTextExtractor()
    try:
        parser.feed(html_content)
        # close() forces the parser to process what it is still buffering
        # (e.g. trailing text ending in an unterminated "&..." sequence);
        # without it that text never reaches handle_data().
        parser.close()
    except Exception:
        # Deliberate best effort: fall back to the raw markup.
        return html_content
    return parser.get_text()
|
|
|
|
|
|
# ============================================================================
|
|
# DUTCH CONTACT PAGE PATTERNS
|
|
# ============================================================================
|
|
|
|
# Role keywords in Dutch and English, grouped by role type.
# Each keyword is a regex fragment; ROLE_PATTERNS wraps every one of them in
# word boundaries. Both the group order and the keyword order are preserved.
_ROLE_KEYWORDS = {
    'BOARD': (
        'voorzitter',
        'secretaris',
        'penningmeester',
        'bestuurslid',
        r'vice[-\s]?voorzitter',
        'chairman',
        'president',
        'secretary',
        'treasurer',
        r'board\s+member',
    ),
    'STAFF': (
        'medewerk(?:er|ster)',
        'coördinator',
        'coordinator',
        'beheerder',
        'curator',
        'conservator',
        'archivaris',
        'directeur',
        'director',
        'manager',
    ),
    'VOLUNTEER': (
        'vrijwilliger',
        'volunteer',
    ),
    'EDITORIAL': (
        'redactie',
        'redacteur',
        'editor',
        'editorial',
    ),
    'ADMIN': (
        'ledenadministratie',
        'administratie',
        'administration',
        'membership',
    ),
}

ROLE_PATTERNS = {
    role_type: [rf'\b{keyword}\b' for keyword in keywords]
    for role_type, keywords in _ROLE_KEYWORDS.items()
}
|
|
|
|
# Dutch postal code pattern: 4 digits + optional whitespace + 2 capital letters
POSTAL_CODE_PATTERN = r'\b(\d{4}\s*[A-Z]{2})\b'

# Phone patterns (Dutch format): labelled numbers first, bare numbers after
PHONE_PATTERNS = [
    r'tel\.?:?\s*([\d\s\-\(\)]+)',
    r'telefoon:?\s*([\d\s\-\(\)]+)',
    r'phone:?\s*([\d\s\-\(\)]+)',
    r'(\d{4}[-\s]?\d{6})',  # 0226-451592
    r'(\d{3}[-\s]?\d{7})',  # 020-1234567
    r'(\+31[\d\s\-]+)',  # +31 format
]

# Email pattern
EMAIL_PATTERN = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

# RSIN pattern (Dutch fiscal number): 9 digits, optionally grouped 4-2-3
RSIN_PATTERN = r'RSIN[:\s\-]*(\d{4}[\.\s]?\d{2}[\.\s]?\d{3}|\d{9})'

# KvK pattern (Dutch Chamber of Commerce): 8 digits after the label
KVK_PATTERN = r'(?:KvK|Kvk|kvk|Kamer\s+van\s+Koophandel)[:\s\-]*(\d{8})'

# IBAN pattern: "NL", 2 check digits, 4-letter bank code, then exactly 10
# account digits with optional whitespace between them. Requiring exactly 10
# digits stops the match from running on into a number that merely follows
# the IBAN in the text.
IBAN_PATTERN = r'\b(NL\d{2}\s*[A-Z]{4}\s*(?:\d\s*){9}\d)\b'
|
|
|
|
# Section headers indicating person lists.
# extract_persons_from_text() applies these with re.match() and
# re.IGNORECASE, so each pattern is anchored at the start of a line and
# matches regardless of letter case.
SECTION_HEADERS = [
    r'(?:Het\s+)?(?:dagelijks[e]?\s+)?bestuur(?:\s+bestaat\s+uit)?:?',
    r'Overige?\s+bestuurslede?n:?',
    r'Overige?\s+medewerk(?:er|st)e?rs?:?',
    r'Vrijwilligers?:?',
    r'Redactie(?:\s+magazine)?:?',
    r'Ledenadministratie:?',
    r'Contactpersonen?:?',
    r'Staff\s*:?',
    r'Team\s*:?',
    r'Board(?:\s+members)?:?',
]
|
|
|
|
|
|
# ============================================================================
|
|
# PERSON EXTRACTION FROM CONTACT PAGE TEXT
|
|
# ============================================================================
|
|
|
|
def extract_person_with_role_inline(line: str) -> Optional[PersonContact]:
    """
    Recognise a "<name>, <role>" line where the role is a known keyword.

    Examples of matching lines:
    - "Ella Molenaar, voorzitter"
    - "Koos Bijvoet, secretaris"

    Role groups and their keywords are tried in ROLE_PATTERNS order and the
    first hit wins. The returned person carries the stripped name, the
    lower-cased role text and the role type of the matching group. Returns
    None when no keyword follows the name.
    """
    # Everything up to (and including) the comma that precedes the role.
    name_prefix = r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s*'

    for category, role_regexes in ROLE_PATTERNS.items():
        for role_regex in role_regexes:
            found = re.search(rf'{name_prefix}({role_regex})', line, re.IGNORECASE)
            if found is None:
                continue
            person_name, role_text = found.group(1, 2)
            return PersonContact(
                name=person_name.strip(),
                role=role_text.strip().lower(),
                role_type=category,
            )
    return None
|
|
|
|
|
|
def extract_person_with_address(line: str) -> Optional[PersonContact]:
    """
    Extract person with full contact info from lines like:
    "Afra Oudejans-Ursem, Dorpsstraat 234, 1713 HP Obdam, tel.: 0226-451592"

    Args:
        line: One line of page text; matching is anchored at its start and
            is case-insensitive.

    Returns:
        A PersonContact with name, street address, postal code, city and
        (when present) phone, or None if the line does not have this shape.
        The postal code is normalised to the "1234 AB" form.
    """
    # Pattern: Name, Street Number, PostalCode City, tel: Phone
    pattern = (
        r'^([A-Z][a-zA-Z\-]+(?:\s+[A-Za-z\-]+)*),\s*'  # Name
        r'([A-Za-z\s]+\s+\d+[a-zA-Z]?),\s*'  # Street + number
        r'(\d{4}\s*[A-Z]{2})\s+'  # Postal code
        r'([A-Za-z\s\-]+)'  # City
        r'(?:,\s*tel\.?:?\s*([\d\-\s]+))?'  # Optional phone
    )

    match = re.match(pattern, line, re.IGNORECASE)
    if match is None:
        return None

    # The group is always 4 digits, optional whitespace, 2 letters, so the
    # first four and last two characters are the parts we need. Rebuilding
    # the code normalises "1713HP", "1713  HP" and "1713 hp" alike.
    raw_code = match.group(3).strip()
    postal_code = f'{raw_code[:4]} {raw_code[-2:].upper()}'

    # A captured phone consisting only of whitespace counts as "no phone".
    phone = (match.group(5) or '').strip() or None

    return PersonContact(
        name=match.group(1).strip(),
        street_address=match.group(2).strip(),
        postal_code=postal_code,
        city=match.group(4).strip(),
        phone=phone,
    )
|
|
|
|
|
|
def extract_person_with_location(line: str) -> Optional[PersonContact]:
    """
    Recognise a "<name>, <place>" line such as:
    "Herman Tielen, Obdam"
    "Niek Wever, Obdam"

    The part after the comma must start with a capital letter, must be longer
    than two characters and must not contain any known role keyword (those
    lines belong to the inline-role extractor). Returns None otherwise.
    """
    name_and_place = r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+)+),\s+([A-Z][a-z]+(?:[\s\-][A-Za-z]+)*)$'

    found = re.match(name_and_place, line.strip())
    if found is None:
        return None

    person_name, place = (part.strip() for part in found.groups())

    # Reject "Name, Voorzitter"-style lines: that is a role, not a place.
    looks_like_role = any(
        re.search(role_regex, place, re.IGNORECASE)
        for role_regexes in ROLE_PATTERNS.values()
        for role_regex in role_regexes
    )
    if looks_like_role or len(place) <= 2:
        return None

    return PersonContact(name=person_name, city=place)
|
|
|
|
|
|
def extract_simple_name(line: str, context_role_type: Optional[str] = None) -> Optional[PersonContact]:
    """
    Recognise a line that consists of nothing but a personal name, e.g.
    "Ed Groustra".

    The line must be one to four words with a capitalised first word. It is
    rejected when it is a known menu/form label, is three characters or
    shorter, matches one of the organisation/navigation patterns, starts with
    "Stichting " or "Vereniging ", or is a single word. Accepted names are
    returned with ``context_role_type`` as their role type.
    """
    # Single-word labels that are never names (navigation, forms, footers...).
    rejected_labels = {
        # Navigation elements
        'Contact', 'Bestuur', 'Team', 'Staff', 'Redactie',
        'Informatie', 'Over', 'Home', 'Menu', 'Start', 'Links',
        'Artikelen', 'Beeldbank', 'Magazines',
        # Form field labels (common in contact forms)
        'Naam', 'Name', 'Bericht', 'Message', 'Indienen', 'Submit',
        'Verzenden', 'Send', 'Email', 'Telefoon', 'Phone', 'Adres',
        'Address', 'Onderwerp', 'Subject', 'Captcha',
        'Voornaam', 'Achternaam', 'Mobiel', 'Verstuur',
        # Other non-names
        'Copyright', 'Privacy', 'Disclaimer', 'Cookies',
        'Lees', 'Meer', 'Terug', 'Back', 'Volgende', 'Next',
        'Facebook', 'Twitter', 'Instagram', 'LinkedIn', 'YouTube',
        # Common Dutch words that might match pattern
        'Donateur', 'Worden', 'Stichting', 'Vereniging',
        'Folder', 'Educatie', 'Nieuwsbrief', 'Projecten',
        'Openingstijden', 'Entreeprijzen', 'Museumwinkel',
        'Toegankelijkheid', 'Privacybeleid',
        # Section headers that aren't names
        'Beleidsplan', 'Jaarverslag', 'Werkgroepen',
        'Bezoekersinformatie', 'Collectie',
    }

    # Longer phrases that are clearly not personal names.
    rejected_regexes = [
        r'^Bestuur\s+', r'^Vrienden\s+', r'^Werkgroep', r'^Platform\s+',
        r'^Project', r'^Museum', r'^Geschiedenis', r'^Algemene\s+',
        r'^Inrichting', r'^Speciaal\s+', r'^Sociale\s+', r'^Design\s+',
        r'^Digitalisering', r'^Herdenkings', r'^Restauratie',
        r'^Monument\s+', r'molen\b', r'^Toegankelijkheid',
        r'^Privacy\b', r'^Disclaimer\b', r'Policy\b', r'^Copyright\b',
        r'^Terms\b', r'^Cookies?\b', r'^GDPR\b', r'^AVG\b',
    ]

    found = re.match(r'^([A-Z][a-z]+(?:\s+[A-Za-z\-]+){0,3})$', line.strip())
    if found is None:
        return None

    candidate = found.group(1).strip()
    if candidate in rejected_labels or len(candidate) <= 3:
        return None
    if any(re.search(regex, candidate, re.IGNORECASE) for regex in rejected_regexes):
        return None
    # Organisation names are not persons.
    if candidate.startswith(('Stichting ', 'Vereniging ')):
        return None
    # A lone word is as likely a menu item as a name, so it is not accepted.
    if len(candidate.split()) == 1:
        return None

    return PersonContact(name=candidate, role_type=context_role_type)
|
|
|
|
|
|
def extract_organization_contact(text: str) -> OrganizationContact:
    """Extract organization-level contact information.

    Looks for an RSIN, a KvK number, an IBAN, a postal address ("Postadres")
    and a visiting address ("Gebouw", "Bezoekadres" or "Locatie") in the page
    text. For each kind only the first occurrence is used.

    Args:
        text: Visible text of a contact page.

    Returns:
        An OrganizationContact; fields that were not found stay None.
    """
    org = OrganizationContact()

    # Extract RSIN. The pattern's \s also matches non-breaking spaces, tabs
    # and newlines, so strip every kind of whitespace, not just ' '.
    rsin_match = re.search(RSIN_PATTERN, text, re.IGNORECASE)
    if rsin_match:
        org.rsin = re.sub(r'[.\s]', '', rsin_match.group(1))

    # Extract KvK
    kvk_match = re.search(KVK_PATTERN, text, re.IGNORECASE)
    if kvk_match:
        org.kvk = kvk_match.group(1)

    # Extract IBAN (case-sensitive on purpose; same whitespace handling as RSIN)
    iban_match = re.search(IBAN_PATTERN, text)
    if iban_match:
        org.iban = re.sub(r'\s+', '', iban_match.group(1))

    # Shared tail of both address patterns: street + number, postal code, city.
    address_tail = (
        r'([A-Za-z\s\']+\s+\d+[a-zA-Z]?)\s*\n?\s*'
        r'(\d{4}\s*[A-Z]{2})\s+'
        r'([A-Za-z]+(?:[\-][A-Za-z]+)?)'  # City: single word or hyphenated
    )

    # Extract postal address block
    # Look for "Postadres:" section, optionally followed by a "Stichting ..." line
    postadres_match = re.search(
        r'Postadres:?\s*\n?\s*(?:Stichting\s+[^\n]+\s*\n?)?\s*' + address_tail,
        text, re.IGNORECASE
    )
    if postadres_match:
        org.postal_address = postadres_match.group(1).strip()
        org.postal_code = postadres_match.group(2).strip()
        org.postal_city = postadres_match.group(3).strip()

    # Extract building/physical address
    # Look for "Gebouw:" or physical address section, optionally followed by
    # one arbitrary line (e.g. the building's name)
    gebouw_match = re.search(
        r'(?:Gebouw|Bezoekadres|Locatie):?\s*\n?\s*(?:[^\n]*\n)?\s*' + address_tail,
        text, re.IGNORECASE
    )
    if gebouw_match:
        org.physical_address = gebouw_match.group(1).strip()
        org.physical_postal_code = gebouw_match.group(2).strip()
        org.physical_city = gebouw_match.group(3).strip()

    return org
|
|
|
|
|
|
def extract_persons_from_text(text: str) -> list[PersonContact]:
    """Extract all persons from contact page text.

    The text is processed line by line. Section headers (see SECTION_HEADERS)
    set the role type that applies to the lines below them. Every other line
    is tried against the extractors from most to least specific: inline role,
    full address, name plus location and, only inside a known section, a bare
    name.

    Args:
        text: Visible text of a contact page, one block per line.

    Returns:
        The persons found, in page order (duplicates are not removed).
    """
    # Header substring -> role type, checked in this order. The first five
    # groups keep their original precedence; the English headers that
    # SECTION_HEADERS also recognises come last.
    header_role_types = (
        ('bestuur', 'BOARD'),
        ('medewerk', 'STAFF'),
        ('vrijwillig', 'VOLUNTEER'),
        ('redactie', 'EDITORIAL'),
        ('ledenadmin', 'ADMIN'),
        ('administratie', 'ADMIN'),
        ('board', 'BOARD'),
        ('staff', 'STAFF'),
        ('team', 'STAFF'),  # NOTE(review): treats "Team" as staff - confirm
    )

    persons = []
    current_role_type = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Check if this line starts with a section header
        if any(re.match(header, line, re.IGNORECASE) for header in SECTION_HEADERS):
            header_lower = line.lower()
            for needle, role_type in header_role_types:
                if needle in header_lower:
                    current_role_type = role_type
                    break
            # A line that is nothing but a header (e.g. "Overige
            # bestuursleden") must not be parsed as a person's name.
            if any(re.fullmatch(header, line, re.IGNORECASE) for header in SECTION_HEADERS):
                continue

        # Try different extraction patterns
        person = extract_person_with_role_inline(line)
        if person:
            if not person.role_type and current_role_type:
                person.role_type = current_role_type
            persons.append(person)
            continue

        person = extract_person_with_address(line)
        if person:
            person.role_type = current_role_type or 'BOARD'  # Assume board if has address
            persons.append(person)
            continue

        person = extract_person_with_location(line)
        if person:
            person.role_type = current_role_type
            persons.append(person)
            continue

        # Only extract simple names if we're in a known section
        if current_role_type:
            person = extract_simple_name(line, current_role_type)
            if person:
                persons.append(person)

    return persons
|
|
|
|
|
|
def extract_contact_page_data(html_path: str) -> Optional[ContactPageData]:
    """Extract all contact data from an HTML file.

    Args:
        html_path: Path of the archived contact page. The file is read as
            UTF-8; undecodable bytes are replaced rather than raising.

    Returns:
        A ContactPageData with the persons and organization details found,
        or None when the file cannot be read or nothing at all was found.
    """
    try:
        with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()
    except (OSError, ValueError) as e:
        # ValueError covers malformed paths (e.g. an embedded NUL byte).
        print(f" Error reading {html_path}: {e}")
        return None

    text = extract_text_from_html(html_content)

    persons = extract_persons_from_text(text)
    organization = extract_organization_contact(text)

    # Only return if we found something. Any populated organization field
    # counts, so a page that yields just an address or IBAN is kept too.
    found_org_data = any(vars(organization).values())
    if not persons and not found_org_data:
        return None

    return ContactPageData(
        persons=persons,
        organization=organization,
        source_file=html_path,
        extraction_date=datetime.now(timezone.utc).isoformat()
    )
|
|
|
|
|
|
# ============================================================================
|
|
# CUSTODIAN FILE PROCESSING
|
|
# ============================================================================
|
|
|
|
def find_contact_pages(web_archive_dir: str) -> list[str]:
    """Find all contact pages in a web archive directory.

    Args:
        web_archive_dir: Archive root; pages are expected in its ``pages``
            subdirectory.

    Returns:
        Sorted paths of the ``.html`` files in ``pages`` whose name contains
        "contact", "Contact" or "kontakt". Each file is listed once even if
        it matches several patterns. Empty when ``pages`` does not exist.
    """
    pages_dir = os.path.join(web_archive_dir, 'pages')
    if not os.path.isdir(pages_dir):
        return []

    # Escape the directory so characters like "[" in its path are taken
    # literally and only the file-name part acts as a pattern.
    escaped_dir = glob.escape(pages_dir)

    # Look for contact pages (various naming patterns). A set removes files
    # that match more than one pattern.
    matches = set()
    for pattern in ('*contact*.html', '*Contact*.html', '*kontakt*.html'):
        matches.update(glob.glob(os.path.join(escaped_dir, pattern)))

    # Sorted so the result does not depend on directory listing order.
    return sorted(matches)
|
|
|
|
|
|
def person_to_dict(person: PersonContact) -> dict[str, Any]:
    """Build the YAML-ready mapping for one person.

    ``name`` is always present; the optional fields follow in a fixed order
    and are included only when they hold a non-empty value.
    """
    optional_keys = (
        'role', 'role_type', 'street_address', 'postal_code',
        'city', 'phone', 'email',
    )
    result = {'name': person.name}
    for key in optional_keys:
        value = getattr(person, key)
        if value:
            result[key] = value
    return result
|
|
|
|
|
|
def org_to_dict(org: OrganizationContact) -> dict[str, Any]:
    """Build the YAML-ready mapping for the organization details.

    Each address becomes a nested mapping with ``street``, ``postal_code``
    and ``city`` and is emitted only when its street is set. The scalar
    fields follow and are emitted only when non-empty. An organization
    without any data yields an empty dict.
    """
    address_blocks = (
        ('postal_address', org.postal_address, org.postal_code, org.postal_city),
        ('physical_address', org.physical_address, org.physical_postal_code, org.physical_city),
    )
    result = {
        key: {'street': street, 'postal_code': code, 'city': city}
        for key, street, code, city in address_blocks
        if street
    }

    for key in ('rsin', 'kvk', 'iban', 'phone', 'email'):
        value = getattr(org, key)
        if value:
            result[key] = value

    return result
|
|
|
|
|
|
def _merge_duplicate_persons(persons):
    """Collapse persons that share a name, filling blanks from later duplicates."""
    merged = {}
    for person in persons:
        kept = merged.setdefault(person.name, person)
        if kept is person:
            continue
        # Same name seen before: keep the first record but copy over any
        # field it is still missing (e.g. the address from a later list).
        for attr, value in vars(person).items():
            if value and not getattr(kept, attr):
                setattr(kept, attr, value)
    return list(merged.values())


def _merge_organizations(orgs):
    """Combine organization records, taking the first non-empty value per field."""
    merged = OrganizationContact()
    for org in orgs:
        for attr in ('rsin', 'kvk', 'iban'):
            if getattr(org, attr) and not getattr(merged, attr):
                setattr(merged, attr, getattr(org, attr))
        # Address parts are taken together from one record so that street,
        # postal code and city always belong to the same address.
        if org.postal_address and not merged.postal_address:
            merged.postal_address = org.postal_address
            merged.postal_code = org.postal_code
            merged.postal_city = org.postal_city
        if org.physical_address and not merged.physical_address:
            merged.physical_address = org.physical_address
            merged.physical_postal_code = org.physical_postal_code
            merged.physical_city = org.physical_city
    return merged


def process_custodian_file(
    custodian_path: str,
    dry_run: bool = False,
    verbose: bool = False,
    base_dir: str = '/Users/kempersc/apps/glam/data/custodian',
) -> tuple[bool, int, Optional[str]]:
    """
    Process a single custodian file and extract contact page data.

    Reads the custodian YAML, locates the contact pages of every web archive
    listed under ``web_enrichment.web_archives``, extracts persons and
    organization details from them and stores the result under the
    ``web_contact_data`` key of the same file.

    Args:
        custodian_path: Path of the custodian YAML file.
        dry_run: When True, do everything except writing the file.
        verbose: When True, print each contact page as it is processed.
        base_dir: Directory that archive paths in the YAML are relative to
            and that ``source_files`` entries are made relative to.

    Returns: (updated, person_count, error_message)
        ``updated`` is True when contact data was found (and, unless
        ``dry_run``, written). ``error_message`` is None unless reading,
        interpreting or writing the file failed.
    """
    try:
        with open(custodian_path, 'r', encoding='utf-8') as f:
            custodian_data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        return False, 0, f"Error reading file: {e}"

    if not custodian_data:
        return False, 0, "Empty file"
    if not isinstance(custodian_data, dict):
        return False, 0, "Unexpected YAML structure (expected a mapping)"

    # Find web archive directory. "or" also covers keys that are present but
    # null in the YAML.
    web_enrichment = custodian_data.get('web_enrichment') or {}
    web_archives = web_enrichment.get('web_archives') if isinstance(web_enrichment, dict) else None

    if not web_archives:
        return False, 0, None  # No web archives, not an error

    all_persons = []
    all_orgs = []
    source_files = []

    for archive in web_archives:
        if not isinstance(archive, dict):
            continue

        # Support both 'directory' and 'archive_path' field names
        archive_path = archive.get('directory', '') or archive.get('archive_path', '')
        if not archive_path:
            continue

        # Convert to web archive directory path
        # archive_path is like "web/0787/oudobdam-hensbroek.nl"
        web_dir = os.path.join(base_dir, archive_path)
        if not os.path.isdir(web_dir):
            continue

        # Find contact pages
        for contact_page in find_contact_pages(web_dir):
            if verbose:
                print(f" Processing: {contact_page}")

            data = extract_contact_page_data(contact_page)
            if data:
                all_persons.extend(data.persons)
                if data.organization:
                    all_orgs.append(data.organization)
                source_files.append(contact_page)

    if not all_persons and not all_orgs:
        return False, 0, None  # No contact data found

    # Deduplicate persons by name, keeping details from every occurrence
    unique_persons = _merge_duplicate_persons(all_persons)

    # Build web_contact_data section
    contact_data = {
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'contact_page_pattern_matching',
        'source_files': [os.path.relpath(f, base_dir) for f in source_files],
    }

    if unique_persons:
        contact_data['persons'] = [person_to_dict(p) for p in unique_persons]

    # Merge organization data (take first non-empty values)
    org_dict = org_to_dict(_merge_organizations(all_orgs))
    if org_dict:
        contact_data['organization'] = org_dict

    # Update custodian data
    custodian_data['web_contact_data'] = contact_data

    if not dry_run:
        try:
            with open(custodian_path, 'w', encoding='utf-8') as f:
                yaml.dump(custodian_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        except (OSError, yaml.YAMLError) as e:
            # Report like a read failure instead of aborting the whole batch.
            return False, 0, f"Error writing file: {e}"

    return True, len(unique_persons), None
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN EXECUTION
|
|
# ============================================================================
|
|
|
|
def main():
    """Command-line entry point.

    Parses the options, processes either the single file given with
    ``--file`` or every ``NL-*.yaml`` custodian file, and prints a summary.
    Per-file details and the list of errors are printed only with
    ``--verbose``.
    """
    parser = argparse.ArgumentParser(
        description='Extract contact data from Dutch heritage institution contact pages'
    )
    parser.add_argument('--dry-run', action='store_true', help='Do not modify files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--file', type=str, help='Process a single custodian file')

    args = parser.parse_args()

    # A negative limit would silently slice from the end of the list.
    if args.limit is not None and args.limit < 0:
        parser.error('--limit must be a non-negative integer')

    # Find custodian files
    if args.file:
        custodian_files = [args.file]
    else:
        custodian_files = sorted(glob.glob('/Users/kempersc/apps/glam/data/custodian/NL-*.yaml'))

    # "is not None" so that an explicit --limit 0 really processes nothing.
    if args.limit is not None:
        custodian_files = custodian_files[:args.limit]

    print(f"Processing {len(custodian_files)} custodian files...")
    if args.dry_run:
        print(" [DRY RUN - no files will be modified]")

    total_updated = 0
    total_persons = 0
    errors = []

    for i, custodian_path in enumerate(custodian_files, 1):
        filename = os.path.basename(custodian_path)

        if args.verbose:
            print(f"\n[{i}/{len(custodian_files)}] {filename}")

        updated, person_count, error = process_custodian_file(
            custodian_path,
            dry_run=args.dry_run,
            verbose=args.verbose
        )

        if error:
            errors.append((filename, error))
            if args.verbose:
                print(f" Error: {error}")
        elif updated:
            total_updated += 1
            total_persons += person_count
            if args.verbose:
                print(f" Updated: {person_count} persons extracted")
        elif args.verbose:
            print(" Skipped: no contact data found")

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Files processed: {len(custodian_files)}")
    print(f"Files updated: {total_updated}")
    print(f"Total persons: {total_persons}")
    print(f"Errors: {len(errors)}")

    if errors and args.verbose:
        print("\nErrors:")
        # Show at most ten errors, then only how many more there are.
        for filename, error in errors[:10]:
            print(f" {filename}: {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|