#!/usr/bin/env python3
"""
Extract CustodianName from archived website HTML with XPath provenance.

This script extracts the official emic institution name from archived websites
following the WebObservation provenance rules defined in AGENTS.md Rule 6.

CustodianName sources (in priority order):
1. <title> tag - Often contains "Museum Name - Tagline" pattern
2. og:site_name meta tag - Clean site/organization name
3. og:title meta tag - Page title for social sharing
4. First <h1> element - Often the main institution name
5. Footer "Over" section heading - Dutch pattern "Over [Institution]"

Output: Adds custodian_name field to entry YAML with XPath provenance

Usage:
    python scripts/extract_custodian_name.py [--limit N] [--entry ENTRY_NUM] [--dry-run]
"""

import argparse
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml

try:
    from bs4 import BeautifulSoup
    HAS_DEPS = True
except ImportError:
    HAS_DEPS = False
    print("Warning: Missing dependency: beautifulsoup4")
    print("Install with: pip install beautifulsoup4")

# Directories
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'

# Common title suffixes/taglines to strip (case-insensitive patterns)
TITLE_STRIP_PATTERNS = [
    # Dutch patterns
    r'\s*[-–—|]\s*(?:home(?:page)?|welkom|startpagina|website).*$',
    r'\s*[-–—|]\s*(?:het\s+)?(?:museum|archief|bibliotheek|galerie).*$',
    r'\s*[-–—|]\s*(?:mooie\s+)?tentoonstellingen.*$',
    r'\s*[-–—|]\s*ontdek\s+.*$',
    r'\s*[-–—|]\s*bezoek\s+.*$',
    # English patterns
    r'\s*[-–—|]\s*(?:the\s+)?official\s+(?:website|site).*$',
    r'\s*[-–—|]\s*(?:welcome|home|main).*$',
    r'\s*[-–—|]\s*(?:museum|archive|library|gallery).*$',
    # Generic separators with taglines
    r'\s*[-–—|:]\s*[^|–—-]{30,}$',  # Long taglines after separator
]


def clean_institution_name(name: str) -> str:
    """
    Clean a raw title/name to extract the institution name.

    Removes common suffixes, taglines, and normalizes whitespace.
    """
    if not name:
        return ""
    # Normalize whitespace first so the strip patterns see single spaces
    name = ' '.join(name.split())
    # Apply strip patterns (each removes a trailing tagline/suffix)
    for pattern in TITLE_STRIP_PATTERNS:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    # Final cleanup: trailing separators left behind by the regexes
    name = name.strip(' -–—|:')
    name = ' '.join(name.split())
    return name


def get_xpath(element) -> str:
    """Generate XPath for an element (same as fetch_website_playwright.py).

    Walks up the parent chain, computing a 1-based positional index among
    same-named siblings at each level.

    NOTE(review): the walk includes the BeautifulSoup root (named
    '[document]'), so paths look like '/[document][1]/html[1]/...'. This
    matches fetch_website_playwright.py per the original comment — confirm
    before changing.
    """
    parts = []
    while element and element.name:
        siblings = element.find_previous_siblings(element.name)
        index = len(siblings) + 1
        parts.insert(0, f"{element.name}[{index}]")
        element = element.parent
    return '/' + '/'.join(parts) if parts else '/'


def extract_name_from_title(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Extract institution name from <title> tag.

    Returns a custodian_name claim dict, or None if no usable title.
    """
    title = soup.find('title')
    if title and title.string:
        raw_title = title.string.strip()
        cleaned = clean_institution_name(raw_title)
        # Require at least 3 characters to avoid junk like "NL"
        if cleaned and len(cleaned) > 2:
            return {
                'claim_type': 'custodian_name',
                'claim_value': cleaned,
                'raw_value': raw_title,
                'extraction_source': 'title_tag',
                'xpath': get_xpath(title),
                'html_file': html_file,
                'xpath_match_score': 1.0,
            }
    return None


def extract_name_from_meta_og(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Extract institution name from og:site_name or og:title meta tags."""
    # Prefer og:site_name as it's usually the clean organization name
    for og_property in ['og:site_name', 'og:title']:
        meta = soup.find('meta', property=og_property)
        if meta and meta.get('content'):
            raw_value = meta['content'].strip()
            cleaned = clean_institution_name(raw_value)
            if cleaned and len(cleaned) > 2:
                return {
                    'claim_type': 'custodian_name',
                    'claim_value': cleaned,
                    'raw_value': raw_value,
                    'extraction_source': f'meta_{og_property.replace(":", "_")}',
                    'xpath': get_xpath(meta),
                    'html_file': html_file,
                    'xpath_match_score': 1.0,
                }
    return None


def extract_name_from_h1(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Extract institution name from first <h1> element.

    The h1 text is used verbatim (no tagline cleaning) but must be between
    3 and 99 characters to filter out junk and full sentences.
    """
    h1 = soup.find('h1')
    if h1:
        text = h1.get_text(strip=True)
        if text and len(text) > 2 and len(text) < 100:
            return {
                'claim_type': 'custodian_name',
                'claim_value': text,
                'raw_value': text,
                'extraction_source': 'h1_tag',
                'xpath': get_xpath(h1),
                'html_file': html_file,
                'xpath_match_score': 1.0,
            }
    return None


def extract_name_from_footer_over(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """
    Extract institution name from Dutch "Over [Institution]" pattern.

    Common pattern in Dutch museum websites:
    - Footer section heading: "Over Museum Catharijneconvent"
    - Link text: "Over het museum"
    """
    # Look for h3 tags in footer containing "Over" pattern
    footer = soup.find('footer')
    if footer:
        for h3 in footer.find_all('h3'):
            text = h3.get_text(strip=True)
            match = re.match(r'^Over\s+(.+)$', text, re.IGNORECASE)
            if match:
                name = match.group(1).strip()
                # Reject generic phrases that aren't institution names
                if name and len(name) > 2 and name.lower() not in ['ons', 'het museum', 'de organisatie']:
                    return {
                        'claim_type': 'custodian_name',
                        'claim_value': name,
                        'raw_value': text,
                        'extraction_source': 'footer_over_heading',
                        'xpath': get_xpath(h3),
                        'html_file': html_file,
                        'xpath_match_score': 0.9,  # Slightly lower confidence
                    }
    return None


def extract_name_from_metadata_title(metadata: dict) -> Optional[dict]:
    """Extract institution name from metadata.yaml pages[0].title.

    No 'xpath' key is emitted: the title came from the archiver's metadata,
    not from a parsed DOM element.
    """
    pages = metadata.get('pages', [])
    if pages and pages[0].get('title'):
        raw_title = pages[0]['title']
        cleaned = clean_institution_name(raw_title)
        if cleaned and len(cleaned) > 2:
            html_file = pages[0].get('archived_file', 'unknown')
            return {
                'claim_type': 'custodian_name',
                'claim_value': cleaned,
                'raw_value': raw_title,
                'extraction_source': 'metadata_page_title',
                'html_file': html_file,
                'xpath_match_score': 1.0,  # Title is reliable
            }
    return None


def extract_custodian_name(
    html_content: str,
    html_file_path: str,
    metadata: Optional[dict] = None
) -> Optional[dict]:
    """
    Extract CustodianName from HTML content with XPath provenance.

    Tries multiple sources in priority order:
    1. metadata.yaml page title (if available, most reliable)
    2. <title> tag
    3. og:site_name / og:title meta tags
    4. First <h1> element
    5. Footer "Over [Name]" pattern

    Returns dict with claim_type, claim_value, xpath, html_file, etc.
    """
    # Try metadata page title first (already extracted by archiver)
    if metadata:
        result = extract_name_from_metadata_title(metadata)
        if result:
            return result

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Try sources in priority order
    extractors = [
        extract_name_from_title,
        extract_name_from_meta_og,
        extract_name_from_h1,
        extract_name_from_footer_over,
    ]
    for extractor in extractors:
        result = extractor(soup, html_file_path)
        if result:
            return result
    return None


def get_web_archive_path(entry_data: dict, entry_num: str) -> Optional[Path]:
    """Get the web archive directory path for an entry.

    Prefers the directory recorded in web_enrichment.web_archives; falls
    back to the first subdirectory under web/{entry_num}/.
    """
    web_enrichment = entry_data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])
    if web_archives:
        # Use first archive
        archive = web_archives[0]
        directory = archive.get('directory')
        if directory:
            return ENTRIES_DIR / directory

    # Fallback: look for directory in web/{entry_num}/
    entry_web_dir = WEB_DIR / entry_num
    if entry_web_dir.exists():
        subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
        if subdirs:
            return subdirs[0]
    return None


def load_html_and_metadata(archive_path: Path) -> tuple[Optional[str], Optional[dict]]:
    """Load HTML content and metadata from archive directory.

    Returns (html_content, metadata); either may be None when missing or
    unreadable (a warning is printed, processing continues).
    """
    html_content = None
    metadata = None

    # Load metadata
    metadata_file = archive_path / 'metadata.yaml'
    if metadata_file.exists():
        try:
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = yaml.safe_load(f)
        except Exception as e:
            print(f" Warning: Failed to load {metadata_file}: {e}")

    # Load HTML from pages/ or rendered.html
    html_paths = [
        archive_path / 'pages' / 'index.html',
        archive_path / 'rendered.html',
    ]
    # Also check mirror directories for legacy archives
    mirror_dir = archive_path / 'mirror'
    if mirror_dir.exists():
        for subdir in mirror_dir.iterdir():
            if subdir.is_dir():
                html_paths.append(subdir / 'index.html')

    for html_path in html_paths:
        if html_path.exists():
            try:
                with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
                    html_content = f.read()
                break
            except Exception as e:
                print(f" Warning: Failed to load {html_path}: {e}")

    return html_content, metadata


def extract_entry_number(filename: str) -> str:
    """Extract entry number (leading digits) from filename."""
    match = re.match(r'^(\d+)', filename)
    return match.group(1) if match else filename.replace('.yaml', '')


def process_entry(
    filepath: Path,
    dry_run: bool = False,
    force: bool = False,
) -> tuple[bool, Optional[str], list[str]]:
    """
    Process a single entry file to extract CustodianName.

    Args:
        filepath: Path to the entry YAML file.
        dry_run: Extract but do not write the result back.
        force: Re-extract even when a custodian_name already exists.
            (BUG FIX: previously the skip check here ignored --force, so
            forced re-extraction never actually ran.)

    Returns:
        (extracted, custodian_name, errors)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return False, None, ["Empty file"]

    # Skip if already has custodian_name (unless force). isinstance guard:
    # a null or plain-string custodian_name must not crash on .get().
    existing = data.get('custodian_name')
    if not force and isinstance(existing, dict) and existing.get('claim_value'):
        return False, existing.get('claim_value'), []

    entry_num = extract_entry_number(filepath.name)
    errors = []

    # Get web archive path
    archive_path = get_web_archive_path(data, entry_num)
    if not archive_path or not archive_path.exists():
        return False, None, [f"No web archive found for entry {entry_num}"]

    # Load HTML and metadata
    html_content, metadata = load_html_and_metadata(archive_path)
    if not html_content and not metadata:
        return False, None, [f"No HTML content or metadata in {archive_path}"]

    # Determine HTML file path for provenance
    html_file_path = str(archive_path.relative_to(ENTRIES_DIR))
    if metadata and metadata.get('pages'):
        html_file_path = metadata['pages'][0].get('archived_file', html_file_path)

    # Extract custodian name
    name_claim = extract_custodian_name(html_content or '', html_file_path, metadata)
    if not name_claim:
        return False, None, [f"Could not extract CustodianName from {entry_num}"]

    # Add provenance metadata. 'or [{}]' guards an explicitly-empty
    # web_archives list (a dict .get default only covers a *missing* key,
    # so [] would previously crash on [0]).
    archives = (data.get('web_enrichment') or {}).get('web_archives') or [{}]
    name_claim['source_url'] = archives[0].get('url', '')
    if not name_claim['source_url']:
        name_claim['source_url'] = data.get('original_entry', {}).get('webadres_organisatie', '')
    name_claim['retrieved_on'] = metadata.get('archive_timestamp') if metadata else None
    name_claim['extraction_timestamp'] = datetime.now(timezone.utc).isoformat()

    if not dry_run:
        # Add custodian_name to entry data
        data['custodian_name'] = name_claim
        # Write back
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True, name_claim.get('claim_value'), errors


def main():
    parser = argparse.ArgumentParser(description='Extract CustodianName from archived websites')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
    parser.add_argument('--force', action='store_true', help='Re-extract even if custodian_name exists')
    args = parser.parse_args()

    if not HAS_DEPS:
        print("Error: Required dependency beautifulsoup4 not installed.")
        print("Run: pip install beautifulsoup4")
        return 1

    # Find entry files
    if args.entry:
        files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
    else:
        files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
    if args.limit:
        files = files[:args.limit]

    total_extracted = 0
    total_skipped = 0
    total_failed = 0

    print(f"Processing {len(files)} entries...")
    for filepath in files:
        if filepath.is_dir():
            continue

        # Skip if already has custodian_name (unless --force). The guards
        # tolerate a null or non-dict custodian_name field in the YAML.
        if not args.force:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            existing = (data or {}).get('custodian_name')
            if isinstance(existing, dict) and existing.get('claim_value'):
                total_skipped += 1
                continue

        # Pass force through so process_entry honors --force (previously
        # its internal skip check made --force a no-op).
        extracted, name, errors = process_entry(filepath, dry_run=args.dry_run, force=args.force)
        if extracted:
            total_extracted += 1
            print(f" ✓ {filepath.name}: {name}")
        elif name:
            total_skipped += 1  # Already had name
        else:
            total_failed += 1
            for e in errors:
                print(f" ✗ {filepath.name}: {e}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Extracted: {total_extracted}")
    print(f" Skipped (already have name): {total_skipped}")
    print(f" Failed (no archive/name): {total_failed}")

    return 0 if total_failed == 0 else 1


if __name__ == '__main__':
    sys.exit(main())