# glam/scripts/extract_custodian_name.py
# Archived-copy metadata: 2025-12-01 16:06:34 +01:00 — 430 lines, 15 KiB, Python
#!/usr/bin/env python3
"""
Extract CustodianName from archived website HTML with XPath provenance.
This script extracts the official emic institution name from archived websites
following the WebObservation provenance rules defined in AGENTS.md Rule 6.
CustodianName sources (in priority order):
1. <title> tag - Often contains "Museum Name - Tagline" pattern
2. og:site_name meta tag - Clean site/organization name
3. og:title meta tag - Page title for social sharing
4. First <h1> element - Often the main institution name
5. Footer "Over" section heading - Dutch pattern "Over [Institution]"
Output: Adds custodian_name field to entry YAML with XPath provenance
Usage:
python scripts/extract_custodian_name.py [--limit N] [--entry ENTRY_NUM] [--dry-run]
"""
from __future__ import annotations

import argparse
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import yaml
try:
from bs4 import BeautifulSoup
HAS_DEPS = True
except ImportError:
HAS_DEPS = False
print("Warning: Missing dependency: beautifulsoup4")
print("Install with: pip install beautifulsoup4")
# Directories
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
WEB_DIR = ENTRIES_DIR / 'web'
# Common title suffixes/taglines to strip (case-insensitive patterns)
TITLE_STRIP_PATTERNS = [
# Dutch patterns
r'\s*[-–—|]\s*(?:home(?:page)?|welkom|startpagina|website).*$',
r'\s*[-–—|]\s*(?:het\s+)?(?:museum|archief|bibliotheek|galerie).*$',
r'\s*[-–—|]\s*(?:mooie\s+)?tentoonstellingen.*$',
r'\s*[-–—|]\s*ontdek\s+.*$',
r'\s*[-–—|]\s*bezoek\s+.*$',
# English patterns
r'\s*[-–—|]\s*(?:the\s+)?official\s+(?:website|site).*$',
r'\s*[-–—|]\s*(?:welcome|home|main).*$',
r'\s*[-–—|]\s*(?:museum|archive|library|gallery).*$',
# Generic separators with taglines
r'\s*[-–—|:]\s*[^|–—-]{30,}$', # Long taglines after separator
]
def clean_institution_name(name: str) -> str:
"""
Clean a raw title/name to extract the institution name.
Removes common suffixes, taglines, and normalizes whitespace.
"""
if not name:
return ""
# Normalize whitespace first
name = ' '.join(name.split())
# Apply strip patterns
for pattern in TITLE_STRIP_PATTERNS:
name = re.sub(pattern, '', name, flags=re.IGNORECASE)
# Final cleanup
name = name.strip(' -–—|:')
name = ' '.join(name.split())
return name
def get_xpath(element) -> str:
"""Generate XPath for an element (same as fetch_website_playwright.py)."""
parts = []
while element and element.name:
siblings = element.find_previous_siblings(element.name)
index = len(siblings) + 1
parts.insert(0, f"{element.name}[{index}]")
element = element.parent
return '/' + '/'.join(parts) if parts else '/'
def extract_name_from_title(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from <title> tag."""
title = soup.find('title')
if title and title.string:
raw_title = title.string.strip()
cleaned = clean_institution_name(raw_title)
if cleaned and len(cleaned) > 2:
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_title,
'extraction_source': 'title_tag',
'xpath': get_xpath(title),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_meta_og(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from og:site_name or og:title meta tags."""
# Prefer og:site_name as it's usually the clean organization name
for og_property in ['og:site_name', 'og:title']:
meta = soup.find('meta', property=og_property)
if meta and meta.get('content'):
raw_value = meta['content'].strip()
cleaned = clean_institution_name(raw_value)
if cleaned and len(cleaned) > 2:
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_value,
'extraction_source': f'meta_{og_property.replace(":", "_")}',
'xpath': get_xpath(meta),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_h1(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""Extract institution name from first <h1> element."""
h1 = soup.find('h1')
if h1:
text = h1.get_text(strip=True)
if text and len(text) > 2 and len(text) < 100:
return {
'claim_type': 'custodian_name',
'claim_value': text,
'raw_value': text,
'extraction_source': 'h1_tag',
'xpath': get_xpath(h1),
'html_file': html_file,
'xpath_match_score': 1.0,
}
return None
def extract_name_from_footer_over(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
"""
Extract institution name from Dutch "Over [Institution]" pattern.
Common pattern in Dutch museum websites:
- Footer section heading: "Over Museum Catharijneconvent"
- Link text: "Over het museum"
"""
# Look for h3 tags in footer containing "Over" pattern
footer = soup.find('footer')
if footer:
for h3 in footer.find_all('h3'):
text = h3.get_text(strip=True)
match = re.match(r'^Over\s+(.+)$', text, re.IGNORECASE)
if match:
name = match.group(1).strip()
if name and len(name) > 2 and name.lower() not in ['ons', 'het museum', 'de organisatie']:
return {
'claim_type': 'custodian_name',
'claim_value': name,
'raw_value': text,
'extraction_source': 'footer_over_heading',
'xpath': get_xpath(h3),
'html_file': html_file,
'xpath_match_score': 0.9, # Slightly lower confidence
}
return None
def extract_name_from_metadata_title(metadata: dict) -> Optional[dict]:
"""Extract institution name from metadata.yaml pages[0].title."""
pages = metadata.get('pages', [])
if pages and pages[0].get('title'):
raw_title = pages[0]['title']
cleaned = clean_institution_name(raw_title)
if cleaned and len(cleaned) > 2:
html_file = pages[0].get('archived_file', 'unknown')
return {
'claim_type': 'custodian_name',
'claim_value': cleaned,
'raw_value': raw_title,
'extraction_source': 'metadata_page_title',
'html_file': html_file,
'xpath_match_score': 1.0, # Title is reliable
}
return None
def extract_custodian_name(
html_content: str,
html_file_path: str,
metadata: Optional[dict] = None
) -> Optional[dict]:
"""
Extract CustodianName from HTML content with XPath provenance.
Tries multiple sources in priority order:
1. metadata.yaml page title (if available, most reliable)
2. <title> tag
3. og:site_name / og:title meta tags
4. First <h1> element
5. Footer "Over [Name]" pattern
Returns dict with claim_type, claim_value, xpath, html_file, etc.
"""
# Try metadata page title first (already extracted by archiver)
if metadata:
result = extract_name_from_metadata_title(metadata)
if result:
return result
# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Try sources in priority order
extractors = [
extract_name_from_title,
extract_name_from_meta_og,
extract_name_from_h1,
extract_name_from_footer_over,
]
for extractor in extractors:
result = extractor(soup, html_file_path)
if result:
return result
return None
def get_web_archive_path(entry_data: dict, entry_num: str) -> Optional[Path]:
"""Get the web archive directory path for an entry."""
web_enrichment = entry_data.get('web_enrichment', {})
web_archives = web_enrichment.get('web_archives', [])
if web_archives:
# Use first archive
archive = web_archives[0]
directory = archive.get('directory')
if directory:
return ENTRIES_DIR / directory
# Fallback: look for directory in web/{entry_num}/
entry_web_dir = WEB_DIR / entry_num
if entry_web_dir.exists():
subdirs = [d for d in entry_web_dir.iterdir() if d.is_dir()]
if subdirs:
return subdirs[0]
return None
def load_html_and_metadata(archive_path: Path) -> tuple[Optional[str], Optional[dict]]:
"""Load HTML content and metadata from archive directory."""
html_content = None
metadata = None
# Load metadata
metadata_file = archive_path / 'metadata.yaml'
if metadata_file.exists():
try:
with open(metadata_file, 'r', encoding='utf-8') as f:
metadata = yaml.safe_load(f)
except Exception as e:
print(f" Warning: Failed to load {metadata_file}: {e}")
# Load HTML from pages/ or rendered.html
html_paths = [
archive_path / 'pages' / 'index.html',
archive_path / 'rendered.html',
]
# Also check mirror directories for legacy archives
mirror_dir = archive_path / 'mirror'
if mirror_dir.exists():
for subdir in mirror_dir.iterdir():
if subdir.is_dir():
html_paths.append(subdir / 'index.html')
for html_path in html_paths:
if html_path.exists():
try:
with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
html_content = f.read()
break
except Exception as e:
print(f" Warning: Failed to load {html_path}: {e}")
return html_content, metadata
def extract_entry_number(filename: str) -> str:
"""Extract entry number from filename."""
match = re.match(r'^(\d+)', filename)
return match.group(1) if match else filename.replace('.yaml', '')
def process_entry(filepath: Path, dry_run: bool = False) -> tuple[bool, Optional[str], list[str]]:
"""
Process a single entry file to extract CustodianName.
Returns: (extracted, custodian_name, errors)
"""
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if not data:
return False, None, ["Empty file"]
# Skip if already has custodian_name
if data.get('custodian_name') and data['custodian_name'].get('claim_value'):
return False, data['custodian_name'].get('claim_value'), []
entry_num = extract_entry_number(filepath.name)
errors = []
# Get web archive path
archive_path = get_web_archive_path(data, entry_num)
if not archive_path or not archive_path.exists():
return False, None, [f"No web archive found for entry {entry_num}"]
# Load HTML and metadata
html_content, metadata = load_html_and_metadata(archive_path)
if not html_content and not metadata:
return False, None, [f"No HTML content or metadata in {archive_path}"]
# Determine HTML file path for provenance
html_file_path = str(archive_path.relative_to(ENTRIES_DIR))
if metadata and metadata.get('pages'):
html_file_path = metadata['pages'][0].get('archived_file', html_file_path)
# Extract custodian name
name_claim = extract_custodian_name(html_content or '', html_file_path, metadata)
if not name_claim:
return False, None, [f"Could not extract CustodianName from {entry_num}"]
# Add provenance metadata
name_claim['source_url'] = data.get('web_enrichment', {}).get('web_archives', [{}])[0].get('url', '')
if not name_claim['source_url']:
name_claim['source_url'] = data.get('original_entry', {}).get('webadres_organisatie', '')
name_claim['retrieved_on'] = metadata.get('archive_timestamp') if metadata else None
name_claim['extraction_timestamp'] = datetime.now(timezone.utc).isoformat()
if not dry_run:
# Add custodian_name to entry data
data['custodian_name'] = name_claim
# Write back
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
return True, name_claim.get('claim_value'), errors
def main():
parser = argparse.ArgumentParser(description='Extract CustodianName from archived websites')
parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
parser.add_argument('--force', action='store_true', help='Re-extract even if custodian_name exists')
args = parser.parse_args()
if not HAS_DEPS:
print("Error: Required dependency beautifulsoup4 not installed.")
print("Run: pip install beautifulsoup4")
return 1
# Find entry files
if args.entry:
files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
else:
files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
if args.limit:
files = files[:args.limit]
total_extracted = 0
total_skipped = 0
total_failed = 0
print(f"Processing {len(files)} entries...")
for filepath in files:
if filepath.is_dir():
continue
# Skip if already has custodian_name (unless --force)
if not args.force:
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if data and data.get('custodian_name', {}).get('claim_value'):
total_skipped += 1
continue
extracted, name, errors = process_entry(filepath, dry_run=args.dry_run)
if extracted:
total_extracted += 1
print(f"{filepath.name}: {name}")
elif name:
total_skipped += 1 # Already had name
else:
total_failed += 1
for e in errors:
print(f"{filepath.name}: {e}")
print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
print(f" Extracted: {total_extracted}")
print(f" Skipped (already have name): {total_skipped}")
print(f" Failed (no archive/name): {total_failed}")
return 0 if total_failed == 0 else 1
if __name__ == '__main__':
sys.exit(main())