430 lines
15 KiB
Python
430 lines
15 KiB
Python
#!/usr/bin/env python3
"""
Extract CustodianName from archived website HTML with XPath provenance.

This script extracts the official emic institution name from archived websites
following the WebObservation provenance rules defined in AGENTS.md Rule 6.

CustodianName sources (in priority order):
1. <title> tag - Often contains "Museum Name - Tagline" pattern
2. og:site_name meta tag - Clean site/organization name
3. og:title meta tag - Page title for social sharing
4. First <h1> element - Often the main institution name
5. Footer "Over" section heading - Dutch pattern "Over [Institution]"

Output: Adds custodian_name field to entry YAML with XPath provenance

Usage:
    python scripts/extract_custodian_name.py [--limit N] [--entry ENTRY_NUM] [--dry-run]
"""
|
import argparse
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import yaml
|
|
|
|
# BeautifulSoup is required for HTML parsing. Degrade gracefully at import
# time so main() can report a clear error (and exit 1) instead of the whole
# module crashing with an ImportError.
try:
    from bs4 import BeautifulSoup

    HAS_DEPS = True
except ImportError:
    HAS_DEPS = False
    # Printed at import time so the hint appears even when this module is
    # imported rather than run as a script.
    print("Warning: Missing dependency: beautifulsoup4")
    print("Install with: pip install beautifulsoup4")
|
# Directories
# NOTE(review): absolute, machine-specific path — consider deriving this from
# an environment variable or a repo-relative path so the script is portable.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
# Per-entry web archive snapshots live under entries/web/.
WEB_DIR = ENTRIES_DIR / 'web'
|
# Common title suffixes/taglines to strip (case-insensitive patterns).
# Order matters: these are applied sequentially to the same string.
TITLE_STRIP_PATTERNS = [
    # Dutch patterns
    r'\s*[-–—|]\s*(?:home(?:page)?|welkom|startpagina|website).*$',
    r'\s*[-–—|]\s*(?:het\s+)?(?:museum|archief|bibliotheek|galerie).*$',
    r'\s*[-–—|]\s*(?:mooie\s+)?tentoonstellingen.*$',
    r'\s*[-–—|]\s*ontdek\s+.*$',
    r'\s*[-–—|]\s*bezoek\s+.*$',
    # English patterns
    r'\s*[-–—|]\s*(?:the\s+)?official\s+(?:website|site).*$',
    r'\s*[-–—|]\s*(?:welcome|home|main).*$',
    r'\s*[-–—|]\s*(?:museum|archive|library|gallery).*$',
    # Generic separators with taglines
    r'\s*[-–—|:]\s*[^|–—-]{30,}$',  # Long taglines after separator
]


def clean_institution_name(name: str) -> str:
    """Reduce a raw page title to the bare institution name.

    Collapses whitespace, strips known tagline/suffix patterns (Dutch and
    English), then trims leftover separator punctuation.
    """
    if not name:
        return ""

    # Collapse any run of whitespace to single spaces before matching.
    collapsed = ' '.join(name.split())

    # Apply each strip pattern in turn to the progressively cleaned string.
    for stripper in TITLE_STRIP_PATTERNS:
        collapsed = re.sub(stripper, '', collapsed, flags=re.IGNORECASE)

    # Drop dangling separators left at either end, then re-collapse.
    trimmed = collapsed.strip(' -–—|:')
    return ' '.join(trimmed.split())
|
|
def get_xpath(element) -> str:
    """Generate XPath for an element (same as fetch_website_playwright.py).

    Builds an absolute, 1-indexed path such as ``/html[1]/body[1]/h1[1]`` by
    walking ``element.parent`` upward and counting same-tag preceding siblings
    at every level.
    """
    parts = []
    # Stop when we run out of parents or reach a node without a tag name.
    while element and element.name:
        # XPath positions are 1-based among siblings of the same tag.
        siblings = element.find_previous_siblings(element.name)
        index = len(siblings) + 1
        parts.insert(0, f"{element.name}[{index}]")
        element = element.parent
    # NOTE(review): the BeautifulSoup root object has name '[document]', so
    # generated paths presumably start with '/[document][1]' — assumed to be
    # intentional for parity with fetch_website_playwright.py; confirm there
    # before changing.
    return '/' + '/'.join(parts) if parts else '/'
|
|
|
def extract_name_from_title(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Build a custodian_name claim from the document's <title> tag.

    Returns None when there is no <title>, it has no plain string content,
    or the cleaned name is too short to be meaningful.
    """
    title_tag = soup.find('title')
    if title_tag is None or not title_tag.string:
        return None

    raw = title_tag.string.strip()
    cleaned = clean_institution_name(raw)
    if not cleaned or len(cleaned) <= 2:
        return None

    return {
        'claim_type': 'custodian_name',
        'claim_value': cleaned,
        'raw_value': raw,
        'extraction_source': 'title_tag',
        'xpath': get_xpath(title_tag),
        'html_file': html_file,
        'xpath_match_score': 1.0,
    }
|
|
|
def extract_name_from_meta_og(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Build a custodian_name claim from OpenGraph meta tags.

    Checks og:site_name before og:title, since the former is usually the
    clean organization name rather than a page-specific title.
    """
    for og_key in ('og:site_name', 'og:title'):
        meta_tag = soup.find('meta', property=og_key)
        if meta_tag is None or not meta_tag.get('content'):
            continue

        raw = meta_tag['content'].strip()
        cleaned = clean_institution_name(raw)
        if not cleaned or len(cleaned) <= 2:
            continue

        return {
            'claim_type': 'custodian_name',
            'claim_value': cleaned,
            'raw_value': raw,
            'extraction_source': f'meta_{og_key.replace(":", "_")}',
            'xpath': get_xpath(meta_tag),
            'html_file': html_file,
            'xpath_match_score': 1.0,
        }
    return None
|
|
|
def extract_name_from_h1(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Build a custodian_name claim from the first <h1> on the page.

    The text is used verbatim (no tagline cleaning) and rejected when it is
    implausibly short (<= 2 chars) or long (>= 100 chars) for a name.
    """
    heading = soup.find('h1')
    if heading is None:
        return None

    text = heading.get_text(strip=True)
    if not text or not (2 < len(text) < 100):
        return None

    return {
        'claim_type': 'custodian_name',
        'claim_value': text,
        'raw_value': text,
        'extraction_source': 'h1_tag',
        'xpath': get_xpath(heading),
        'html_file': html_file,
        'xpath_match_score': 1.0,
    }
|
|
|
def extract_name_from_footer_over(soup: BeautifulSoup, html_file: str) -> Optional[dict]:
    """Build a custodian_name claim from the Dutch "Over [Institution]" pattern.

    Common pattern on Dutch museum websites:
    - Footer section heading: "Over Museum Catharijneconvent"
    - Link text: "Over het museum"

    Generic phrasings like "Over ons" are rejected, and the confidence score
    is slightly lower than for title/meta sources.
    """
    footer = soup.find('footer')
    if footer is None:
        return None

    # Generic "about" phrasings that carry no institution name.
    generic = {'ons', 'het museum', 'de organisatie'}

    for heading in footer.find_all('h3'):
        text = heading.get_text(strip=True)
        match = re.match(r'^Over\s+(.+)$', text, re.IGNORECASE)
        if not match:
            continue

        name = match.group(1).strip()
        if len(name) > 2 and name.lower() not in generic:
            return {
                'claim_type': 'custodian_name',
                'claim_value': name,
                'raw_value': text,
                'extraction_source': 'footer_over_heading',
                'xpath': get_xpath(heading),
                'html_file': html_file,
                'xpath_match_score': 0.9,  # Slightly lower confidence
            }
    return None
|
|
|
def extract_name_from_metadata_title(metadata: dict) -> Optional[dict]:
    """Build a custodian_name claim from metadata.yaml pages[0].title.

    This title was captured by the archiver at fetch time, so no HTML parsing
    (and hence no XPath) is involved.
    """
    pages = metadata.get('pages', [])
    if not pages or not pages[0].get('title'):
        return None

    raw = pages[0]['title']
    cleaned = clean_institution_name(raw)
    if not cleaned or len(cleaned) <= 2:
        return None

    return {
        'claim_type': 'custodian_name',
        'claim_value': cleaned,
        'raw_value': raw,
        'extraction_source': 'metadata_page_title',
        'html_file': pages[0].get('archived_file', 'unknown'),
        'xpath_match_score': 1.0,  # Title is reliable
    }
|
|
|
def extract_custodian_name(
    html_content: str,
    html_file_path: str,
    metadata: Optional[dict] = None
) -> Optional[dict]:
    """Extract a CustodianName claim from HTML content with XPath provenance.

    Sources are tried in priority order, first hit wins:
    1. metadata.yaml page title (if available, most reliable)
    2. <title> tag
    3. og:site_name / og:title meta tags
    4. First <h1> element
    5. Footer "Over [Name]" pattern

    Returns a dict with claim_type, claim_value, xpath, html_file, etc.,
    or None when no source yields a usable name.
    """
    # The archiver-captured title (if any) wins without parsing the HTML.
    if metadata:
        from_metadata = extract_name_from_metadata_title(metadata)
        if from_metadata:
            return from_metadata

    soup = BeautifulSoup(html_content, 'html.parser')

    # HTML-based extractors, highest priority first.
    for extractor in (
        extract_name_from_title,
        extract_name_from_meta_og,
        extract_name_from_h1,
        extract_name_from_footer_over,
    ):
        claim = extractor(soup, html_file_path)
        if claim:
            return claim

    return None
|
|
|
def get_web_archive_path(entry_data: dict, entry_num: str) -> Optional[Path]:
    """Locate the web-archive directory for an entry.

    Prefers the directory recorded under web_enrichment.web_archives; falls
    back to the first subdirectory of web/{entry_num}/ for legacy archives.
    """
    archives = entry_data.get('web_enrichment', {}).get('web_archives', [])
    if archives:
        # Use first archive
        recorded = archives[0].get('directory')
        if recorded:
            return ENTRIES_DIR / recorded

    # Fallback: first subdirectory found under web/{entry_num}/.
    legacy_root = WEB_DIR / entry_num
    if legacy_root.exists():
        for child in legacy_root.iterdir():
            if child.is_dir():
                return child

    return None
|
|
|
def load_html_and_metadata(archive_path: Path) -> tuple[Optional[str], Optional[dict]]:
    """Load HTML content and metadata.yaml from an archive directory.

    Either element of the returned (html_content, metadata) tuple may be
    None; load failures are reported as warnings, not raised.
    """
    # --- metadata.yaml (optional) ---
    metadata = None
    meta_path = archive_path / 'metadata.yaml'
    if meta_path.exists():
        try:
            with open(meta_path, 'r', encoding='utf-8') as fh:
                metadata = yaml.safe_load(fh)
        except Exception as e:
            print(f" Warning: Failed to load {meta_path}: {e}")

    # --- HTML: pages/index.html, rendered.html, then legacy mirror dirs ---
    candidates = [
        archive_path / 'pages' / 'index.html',
        archive_path / 'rendered.html',
    ]
    mirror_root = archive_path / 'mirror'
    if mirror_root.exists():
        candidates.extend(
            sub / 'index.html' for sub in mirror_root.iterdir() if sub.is_dir()
        )

    html_content = None
    for candidate in candidates:
        if not candidate.exists():
            continue
        try:
            # errors='replace' tolerates archives with broken encodings.
            with open(candidate, 'r', encoding='utf-8', errors='replace') as fh:
                html_content = fh.read()
            break
        except Exception as e:
            print(f" Warning: Failed to load {candidate}: {e}")

    return html_content, metadata
|
|
|
def extract_entry_number(filename: str) -> str:
    """Return the leading digit run of *filename*, e.g. '0042_x.yaml' -> '0042'.

    Falls back to the name with '.yaml' removed when it does not start with
    digits.
    """
    digits = re.match(r'^(\d+)', filename)
    if digits:
        return digits.group(1)
    return filename.replace('.yaml', '')
|
|
def process_entry(filepath: Path, dry_run: bool = False, force: bool = False) -> tuple[bool, Optional[str], list[str]]:
    """
    Process a single entry file to extract CustodianName.

    Args:
        filepath: Path to the entry YAML file.
        dry_run: When True, extract but do not write the file back.
        force: When True, re-extract even if a custodian_name claim already
            exists. (Previously this function always skipped existing claims,
            which made the CLI --force flag ineffective.)

    Returns: (extracted, custodian_name, errors)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return False, None, ["Empty file"]

    # Skip if already has a claim (unless forced). The isinstance guard avoids
    # an AttributeError when custodian_name is a null/scalar value in the YAML.
    existing = data.get('custodian_name')
    if not force and isinstance(existing, dict) and existing.get('claim_value'):
        return False, existing.get('claim_value'), []

    entry_num = extract_entry_number(filepath.name)
    errors = []

    # Get web archive path
    archive_path = get_web_archive_path(data, entry_num)
    if not archive_path or not archive_path.exists():
        return False, None, [f"No web archive found for entry {entry_num}"]

    # Load HTML and metadata
    html_content, metadata = load_html_and_metadata(archive_path)
    if not html_content and not metadata:
        return False, None, [f"No HTML content or metadata in {archive_path}"]

    # Determine HTML file path for provenance
    html_file_path = str(archive_path.relative_to(ENTRIES_DIR))
    if metadata and metadata.get('pages'):
        html_file_path = metadata['pages'][0].get('archived_file', html_file_path)

    # Extract custodian name
    name_claim = extract_custodian_name(html_content or '', html_file_path, metadata)

    if not name_claim:
        return False, None, [f"Could not extract CustodianName from {entry_num}"]

    # Add provenance metadata. The `or` fallbacks are required: dict.get
    # defaults only apply when the key is *absent*, so an existing-but-empty
    # web_archives list would otherwise raise IndexError on [0].
    web_archives = (data.get('web_enrichment') or {}).get('web_archives') or [{}]
    name_claim['source_url'] = web_archives[0].get('url', '')
    if not name_claim['source_url']:
        name_claim['source_url'] = (data.get('original_entry') or {}).get('webadres_organisatie', '')

    name_claim['retrieved_on'] = metadata.get('archive_timestamp') if metadata else None
    name_claim['extraction_timestamp'] = datetime.now(timezone.utc).isoformat()

    if not dry_run:
        # Add custodian_name to entry data and write back in place.
        data['custodian_name'] = name_claim
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True, name_claim.get('claim_value'), errors
|
|
|
def main():
    """CLI entry point: extract CustodianName for enriched entry files.

    Returns:
        0 when no entry failed; 1 when the beautifulsoup4 dependency is
        missing or at least one entry could not be processed.
    """
    parser = argparse.ArgumentParser(description='Extract CustodianName from archived websites')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
    parser.add_argument('--force', action='store_true', help='Re-extract even if custodian_name exists')
    args = parser.parse_args()

    if not HAS_DEPS:
        print("Error: Required dependency beautifulsoup4 not installed.")
        print("Run: pip install beautifulsoup4")
        return 1

    # Find entry files; both branches are sorted for deterministic order.
    if args.entry:
        files = sorted(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
    else:
        files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])

    if args.limit:
        files = files[:args.limit]

    total_extracted = 0
    total_skipped = 0
    total_failed = 0

    print(f"Processing {len(files)} entries...")

    for filepath in files:
        if filepath.is_dir():
            continue

        # Skip entries that already carry a claim (unless --force). The
        # isinstance guard fixes a crash when custodian_name exists but is
        # null or a scalar: .get('custodian_name', {}) would return None and
        # the chained .get would raise AttributeError.
        if not args.force:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            existing = (data or {}).get('custodian_name')
            if isinstance(existing, dict) and existing.get('claim_value'):
                total_skipped += 1
                continue

        # TODO(review): process_entry independently skips entries with an
        # existing claim, so --force currently only bypasses the pre-check
        # above; thread the flag into process_entry to make --force fully
        # effective.
        extracted, name, errors = process_entry(filepath, dry_run=args.dry_run)

        if extracted:
            total_extracted += 1
            print(f" ✓ {filepath.name}: {name}")
        elif name:
            total_skipped += 1  # Already had name
        else:
            total_failed += 1
            for e in errors:
                print(f" ✗ {filepath.name}: {e}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Extracted: {total_extracted}")
    print(f" Skipped (already have name): {total_skipped}")
    print(f" Failed (no archive/name): {total_failed}")

    return 0 if total_failed == 0 else 1
|
|
if __name__ == '__main__':
    # Propagate main()'s status code (0 = success, 1 = failures) to the shell.
    sys.exit(main())