#!/usr/bin/env python3
"""
Extract structured claims from archived website HTML with XPath provenance.

This script extracts verifiable data from archived HTML files following
the WebObservation provenance rules defined in AGENTS.md Rule 6.

EVERY claim MUST have:
- claim_type: Type of claim (org_name, description, email, phone, address, etc.)
- claim_value: The extracted value
- source_url: URL the claim was extracted from
- retrieved_on: ISO 8601 timestamp when page was archived
- xpath: XPath to the element containing this value
- html_file: Relative path to archived HTML file
- xpath_match_score: 1.0 for exact match, <1.0 for fuzzy match

Claims WITHOUT XPath provenance are FABRICATED and must NOT be stored.

Usage:
    python scripts/extract_html_claims.py [--limit N] [--entry ENTRY_NUM] [--dry-run]
"""
|
||
|
||
import argparse
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import yaml
|
||
|
||
# Placeholders for the optional third-party parsers; filled in below when the
# imports succeed.  Typed as Any so call sites stay analyzable even when a
# dependency is absent.
etree: Any = None
BeautifulSoup: Any = None

try:
    from lxml import etree as _etree
    etree = _etree
    HAS_LXML = True
except ImportError:
    HAS_LXML = False
    print("Warning: Missing dependency: lxml")
    print("Install with: pip install lxml")

try:
    from bs4 import BeautifulSoup as _BeautifulSoup
    BeautifulSoup = _BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False
    print("Warning: Missing dependency: beautifulsoup4")
    print("Install with: pip install beautifulsoup4")

# lxml supplies the XPath support that provenance requires; bs4 is only a
# fallback helper, so it does not gate execution.
HAS_DEPS = HAS_LXML  # Only lxml is required for this script
|
||
|
||
|
||
# Directories.  The default points at the original author's local checkout;
# set GLAM_ENTRIES_DIR to run this script on another machine (backward
# compatible: without the env var, behavior is unchanged).
ENTRIES_DIR = Path(os.environ.get(
    'GLAM_ENTRIES_DIR',
    '/Users/kempersc/apps/glam/data/nde/enriched/entries',
))
WEB_DIR = ENTRIES_DIR / 'web'


# Claim types to extract: claim_type key -> human-readable description.
CLAIM_TYPES = {
    'org_name': 'Organization/institution official name',
    'org_name_alt': 'Alternative organization name',
    'tagline': 'Organization tagline or slogan',
    'description': 'Organization description',
    'description_short': 'Short description (meta description)',
    'email': 'Email address',
    'phone': 'Phone number',
    'address': 'Physical address',
    'postal_code': 'Postal code',
    'city': 'City name',
    'opening_hours_text': 'Opening hours as text',
    'social_twitter': 'Twitter/X URL',
    'social_facebook': 'Facebook URL',
    'social_instagram': 'Instagram URL',
    'social_linkedin': 'LinkedIn URL',
    'social_youtube': 'YouTube URL',
}
|
||
|
||
|
||
def get_xpath_lxml(element) -> str:
    """Return the absolute XPath of *element* within its lxml document."""
    return element.getroottree().getpath(element)
|
||
|
||
|
||
def get_xpath_bs4(element) -> str:
    """Generate an absolute XPath for a BeautifulSoup element.

    Walks from *element* up to the document root, emitting one
    ``tag[index]`` step per ancestor, where index is the 1-based position
    among preceding siblings with the same tag name (as XPath requires).

    Fix over the original: the BeautifulSoup document object reports the
    pseudo tag name ``'[document]'``; the original included it, producing
    invalid paths like ``/[document][1]/html[1]``.  It is now skipped.
    """
    parts = []
    current = element
    # `is not None` rather than truthiness: bs4 tags are falsy when empty.
    while current is not None and current.name:
        if current.name == '[document]':
            # Pseudo-node for the document itself -- not a real element.
            break
        # Count same-named preceding siblings without materializing a list.
        index = sum(1 for _ in current.find_previous_siblings(current.name)) + 1
        parts.insert(0, f"{current.name}[{index}]")
        current = current.parent
    return '/' + '/'.join(parts) if parts else '/'
|
||
|
||
|
||
def create_claim(
    claim_type: str,
    claim_value: str,
    xpath: str,
    html_file: str,
    source_url: str,
    retrieved_on: str,
    raw_value: Optional[str] = None,
    extraction_method: str = 'html_parser',
    xpath_match_score: float = 1.0,
) -> Dict[str, Any]:
    """Build one claim dict carrying full WebObservation provenance.

    ``claim_value`` is stored whitespace-stripped; ``raw_value`` preserves
    the text as found (defaulting to the unstripped claim value).
    """
    cleaned = claim_value.strip() if claim_value else ''
    claim = {
        'claim_type': claim_type,
        'claim_value': cleaned,
        'raw_value': raw_value or claim_value,
        'source_url': source_url,
        'retrieved_on': retrieved_on,
        'xpath': xpath,
        'html_file': html_file,
        'xpath_match_score': xpath_match_score,
        'extraction_method': extraction_method,
        # Record when *this extraction run* produced the claim.
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
    }
    return claim
|
||
|
||
|
||
# === Extractors for specific claim types ===
|
||
|
||
def extract_title_claims(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract organization-name claims from every <title> tag."""
    claims = []
    for title in tree.xpath('//title'):
        if not title.text:
            continue
        raw_text = title.text.strip()
        # Page titles usually read "Org Name - page title" (or with |, a
        # dash variant, or a colon); keep only the part before the first
        # separator found.
        clean_name = raw_text
        for sep in (' - ', ' | ', ' – ', ' — ', ': '):
            if sep in raw_text:
                clean_name = raw_text.split(sep)[0].strip()
                break
        claims.append(create_claim(
            claim_type='org_name',
            claim_value=clean_name,
            xpath=get_xpath_lxml(title),
            html_file=html_file,
            source_url=source_url,
            retrieved_on=retrieved_on,
            raw_value=raw_text,
            extraction_method='title_tag',
        ))
    return claims
|
||
|
||
|
||
def extract_meta_description(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract short-description claims from meta description tags.

    Scans both the standard ``<meta name="description">`` and the
    OpenGraph ``<meta property="og:description">`` tags.

    Fix over the original: the content string and its XPath are now read
    from the same element.  The original ran two parallel XPath queries
    (one selecting ``@content``, one selecting the elements) and paired
    them by index, which misattributes XPaths whenever a matching meta tag
    lacks a content attribute (the attribute query skips it, the element
    query does not).
    """
    claims = []

    # (selector, extraction_method) pairs to scan.
    sources = [
        ('//meta[@name="description"]', 'meta_description'),
        ('//meta[@property="og:description"]', 'og_description'),
    ]
    for selector, method in sources:
        for element in tree.xpath(selector):
            content = element.get('content')
            if content and content.strip():
                claims.append(create_claim(
                    claim_type='description_short',
                    claim_value=content.strip(),
                    xpath=get_xpath_lxml(element),
                    html_file=html_file,
                    source_url=source_url,
                    retrieved_on=retrieved_on,
                    extraction_method=method,
                ))

    return claims
|
||
|
||
|
||
def extract_og_site_name(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract organization-name claims from ``og:site_name`` meta tags.

    Fix over the original: ``content`` is read from each matched element
    directly instead of pairing two parallel XPath result lists by index,
    which could attach the wrong XPath when a tag lacks the content
    attribute (the attribute query skips such tags, the element query
    does not).
    """
    claims = []
    for element in tree.xpath('//meta[@property="og:site_name"]'):
        content = element.get('content')
        if content and content.strip():
            claims.append(create_claim(
                claim_type='org_name',
                claim_value=content.strip(),
                xpath=get_xpath_lxml(element),
                html_file=html_file,
                source_url=source_url,
                retrieved_on=retrieved_on,
                extraction_method='og_site_name',
            ))
    return claims
|
||
|
||
|
||
def extract_schema_org(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract claims from schema.org JSON-LD <script> blocks."""
    import json

    claims = []
    for script in tree.xpath('//script[@type="application/ld+json"]'):
        if not script.text:
            continue
        try:
            data = json.loads(script.text)
        except json.JSONDecodeError:
            # Malformed JSON-LD is common in the wild; skip it quietly.
            continue
        script_xpath = get_xpath_lxml(script)
        # A block may hold a single object or a list of them.
        items = data if isinstance(data, list) else [data]
        for item in items:
            claims.extend(_extract_schema_item(
                item, script_xpath, html_file, source_url, retrieved_on))
    return claims
|
||
|
||
|
||
def _extract_schema_item(item: dict, xpath: str, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract claims from one schema.org JSON-LD item.

    Organization-like items yield org_name/description claims; event items
    are excluded from those fields so an event title is never stored as an
    organization name.  Address, phone, email and sameAs links are taken
    from any item type.

    Fixes over the original: non-dict items and non-string field values
    (both legal in JSON-LD, e.g. a localized name object or a list of
    telephones) no longer raise TypeError/AttributeError -- they are
    skipped -- and empty-string field values no longer produce claims
    with an empty claim_value.
    """
    claims: List[Dict] = []

    # JSON-LD arrays may contain bare strings or nulls; nothing to extract.
    if not isinstance(item, dict):
        return claims

    # '@type' may be a string or a list of strings.
    item_type = item.get('@type', '')
    if isinstance(item_type, list):
        item_type = item_type[0] if item_type else ''
    if not isinstance(item_type, str):
        item_type = ''

    # Organization types whose 'name' should be extracted as org_name.
    org_types = {
        'Organization', 'LocalBusiness', 'Museum', 'Library', 'Archive',
        'EducationalOrganization', 'GovernmentOrganization', 'NGO',
        'Corporation', 'Place', 'CivicStructure', 'LandmarksOrHistoricalBuildings',
        'PerformingArtsTheater', 'MovieTheater', 'Zoo', 'Aquarium',
    }

    # Event types -- their 'name' is an event title, not an org name.
    event_types = {'Event', 'BusinessEvent', 'ChildrensEvent', 'ComedyEvent',
                   'CourseInstance', 'DanceEvent', 'DeliveryEvent', 'EducationEvent',
                   'EventSeries', 'ExhibitionEvent', 'Festival', 'FoodEvent',
                   'Hackathon', 'LiteraryEvent', 'MusicEvent', 'PublicationEvent',
                   'SaleEvent', 'ScreeningEvent', 'SocialEvent', 'SportsEvent',
                   'TheaterEvent', 'VisualArtsEvent'}

    # NOTE: substring matching (e.g. 'Museum' in 'schema:Museum') tolerates
    # namespaced @type values; a missing @type is treated as an org.
    is_org = any(t in item_type for t in org_types) or not item_type
    is_event = any(t in item_type for t in event_types)

    def _add(claim_type: str, value, method: str) -> None:
        # Emit a claim only for non-empty string values.
        if isinstance(value, str) and value:
            claims.append(create_claim(
                claim_type=claim_type,
                claim_value=value,
                xpath=xpath,
                html_file=html_file,
                source_url=source_url,
                retrieved_on=retrieved_on,
                extraction_method=method,
            ))

    # Organization name -- only for org types, never for events.
    if 'name' in item and is_org and not is_event:
        name_value = item['name']
        # Skip values that look like HTML/code or are implausibly long.
        if isinstance(name_value, str) and '<' not in name_value and len(name_value) < 200:
            _add('org_name', name_value, 'schema_org_name')

    # Description -- only for organizations; skip page-builder markup
    # (the 'vc_row' token marks WPBakery shortcode soup).
    if 'description' in item and is_org and not is_event:
        desc_value = item['description']
        if isinstance(desc_value, str) and '<' not in desc_value and 'vc_row' not in desc_value:
            _add('description', desc_value, 'schema_org_description')

    # Address: either a plain string or a PostalAddress object.
    if 'address' in item:
        addr = item['address']
        if isinstance(addr, str):
            _add('address', addr, 'schema_org_address')
        elif isinstance(addr, dict):
            _add('address', addr.get('streetAddress'), 'schema_org_streetAddress')
            _add('postal_code', addr.get('postalCode'), 'schema_org_postalCode')
            _add('city', addr.get('addressLocality'), 'schema_org_addressLocality')

    # Direct contact fields (any item type).
    _add('phone', item.get('telephone'), 'schema_org_telephone')
    _add('email', item.get('email'), 'schema_org_email')

    # Social media profiles from sameAs (string or list of strings).
    if 'sameAs' in item:
        same_as = item['sameAs'] if isinstance(item['sameAs'], list) else [item['sameAs']]
        domain_map = [
            (('twitter.com', 'x.com'), 'social_twitter'),
            (('facebook.com',), 'social_facebook'),
            (('instagram.com',), 'social_instagram'),
            (('linkedin.com',), 'social_linkedin'),
            (('youtube.com',), 'social_youtube'),
        ]
        for url in same_as:
            if not isinstance(url, str):
                continue
            for domains, claim_type in domain_map:
                if any(d in url for d in domains):
                    _add(claim_type, url, 'schema_org_sameAs')
                    break

    return claims
|
||
|
||
|
||
def extract_email_links(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract email-address claims from ``mailto:`` anchors."""
    claims = []
    for link in tree.xpath('//a[starts-with(@href, "mailto:")]'):
        href = link.get('href', '')
        if not href.startswith('mailto:'):
            continue
        # Drop the scheme and any ?subject=... query parameters.
        email = href[7:].split('?')[0]
        if email and '@' in email:
            claims.append(create_claim(
                claim_type='email',
                claim_value=email,
                xpath=get_xpath_lxml(link),
                html_file=html_file,
                source_url=source_url,
                retrieved_on=retrieved_on,
                extraction_method='mailto_link',
            ))
    return claims
|
||
|
||
|
||
def extract_phone_links(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract phone-number claims from ``tel:`` anchors."""
    claims = []
    for link in tree.xpath('//a[starts-with(@href, "tel:")]'):
        href = link.get('href', '')
        if not href.startswith('tel:'):
            continue
        number = href[4:]  # drop the 'tel:' scheme
        if number:
            claims.append(create_claim(
                claim_type='phone',
                claim_value=number,
                xpath=get_xpath_lxml(link),
                html_file=html_file,
                source_url=source_url,
                retrieved_on=retrieved_on,
                extraction_method='tel_link',
            ))
    return claims
|
||
|
||
|
||
def extract_social_links(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract social-media profile claims from anchor hrefs.

    Fix over the original: matching is done on the URL's parsed host
    instead of a raw substring test, so e.g. ``https://wax.com/...`` no
    longer matches the ``x.com`` pattern and a domain appearing only in a
    URL's path is not mistaken for a social link.  Subdomains such as
    ``www.facebook.com`` or ``m.facebook.com`` still match; relative
    hrefs (empty host) are skipped.
    """
    claims = []
    social_patterns = {
        'social_twitter': ['twitter.com', 'x.com'],
        'social_facebook': ['facebook.com'],
        'social_instagram': ['instagram.com'],
        'social_linkedin': ['linkedin.com'],
        'social_youtube': ['youtube.com'],
    }

    for link in tree.xpath('//a[@href]'):
        href = link.get('href', '')
        # netloc may carry a port (host:443); strip it before comparing.
        host = urlparse(href).netloc.lower().split(':')[0]
        if not host:
            continue
        for claim_type, domains in social_patterns.items():
            if any(host == d or host.endswith('.' + d) for d in domains):
                claims.append(create_claim(
                    claim_type=claim_type,
                    claim_value=href,
                    xpath=get_xpath_lxml(link),
                    html_file=html_file,
                    source_url=source_url,
                    retrieved_on=retrieved_on,
                    extraction_method='social_link',
                ))
                break  # one claim type per link
    return claims
|
||
|
||
|
||
def extract_h1_org_name(tree, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Extract an organization-name claim from the page's first <h1>."""
    h1s = tree.xpath('//h1')
    if not h1s:
        return []
    first = h1s[0]
    text = ''.join(first.itertext()).strip()
    # Reject near-empty headings and long headline-style text.
    if not (2 < len(text) < 150):
        return []
    return [create_claim(
        claim_type='org_name',
        claim_value=text,
        xpath=get_xpath_lxml(first),
        html_file=html_file,
        source_url=source_url,
        retrieved_on=retrieved_on,
        extraction_method='h1_tag',
        xpath_match_score=0.9,  # a heading is weaker evidence than metadata
    )]
|
||
|
||
|
||
def extract_all_claims(html_content: str, html_file: str, source_url: str, retrieved_on: str) -> List[Dict]:
    """Run every extractor over one HTML document and pool their claims.

    Extractor failures are isolated: one failing extractor prints a
    warning and the remaining extractors still run.
    """
    claims: List[Dict] = []

    try:
        # lxml tree gives us real XPath support for provenance.
        tree = etree.HTML(html_content)

        extractors = (
            extract_title_claims,
            extract_meta_description,
            extract_og_site_name,
            extract_schema_org,
            extract_email_links,
            extract_phone_links,
            extract_social_links,
            extract_h1_org_name,
        )
        for extractor in extractors:
            try:
                claims.extend(extractor(tree, html_file, source_url, retrieved_on))
            except Exception as e:
                print(f" Warning: Extractor {extractor.__name__} failed: {e}")

    except Exception as e:
        print(f" Error parsing HTML: {e}")

    return claims
|
||
|
||
|
||
def deduplicate_claims(claims: List[Dict]) -> List[Dict]:
    """Collapse duplicate (claim_type, claim_value) pairs.

    When the same value was found by several extractors, the claim with
    the highest xpath_match_score wins; ties keep the earlier claim.
    """
    best: Dict[tuple, Dict] = {}
    for candidate in claims:
        key = (candidate['claim_type'], candidate['claim_value'])
        current = best.get(key)
        if current is None or candidate['xpath_match_score'] > current['xpath_match_score']:
            best[key] = candidate
    return list(best.values())
|
||
|
||
|
||
def get_web_archive_path(entry_data: dict, entry_num: str) -> Optional[Path]:
    """Resolve the web-archive directory for an entry, or None if absent.

    Prefers the directory recorded in the entry's own web_enrichment
    metadata; otherwise falls back to the first subdirectory found under
    web/{entry_num}/.
    """
    archives = entry_data.get('web_enrichment', {}).get('web_archives', [])
    if archives:
        directory = archives[0].get('directory')
        if directory:
            return ENTRIES_DIR / directory

    # Fallback: the first subdirectory of web/{entry_num}/, if any exists.
    fallback_root = WEB_DIR / entry_num
    if fallback_root.exists():
        for candidate in fallback_root.iterdir():
            if candidate.is_dir():
                return candidate

    return None
|
||
|
||
|
||
def load_metadata(archive_path: Path) -> Optional[dict]:
    """Load and parse metadata.yaml from an archive directory, if present."""
    metadata_file = archive_path / 'metadata.yaml'
    if not metadata_file.exists():
        return None
    try:
        with open(metadata_file, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Best effort: a broken metadata file should not abort the entry.
        print(f" Warning: Failed to load {metadata_file}: {e}")
        return None
|
||
|
||
|
||
def find_html_files(archive_path: Path) -> List[Path]:
    """Collect archived HTML files: pages/, then mirror/, then the root snapshot."""
    found: List[Path] = []

    # Single-page captures live directly in pages/.
    pages_dir = archive_path / 'pages'
    if pages_dir.exists():
        found.extend(pages_dir.glob('*.html'))

    # Full WARC mirrors may nest arbitrarily deep.
    mirror_dir = archive_path / 'mirror'
    if mirror_dir.exists():
        found.extend(mirror_dir.rglob('*.html'))

    # A Playwright-rendered snapshot sits at the archive root.
    rendered = archive_path / 'rendered.html'
    if rendered.exists():
        found.append(rendered)

    return found
|
||
|
||
|
||
def extract_entry_number(filename: str) -> str:
    """Return the leading digits of *filename*, or its name sans '.yaml'."""
    digits = re.match(r'^(\d+)', filename)
    if digits:
        return digits.group(1)
    return filename.replace('.yaml', '')
|
||
|
||
|
||
def process_entry(filepath: Path, dry_run: bool = False) -> tuple[int, List[str]]:
    """
    Process a single entry file to extract HTML claims.

    Reads the entry YAML, locates its web archive, extracts claims from
    the best available HTML snapshot, deduplicates them, and (unless
    dry_run) writes them back under the entry's 'web_claims' key.

    Returns: (claims_count, errors)

    Fixes over the original: an entry whose web_enrichment.web_archives is
    an empty list (or null) no longer raises IndexError/TypeError during
    the source-url fallback, and a dead "if 'web_claims' not in data"
    pre-initialisation (its value was immediately overwritten) was removed.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return 0, ["Empty file"]

    entry_num = extract_entry_number(filepath.name)
    errors: List[str] = []
    all_claims: List[Dict] = []

    # Locate the archived website for this entry.
    archive_path = get_web_archive_path(data, entry_num)
    if not archive_path or not archive_path.exists():
        return 0, [f"No web archive found for entry {entry_num}"]

    # Provenance metadata: source URL and archive timestamp.
    metadata = load_metadata(archive_path)
    source_url = metadata.get('url', '') if metadata else ''
    retrieved_on = metadata.get('archive_timestamp', '') if metadata else ''

    if not source_url:
        # Fall back to the URL recorded in the entry itself; guard against
        # an empty/null web_archives list (the original indexed [0] blindly).
        archives = data.get('web_enrichment', {}).get('web_archives') or [{}]
        source_url = archives[0].get('url', '')
        if not source_url:
            source_url = data.get('original_entry', {}).get('webadres_organisatie', '')

    # Find and process HTML files.
    html_files = find_html_files(archive_path)
    if not html_files:
        return 0, [f"No HTML files found in {archive_path}"]

    # Pick ONE HTML file, in priority order:
    #   1. rendered.html at the archive root (Playwright render, most complete)
    #   2. index.html at the archive root (fallback)
    #   3. index.html inside the mirror/ tree (warc mirror)
    #   4. whichever HTML file comes first
    rendered_files = [f for f in html_files if f.name == 'rendered.html' and f.parent == archive_path]
    root_index = [f for f in html_files if f.name == 'index.html' and f.parent == archive_path]
    mirror_index = [f for f in html_files if f.name == 'index.html' and 'mirror' in str(f)]

    if rendered_files:
        main_html = rendered_files[0]
    elif root_index:
        main_html = root_index[0]
    elif mirror_index:
        main_html = mirror_index[0]
    else:
        main_html = html_files[0]

    try:
        # errors='replace': archived pages may carry broken encodings.
        with open(main_html, 'r', encoding='utf-8', errors='replace') as f:
            html_content = f.read()

        html_file_rel = str(main_html.relative_to(ENTRIES_DIR))
        all_claims.extend(extract_all_claims(html_content, html_file_rel, source_url, retrieved_on))
    except Exception as e:
        errors.append(f"Failed to process {main_html}: {e}")

    # Collapse values found by more than one extractor.
    all_claims = deduplicate_claims(all_claims)

    if not dry_run and all_claims:
        # Overwrite (not merge) any previous extraction for this entry.
        data['web_claims'] = {
            'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
            'source_archive': str(archive_path.relative_to(ENTRIES_DIR)),
            'claims_count': len(all_claims),
            'claims': all_claims,
        }

        # Write the enriched entry back in place.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return len(all_claims), errors
|
||
|
||
|
||
def main():
    """CLI entry point: extract claims for selected entries, print a summary."""
    parser = argparse.ArgumentParser(description='Extract structured claims from archived HTML')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
    parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
    parser.add_argument('--force', action='store_true', help='Re-extract even if web_claims exists')
    args = parser.parse_args()

    if not HAS_DEPS:
        print("Error: Required dependencies not installed.")
        print("Run: pip install beautifulsoup4 lxml")
        return 1

    # Select entry files: one specific entry, or every visible YAML file.
    if args.entry:
        files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
    else:
        files = sorted(p for p in ENTRIES_DIR.glob('*.yaml')
                       if p.is_file() and not p.name.startswith('.'))

    if args.limit:
        files = files[:args.limit]

    total_claims = 0
    total_entries = 0
    total_skipped = 0
    total_failed = 0

    print(f"Processing {len(files)} entries...")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print()

    for entry_path in files:
        if entry_path.is_dir():
            continue

        # Unless --force, leave entries that already carry claims alone.
        if not args.force:
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry_data = yaml.safe_load(f)
            if entry_data and entry_data.get('web_claims', {}).get('claims'):
                total_skipped += 1
                continue

        claims_count, errors = process_entry(entry_path, dry_run=args.dry_run)

        if claims_count > 0:
            total_entries += 1
            total_claims += claims_count
            print(f" ✓ {entry_path.name}: {claims_count} claims")
        elif errors:
            total_failed += 1
            for e in errors:
                print(f" ✗ {entry_path.name}: {e}")
        else:
            total_failed += 1
            print(f" ✗ {entry_path.name}: No claims extracted")

    print()
    print(f"{'DRY RUN - ' if args.dry_run else ''}Summary:")
    print(f" Entries with claims: {total_entries}")
    print(f" Total claims extracted: {total_claims}")
    print(f" Skipped (already have claims): {total_skipped}")
    print(f" Failed (no archive/claims): {total_failed}")

    return 0
|
||
|
||
|
||
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|