# glam/scripts/reenrich_with_xpath.py
# Snapshot captured 2025-12-21 22:12:34 +01:00 — 389 lines, 13 KiB, Python.
#!/usr/bin/env python3
"""
Re-enrich custodian files with proper xpath provenance.
Uses httpx to fetch and extract web claims.
Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an XPath
pointer to the exact location in archived HTML where that value appears."
"""
import os
import re
import json
import yaml
import subprocess
import asyncio
import httpx
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
# Absolute paths on the maintainer's machine: the JSON queue of entries to
# re-enrich, and the directory where fetched HTML snapshots are archived.
QUEUE_FILE = Path('/Users/kempersc/apps/glam/data/reenrich_queue.json')
ARCHIVE_DIR = Path('/Users/kempersc/apps/glam/data/custodian/web')
def get_xpath(element) -> str:
    """Generate an absolute XPath expression for a BeautifulSoup element.

    Walks from *element* up to the document root, emitting one path step per
    ancestor.  A positional predicate (``tag[n]``, 1-based) is added only
    when the element has same-named siblings.

    Args:
        element: A ``bs4.Tag`` (anything exposing ``.name``, ``.parent`` and
            ``.children``).

    Returns:
        An XPath string such as ``/html/body/div[2]``, or ``/`` when the
        element has no named ancestors.
    """
    parts = []
    child = element
    while child.parent and child.parent.name:
        siblings = list(child.parent.children)
        tag_siblings = [s for s in siblings if hasattr(s, 'name') and s.name == child.name]
        if len(tag_siblings) > 1:
            # BUG FIX: list.index() uses ==, and bs4 Tag equality compares
            # rendered *content*, so two identical sibling tags (e.g. two
            # equal <br/>) would both resolve to the first one's position.
            # Match by identity so each sibling gets its true 1-based index.
            index = next(i for i, s in enumerate(tag_siblings, 1) if s is child)
            parts.insert(0, f"{child.name}[{index}]")
        else:
            parts.insert(0, child.name)
        child = child.parent
    return '/' + '/'.join(parts) if parts else '/'
def extract_claims_with_xpath(html: str, source_url: str) -> list[dict]:
    """
    Extract claims from HTML with proper xpath provenance.

    Pulls the page title, meta/OpenGraph descriptions, the first three
    <h1> headings, mailto:/tel: contact links, the first social-media link
    per platform, and up to two likely address containers.  Every claim
    carries the XPath of the element it came from (per Rule 6: claims must
    point into the archived HTML).

    Args:
        html: Raw HTML document text.
        source_url: URL the HTML was fetched from (recorded on each claim).

    Returns:
        List of claim dicts with keys claim_type, claim_value, source_url,
        retrieved_on, xpath, xpath_match_score and extraction_method.
    """
    soup = BeautifulSoup(html, 'html.parser')
    claims: list[dict] = []
    timestamp = datetime.now(timezone.utc).isoformat()

    def _add(element, claim_type: str, value: str, method: str,
             score: float = 1.0) -> None:
        """Append one claim dict with the shared provenance fields filled in."""
        claims.append({
            'claim_type': claim_type,
            'claim_value': value,
            'source_url': source_url,
            'retrieved_on': timestamp,
            'xpath': get_xpath(element),
            'xpath_match_score': score,
            'extraction_method': method
        })

    def _meta_content(tag) -> Optional[str]:
        """Return a meta tag's stripped string 'content' attribute, or None."""
        if tag is None:
            return None
        content = tag.get('content')
        if content and isinstance(content, str):
            return content.strip()
        return None

    # 1. <title>
    title_tag = soup.find('title')
    if title_tag and title_tag.string:
        _add(title_tag, 'page_title', title_tag.string.strip(), 'title_tag')

    # 2. Meta description
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    desc = _meta_content(meta_desc)
    if desc:
        _add(meta_desc, 'description', desc, 'meta_description')

    # 3. OpenGraph title and description
    og_title = soup.find('meta', property='og:title')
    og_title_val = _meta_content(og_title)
    if og_title_val:
        _add(og_title, 'og_title', og_title_val, 'og_title')
    og_desc = soup.find('meta', property='og:description')
    og_desc_val = _meta_content(og_desc)
    if og_desc_val:
        _add(og_desc, 'og_description', og_desc_val, 'og_description')

    # 4. First three <h1> headings (skip trivially short text)
    for h1 in soup.find_all('h1')[:3]:
        text = h1.get_text(strip=True)
        if text and len(text) > 2:
            _add(h1, 'heading_h1', text, 'h1_tag')

    # 5. Contact info: mailto/tel anchors
    for a in soup.find_all('a', href=re.compile(r'^mailto:')):
        href = a.get('href')
        if href and isinstance(href, str):
            email = href.replace('mailto:', '').split('?')[0]  # drop ?subject=...
            if email:
                _add(a, 'email', email, 'mailto_link')
    for a in soup.find_all('a', href=re.compile(r'^tel:')):
        href = a.get('href')
        if href and isinstance(href, str):
            phone = href.replace('tel:', '')
            if phone:
                _add(a, 'phone', phone, 'tel_link')

    # 6. Social media links: first valid occurrence per platform only
    social_patterns = {
        'facebook': r'facebook\.com',
        'twitter': r'twitter\.com|x\.com',
        'instagram': r'instagram\.com',
        'linkedin': r'linkedin\.com',
        'youtube': r'youtube\.com',
    }
    for platform, pattern in social_patterns.items():
        for a in soup.find_all('a', href=re.compile(pattern)):
            href = a.get('href')
            if href and isinstance(href, str):
                _add(a, f'social_{platform}', href, 'social_link')
                break  # Only first occurrence per platform

    # 7. Address-ish containers (heuristic, hence the lower match score)
    address_containers = soup.find_all(['address', 'div', 'p'],
                                       class_=re.compile(r'address|contact|location', re.I))
    for container in address_containers[:2]:
        text = container.get_text(strip=True)
        if 10 < len(text) < 300:
            _add(container, 'address_text', text, 'address_container', score=0.8)

    return claims
def remove_web_enrichment_block(content: str) -> str:
    """Strip the top-level ``web_enrichment:`` section from raw YAML text.

    The section spans from the ``web_enrichment:`` line up to (but not
    including) the next top-level key, or to end-of-file.  Any runs of
    three or more newlines left behind are collapsed to one blank line.
    """
    block_re = re.compile(r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)',
                          re.MULTILINE | re.DOTALL)
    stripped = block_re.sub('', content)
    return re.sub(r'\n{3,}', '\n\n', stripped)
def add_web_enrichment_block(content: str, enrichment: dict) -> str:
    """Append *enrichment* to YAML *content* as a ``web_enrichment:`` block.

    The dict is serialized in block style with unicode preserved and key
    order kept as-is, then appended after a separating blank line.
    """
    rendered = yaml.dump({'web_enrichment': enrichment},
                         default_flow_style=False,
                         allow_unicode=True,
                         sort_keys=False)
    # Guarantee the existing content ends with a newline before appending.
    body = content if content.endswith('\n') else content + '\n'
    return body + '\n' + rendered
async def fetch_and_enrich(entry: dict, session: httpx.AsyncClient) -> Optional[dict]:
    """
    Fetch the entry's URL, archive the HTML, and extract xpath-backed claims.

    Args:
        entry: Queue record with at least 'url' and 'ghcid' keys.
        session: Shared httpx client used for the request.

    Returns:
        Enrichment dict with 'archive_metadata' and 'claims', or None when
        the fetch fails or yields no claims (errors are printed, not raised).
    """
    url = entry['url']
    ghcid = entry['ghcid']
    custodian_root = Path('/Users/kempersc/apps/glam/data/custodian')
    try:
        response = await session.get(url, follow_redirects=True, timeout=30.0)
        response.raise_for_status()
        html = response.text

        # Archive the raw HTML under data/custodian/web/<ghcid path>/
        netloc = urlparse(url).netloc
        archive_subdir = ARCHIVE_DIR / ghcid.replace('-', '/')
        archive_subdir.mkdir(parents=True, exist_ok=True)
        html_file = archive_subdir / f"{netloc}_index.html"
        html_file.write_text(html, encoding='utf-8')

        claims = extract_claims_with_xpath(html, url)
        if not claims:
            return None

        # Record the archive path (relative to data/custodian) on each claim
        rel_html_path = str(html_file.relative_to(custodian_root))
        for claim in claims:
            claim['html_file'] = rel_html_path

        return {
            'archive_metadata': {
                'archive_method': 'httpx_fetch',
                'archive_timestamp': datetime.now(timezone.utc).isoformat(),
                'archive_location': str(archive_subdir.relative_to(custodian_root)),
                'source_url': url,
                'html_file': rel_html_path
            },
            'claims': claims
        }
    except Exception as e:
        # Best-effort: report and skip; the caller counts this entry as skipped.
        print(f" Error fetching {url}: {e}")
        return None
async def process_batch(entries: list[dict], batch_size: int = 5) -> dict:
    """Fetch and enrich *entries* in concurrent slices of *batch_size*.

    A single httpx client is shared across all requests; each slice is
    awaited together and followed by a one-second pause.

    Returns:
        Mapping of ghcid -> enrichment dict (or None when that fetch failed).
    """
    results: dict = {}
    client_headers = {'User-Agent': 'GLAM Heritage Custodian Enrichment Bot/1.0'}
    async with httpx.AsyncClient(headers=client_headers,
                                 follow_redirects=True) as session:
        slices = [entries[start:start + batch_size]
                  for start in range(0, len(entries), batch_size)]
        for batch_no, batch in enumerate(slices, 1):
            print(f"Processing batch {batch_no} ({len(batch)} entries)...")
            outcomes = await asyncio.gather(
                *(fetch_and_enrich(item, session) for item in batch),
                return_exceptions=True)
            for item, outcome in zip(batch, outcomes):
                if isinstance(outcome, Exception):
                    print(f" Exception for {item['ghcid']}: {outcome}")
                    results[item['ghcid']] = None
                else:
                    results[item['ghcid']] = outcome
            # Rate limiting between slices
            await asyncio.sleep(1)
    return results
def update_custodian_file(filepath: str, enrichment: dict) -> bool:
    """
    Replace a custodian YAML file's web_enrichment with xpath-backed data.

    Reads the file, strips any existing (fabricated) web_enrichment block,
    appends the new block, and writes the result back in place.

    Returns:
        True on success, False if any step raised (the error is printed).
    """
    try:
        path = Path(filepath)
        text = path.read_text(encoding='utf-8')
        text = remove_web_enrichment_block(text)
        text = add_web_enrichment_block(text, enrichment)
        path.write_text(text, encoding='utf-8')
        return True
    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
async def main():
    """CLI entry point: re-enrich queued custodian files with xpath claims."""
    import argparse
    parser = argparse.ArgumentParser(description='Re-enrich files with proper xpath provenance')
    parser.add_argument('--limit', '-l', type=int, default=10, help='Max files to process')
    parser.add_argument('--dry-run', '-n', action='store_true', help='Do not modify files')
    parser.add_argument('--batch-size', '-b', type=int, default=5, help='Concurrent requests')
    args = parser.parse_args()

    # Take at most --limit entries off the front of the queue
    with open(QUEUE_FILE) as f:
        queue_data = json.load(f)
    entries = queue_data['queue'][:args.limit]
    print(f"Re-enriching {len(entries)} files with proper xpath provenance...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    results = await process_batch(entries, batch_size=args.batch_size)

    # Apply results to the custodian files and tally outcomes
    success = failed = skipped = 0
    for entry in entries:
        ghcid = entry['ghcid']
        enrichment = results.get(ghcid)
        if enrichment is None:
            print(f" SKIP {ghcid}: No enrichment data")
            skipped += 1
        elif args.dry_run:
            print(f" DRY-RUN {ghcid}: Would update with {len(enrichment['claims'])} claims")
            success += 1
        elif update_custodian_file(entry['filepath'], enrichment):
            print(f" OK {ghcid}: Updated with {len(enrichment['claims'])} claims")
            success += 1
        else:
            failed += 1

    print()
    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Successfully enriched: {success}")
    print(f"Skipped (fetch failed): {skipped}")
    print(f"Failed (update error): {failed}")
    print(f"Completed at: {datetime.now().isoformat()}")
if __name__ == '__main__':
    asyncio.run(main())