#!/usr/bin/env python3
|
|
"""
|
|
Re-enrich custodian files with proper xpath provenance.
|
|
Uses httpx to fetch and extract web claims.
|
|
|
|
Per Rule 6 in AGENTS.md: "Every claim extracted from a webpage MUST have an XPath
|
|
pointer to the exact location in archived HTML where that value appears."
|
|
"""
|
|
import os
|
|
import re
|
|
import json
|
|
import yaml
|
|
import subprocess
|
|
import asyncio
|
|
import httpx
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urlparse
|
|
import time
|
|
|
|
# Load queue
# JSON file holding the list of entries awaiting re-enrichment.
QUEUE_FILE = Path('/Users/kempersc/apps/glam/data/reenrich_queue.json')
# Root directory where fetched HTML pages are archived (custodian data root is its parent).
ARCHIVE_DIR = Path('/Users/kempersc/apps/glam/data/custodian/web')
|
|
|
|
def get_xpath(element) -> str:
    """Build an absolute XPath string for a BeautifulSoup element.

    Walks from the element up to the document root, emitting one path
    segment per ancestor.  A positional predicate (e.g. ``div[2]``) is
    added only when the node has siblings with the same tag name.
    """
    segments = []
    node = element
    while node.parent and node.parent.name:
        same_name = [
            sib for sib in node.parent.children
            if hasattr(sib, 'name') and sib.name == node.name
        ]
        if len(same_name) > 1:
            segments.append(f"{node.name}[{same_name.index(node) + 1}]")
        else:
            segments.append(node.name)
        node = node.parent

    segments.reverse()
    return '/' + '/'.join(segments) if segments else '/'
|
|
|
|
|
|
def _make_claim(claim_type: str, claim_value: str, source_url: str,
                timestamp: str, xpath: str, method: str,
                score: float = 1.0) -> dict:
    """Build one claim record with xpath provenance (Rule 6 in AGENTS.md)."""
    return {
        'claim_type': claim_type,
        'claim_value': claim_value,
        'source_url': source_url,
        'retrieved_on': timestamp,
        'xpath': xpath,
        'xpath_match_score': score,
        'extraction_method': method,
    }


def extract_claims_with_xpath(html: str, source_url: str) -> list[dict]:
    """
    Extract claims from HTML with proper xpath provenance.

    Args:
        html: raw HTML of the archived page.
        source_url: URL the HTML was fetched from (recorded on every claim).

    Returns:
        List of claim dicts (page title, meta/og tags, h1 headings,
        mailto/tel links, social links, address-like containers), each
        carrying an xpath, a match score and the extraction method.
    """
    soup = BeautifulSoup(html, 'html.parser')
    claims: list[dict] = []
    timestamp = datetime.now(timezone.utc).isoformat()

    # 1. Extract title
    title_tag = soup.find('title')
    if title_tag and title_tag.string:
        claims.append(_make_claim('page_title', title_tag.string.strip(),
                                  source_url, timestamp,
                                  get_xpath(title_tag), 'title_tag'))

    # 2.-3. Meta description and OpenGraph tags: identical handling, so
    # they share one loop instead of three copy-pasted blocks.
    meta_specs = [
        (soup.find('meta', attrs={'name': 'description'}),
         'description', 'meta_description'),
        (soup.find('meta', property='og:title'), 'og_title', 'og_title'),
        (soup.find('meta', property='og:description'),
         'og_description', 'og_description'),
    ]
    for tag, claim_type, method in meta_specs:
        if tag:
            content = tag.get('content')
            if content and isinstance(content, str):
                claims.append(_make_claim(claim_type, content.strip(),
                                          source_url, timestamp,
                                          get_xpath(tag), method))

    # 4. Extract h1 headings (limit to first 3; skip trivially short text)
    for h1 in soup.find_all('h1')[:3]:
        text = h1.get_text(strip=True)
        if text and len(text) > 2:
            claims.append(_make_claim('heading_h1', text, source_url,
                                      timestamp, get_xpath(h1), 'h1_tag'))

    # 5. Extract contact info patterns
    # Email
    for a in soup.find_all('a', href=re.compile(r'^mailto:')):
        href = a.get('href')
        if href and isinstance(href, str):
            # Strip the scheme and any ?subject=... query part.
            email = href.replace('mailto:', '').split('?')[0]
            if email:
                claims.append(_make_claim('email', email, source_url,
                                          timestamp, get_xpath(a),
                                          'mailto_link'))

    # Phone
    for a in soup.find_all('a', href=re.compile(r'^tel:')):
        href = a.get('href')
        if href and isinstance(href, str):
            phone = href.replace('tel:', '')
            if phone:
                claims.append(_make_claim('phone', phone, source_url,
                                          timestamp, get_xpath(a),
                                          'tel_link'))

    # 6. Extract social media links — only the first occurrence per platform.
    social_patterns = {
        'facebook': r'facebook\.com',
        'twitter': r'twitter\.com|x\.com',
        'instagram': r'instagram\.com',
        'linkedin': r'linkedin\.com',
        'youtube': r'youtube\.com',
    }
    for platform, pattern in social_patterns.items():
        a = soup.find('a', href=re.compile(pattern))
        if a:
            href = a.get('href')
            if href and isinstance(href, str):
                claims.append(_make_claim(f'social_{platform}', href,
                                          source_url, timestamp,
                                          get_xpath(a), 'social_link'))

    # 7. Extract address (look for common patterns).  Class-name matching is
    # a heuristic, hence the lower match score.
    address_containers = soup.find_all(
        ['address', 'div', 'p'],
        class_=re.compile(r'address|contact|location', re.I))
    for container in address_containers[:2]:
        text = container.get_text(strip=True)
        if 10 < len(text) < 300:
            claims.append(_make_claim('address_text', text, source_url,
                                      timestamp, get_xpath(container),
                                      'address_container', score=0.8))

    return claims
|
|
|
|
|
|
def remove_web_enrichment_block(content: str) -> str:
    """Strip the ``web_enrichment:`` top-level section from raw YAML text.

    The match runs from a line beginning with ``web_enrichment:`` up to
    (but not including) the next top-level key or the end of the string.
    Any run of three or more consecutive newlines left behind is collapsed
    to a single blank line.
    """
    block_re = re.compile(r'^web_enrichment:.*?(?=^[a-z_]+:|\Z)',
                          re.MULTILINE | re.DOTALL)
    stripped = block_re.sub('', content)
    return re.sub(r'\n{3,}', '\n\n', stripped)
|
|
|
|
|
|
def add_web_enrichment_block(content: str, enrichment: dict) -> str:
    """Append a fresh ``web_enrichment:`` section to raw YAML text.

    The enrichment dict is serialized under a single ``web_enrichment``
    top-level key (block style, unicode preserved, insertion order kept)
    and appended to the file after a separating blank line.
    """
    rendered = yaml.dump(
        {'web_enrichment': enrichment},
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )

    # Guarantee the existing content ends with a newline before appending.
    body = content if content.endswith('\n') else content + '\n'
    return body + '\n' + rendered
|
|
|
|
|
|
async def fetch_and_enrich(entry: dict, session: httpx.AsyncClient) -> Optional[dict]:
    """
    Fetch URL and extract claims with xpath provenance.

    Args:
        entry: queue record with at least 'url' and 'ghcid' keys.
        session: shared async HTTP client used for the request.

    Returns:
        Enrichment dict with 'archive_metadata' and 'claims' keys, or
        None on fetch failure or when no claims could be extracted.
    """
    url = entry['url']
    ghcid = entry['ghcid']
    # Output paths are stored relative to the custodian data root.
    # ARCHIVE_DIR is <root>/web, so the root is its parent — derive it
    # instead of hard-coding the absolute path twice.
    custodian_root = ARCHIVE_DIR.parent

    try:
        # Fetch page
        response = await session.get(url, follow_redirects=True, timeout=30.0)
        response.raise_for_status()
        html = response.text

        # Save HTML archive so every claim can point back into it.
        archive_subdir = ARCHIVE_DIR / ghcid.replace('-', '/')
        archive_subdir.mkdir(parents=True, exist_ok=True)

        html_file = archive_subdir / f"{urlparse(url).netloc}_index.html"
        html_file.write_text(html, encoding='utf-8')

        # Extract claims
        claims = extract_claims_with_xpath(html, url)

        if not claims:
            return None

        # Record the archived-HTML path on every claim (relative to root).
        rel_html_path = str(html_file.relative_to(custodian_root))
        for claim in claims:
            claim['html_file'] = rel_html_path

        enrichment = {
            'archive_metadata': {
                'archive_method': 'httpx_fetch',
                'archive_timestamp': datetime.now(timezone.utc).isoformat(),
                'archive_location': str(archive_subdir.relative_to(custodian_root)),
                'source_url': url,
                'html_file': rel_html_path
            },
            'claims': claims
        }

        return enrichment

    except Exception as e:
        # Best-effort: a single bad URL must not abort the whole batch.
        print(f" Error fetching {url}: {e}")
        return None
|
|
|
|
|
|
async def process_batch(entries: list[dict], batch_size: int = 5) -> dict:
    """Fetch and enrich queue entries in concurrent batches.

    Returns a dict keyed by ghcid: the enrichment dict on success, or
    None when the fetch raised or produced no usable data.
    """
    enriched_by_ghcid: dict = {}

    client = httpx.AsyncClient(
        headers={'User-Agent': 'GLAM Heritage Custodian Enrichment Bot/1.0'},
        follow_redirects=True
    )
    async with client as session:
        batch_number = 0
        for start in range(0, len(entries), batch_size):
            batch = entries[start:start + batch_size]
            batch_number += 1
            print(f"Processing batch {batch_number} ({len(batch)} entries)...")

            outcomes = await asyncio.gather(
                *(fetch_and_enrich(entry, session) for entry in batch),
                return_exceptions=True
            )

            for entry, outcome in zip(batch, outcomes):
                if isinstance(outcome, Exception):
                    print(f" Exception for {entry['ghcid']}: {outcome}")
                    enriched_by_ghcid[entry['ghcid']] = None
                else:
                    enriched_by_ghcid[entry['ghcid']] = outcome

            # Polite rate limiting between batches.
            await asyncio.sleep(1)

    return enriched_by_ghcid
|
|
|
|
|
|
def update_custodian_file(filepath: str, enrichment: dict) -> bool:
    """
    Update custodian YAML file with new web_enrichment.

    The stale (fabricated) block is stripped out and the freshly
    extracted enrichment — complete with xpath provenance — is appended.

    Returns:
        True on success, False when reading or writing fails.
    """
    try:
        path = Path(filepath)
        original = path.read_text(encoding='utf-8')

        # Drop the old block, then append the replacement.
        cleaned = remove_web_enrichment_block(original)
        updated = add_web_enrichment_block(cleaned, enrichment)

        path.write_text(updated, encoding='utf-8')
        return True

    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
|
|
|
|
|
|
async def main():
    """CLI entry point: parse args, fetch/enrich queued URLs, update files."""
    import argparse

    parser = argparse.ArgumentParser(description='Re-enrich files with proper xpath provenance')
    parser.add_argument('--limit', '-l', type=int, default=10, help='Max files to process')
    parser.add_argument('--dry-run', '-n', action='store_true', help='Do not modify files')
    parser.add_argument('--batch-size', '-b', type=int, default=5, help='Concurrent requests')
    args = parser.parse_args()

    # Load queue
    with open(QUEUE_FILE) as f:
        queue_data = json.load(f)

    # Only the first --limit entries are processed this run.
    entries = queue_data['queue'][:args.limit]

    print(f"Re-enriching {len(entries)} files with proper xpath provenance...")
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Process entries (concurrent fetch + claim extraction)
    results = await process_batch(entries, batch_size=args.batch_size)

    # Update files
    success = 0
    failed = 0
    skipped = 0

    for entry in entries:
        ghcid = entry['ghcid']
        enrichment = results.get(ghcid)

        if enrichment is None:
            # Fetch failed or the page yielded no claims.
            print(f" SKIP {ghcid}: No enrichment data")
            skipped += 1
            continue

        if args.dry_run:
            # Report what would happen without touching the file.
            print(f" DRY-RUN {ghcid}: Would update with {len(enrichment['claims'])} claims")
            success += 1
        else:
            if update_custodian_file(entry['filepath'], enrichment):
                print(f" OK {ghcid}: Updated with {len(enrichment['claims'])} claims")
                success += 1
            else:
                failed += 1

    # Final summary
    print()
    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Successfully enriched: {success}")
    print(f"Skipped (fetch failed): {skipped}")
    print(f"Failed (update error): {failed}")
    print(f"Completed at: {datetime.now().isoformat()}")
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the async entry point when invoked as a script.
    asyncio.run(main())
|