910 lines
31 KiB
Python
910 lines
31 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Dutch custodian YAML files with web data using Firecrawl API.
|
|
|
|
This script:
|
|
1. Maps websites to discover all URLs (APIs, catalogs, portals)
|
|
2. Scrapes homepage for metadata and content
|
|
3. Extracts structured digital platform information
|
|
|
|
Uses Firecrawl MCP tools directly through the existing MCP infrastructure,
|
|
but this script provides a standalone batch processing approach using the
|
|
Firecrawl REST API directly.
|
|
|
|
Usage:
|
|
python scripts/enrich_dutch_custodians_firecrawl.py [options]
|
|
|
|
Options:
|
|
--dry-run Show what would be enriched without modifying files
|
|
--limit N Process only first N files (for testing)
|
|
--start-index N Start from index N (for resuming)
|
|
--resume Resume from last checkpoint
|
|
--force Re-enrich even if already has firecrawl_enrichment
|
|
--file PATH Process a single specific file
|
|
|
|
Environment Variables:
|
|
FIRECRAWL_API_KEY - Required API key for Firecrawl
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
|
|
|
|
import httpx
|
|
import yaml
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables (notably FIRECRAWL_API_KEY) from a local .env file
load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# Firecrawl v2 REST base; all calls below are POSTs to {base}/scrape or {base}/map.
FIRECRAWL_API_BASE = "https://api.firecrawl.dev/v2"
# Custodian YAML files live in <repo>/data/custodian relative to this script.
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
# Hidden checkpoint file used by --resume to continue interrupted batch runs.
CHECKPOINT_FILE = CUSTODIAN_DIR / ".firecrawl_enrichment_checkpoint.json"

# Rate limiting - Firecrawl has rate limits, be conservative
# Increased from 2.0 to 3.5 after hitting 429 errors in batch testing
REQUEST_DELAY = 3.5  # seconds between requests

# API Key
# Read once at import time; main() exits early if this is empty.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "")
|
|
|
|
# Digital platform detection patterns
# Regexes (matched against lowercased URLs) that mark machine-readable API
# endpoints exposed on a custodian's site.
API_ENDPOINT_PATTERNS = [
    r'/oai[-_]?pmh',
    r'/api/',
    r'/rest/',
    r'/sparql',
    r'/graphql',
    r'/iiif/',
    r'/sru',
    r'/z39\.50',
    r'/opensearch',
]

# URL path patterns (Dutch and English) that mark human-facing catalog /
# collection / search portals. Each segment must be followed by '/', '?'
# or end-of-string, to avoid matching inside longer words.
CATALOG_PATTERNS = [
    r'/catalogu[es]?(?:/|\?|$)',
    r'/collecti[eo]n?[s]?(?:/|\?|$)',
    r'/archie[fv](?:/|\?|$)',
    r'/beeldbank(?:/|\?|$)',
    r'/zoeken(?:/|\?|$)',
    r'/search(?:/|\?|$)',
    r'/discover(?:/|\?|$)',
    r'/browse(?:/|\?|$)',
]
|
|
|
|
# Dutch-specific catalog type detection
# Maps URL path patterns to human-readable catalog types.
# Each entry: 'patterns' = regexes tried against the lowercased URL,
# 'label' = English display label, 'description_nl' = canned Dutch description
# used by generate_catalog_description() when the sitemap has none.
# NOTE: dict insertion order matters — detect_catalog_type() returns the first
# matching type, and detect_catalogs_from_urls() sorts output by this order.
CATALOG_TYPE_PATTERNS = {
    'beeldbank': {
        'patterns': [r'/beeldbank', r'/beeld', r'/images', r'/foto'],
        'label': 'Image Collection',
        'description_nl': 'Beeldbank met gedigitaliseerde foto\'s, kaarten en afbeeldingen',
    },
    'genealogie': {
        'patterns': [r'/genealogie', r'/stamboom', r'/persons', r'/akten'],
        'label': 'Genealogy Records',
        'description_nl': 'Genealogische bronnen en persoonsgegevens',
    },
    'archieven': {
        'patterns': [r'/archie[fv]', r'/inventaris', r'/toegangen', r'/finding'],
        'label': 'Archive Finding Aids',
        'description_nl': 'Archiefinventarissen en toegangen',
    },
    'collectie': {
        'patterns': [r'/collectie', r'/collection', r'/object'],
        'label': 'Collection Portal',
        'description_nl': 'Collectieportaal met objecten en kunstwerken',
    },
    'kranten': {
        'patterns': [r'/kranten', r'/newspaper', r'/periodiek'],
        'label': 'Newspaper Archive',
        'description_nl': 'Gedigitaliseerde kranten en periodieken',
    },
    'kaarten': {
        'patterns': [r'/kaart', r'/map', r'/cartogra'],
        'label': 'Map Collection',
        'description_nl': 'Historische kaarten en cartografisch materiaal',
    },
    'bibliotheek': {
        'patterns': [r'/catalogu', r'/biblio', r'/library', r'/boek'],
        'label': 'Library Catalog',
        'description_nl': 'Bibliotheekcatalogus',
    },
    'zoeken': {
        'patterns': [r'/zoeken', r'/search', r'/discover', r'/browse'],
        'label': 'Search Interface',
        'description_nl': 'Algemene zoekinterface',
    },
}
|
|
|
|
# Query parameters that should be stripped for URL normalization.
# Matched by SUBSTRING against lowercased parameter names (so 'utm_' also
# catches utm_source, utm_medium, ...; 'sid' also catches e.g. 'jsessionid').
NOISE_QUERY_PARAMS = [
    'sort', 'order', 'view', 'mode', 'ss', 'page', 'offset', 'limit',
    'random', 'session', 'sid', 'token', 'ref', 'utm_', 'fbclid', 'gclid',
]

# Collection-management systems common in the (Dutch) heritage sector.
# Keys are canonical platform identifiers; values are lowercase substrings
# searched for in scraped page content by detect_cms_from_content().
CMS_INDICATORS = {
    'atlantis': ['atlantis', 'picturae'],
    'mais_flexis': ['mais-flexis', 'mais flexis', 'de ree'],
    'adlib': ['adlib', 'axiell'],
    'collective_access': ['collectiveaccess', 'collective access'],
    'archivematica': ['archivematica'],
    'archivesspace': ['archivesspace'],
    'atom': ['accesstomemory', 'atom'],
    'omeka': ['omeka'],
    'contentdm': ['contentdm'],
    'dspace': ['dspace'],
    'islandora': ['islandora'],
    'memorix': ['memorix'],
}
|
|
|
|
# Indicator terms for cultural-heritage metadata standards, searched for in
# scraped page content by detect_metadata_standards(). Several standards have
# both a short token ('ead') and a spelled-out phrase ('encoded archival
# description') so either mention is caught.
METADATA_STANDARDS = [
    'dublin core', 'dc:', 'dcterms',
    'marc21', 'marc 21',
    'ead', 'encoded archival description',
    'mets', 'metadata encoding',
    'mods', 'metadata object description',
    'lido', 'lightweight information',
    'cidoc-crm', 'cidoc crm',
    'schema.org', 'schema:',
    'ric-o', 'records in contexts',
    'premis', 'preservation metadata',
    'bibframe',
    'iiif',
]
|
|
|
|
|
|
@dataclass
class FirecrawlClient:
    """Minimal Firecrawl v2 REST client (scrape + map) with 429 retry logic.

    Attributes:
        api_key: Firecrawl bearer token.
        base_url: API base URL; defaults to the v2 endpoint.
    """

    api_key: str
    base_url: str = FIRECRAWL_API_BASE

    def __post_init__(self):
        # One pooled HTTP client shared by all requests from this instance.
        self.client = httpx.Client(
            timeout=60.0,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
        )

    def _post_with_retry(self, endpoint: str, payload: dict,
                         label: str, verb: str, url: str) -> dict | None:
        """POST *payload* to {base_url}/{endpoint}, retrying on HTTP 429.

        Shared by scrape() and map_site(), whose retry loops were previously
        duplicated. Waits 10s after a first 429 and 20s after a second, then
        gives up (3 attempts total). *label*/*verb* only customize log wording
        ("Scrape"/"scraping" vs "Map"/"mapping").

        Returns the parsed JSON body when it reports success, else None.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = self.client.post(f"{self.base_url}/{endpoint}", json=payload)
                response.raise_for_status()
                result = response.json()
                if result.get("success"):
                    return result
                logger.warning(f"{label} failed for {url}: {result}")
                return None
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 429 and attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 10  # linear backoff: 10s, then 20s
                    logger.warning(f"Rate limited (429), waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                    continue
                logger.error(f"HTTP error {verb} {url}: {e.response.status_code}")
                return None
            except Exception as e:
                logger.error(f"Error {verb} {url}: {e}")
                return None
        return None

    def scrape(self, url: str, formats: list[str] | None = None,
               only_main_content: bool = True) -> dict | None:
        """Scrape a single URL.

        Returns the Firecrawl 'data' payload (metadata, markdown, links, ...)
        or None on failure.
        """
        if formats is None:
            formats = ["markdown", "links"]

        payload = {
            "url": url,
            "formats": formats,
            "onlyMainContent": only_main_content,
            "maxAge": 172800000,  # accept cached results up to 2 days old
            "blockAds": True,
            "skipTlsVerification": True,
            "location": {"country": "NL"},  # Dutch locale
        }
        result = self._post_with_retry("scrape", payload, "Scrape", "scraping", url)
        return result.get("data") if result else None

    def map_site(self, url: str, limit: int = 100) -> list[dict] | None:
        """Map all URLs on a site.

        Returns the discovered link list ([] if the response has none) or
        None on failure.
        """
        payload = {
            "url": url,
            "limit": limit,
            "sitemap": "include",
            "includeSubdomains": True,
            "ignoreQueryParameters": False,  # Keep query params for API endpoints
            "location": {"country": "NL"},
        }
        result = self._post_with_retry("map", payload, "Map", "mapping", url)
        return result.get("links", []) if result else None

    def close(self):
        """Close the underlying HTTP client."""
        self.client.close()
|
|
|
|
|
|
def get_website_url(entry: dict) -> str | None:
    """Extract the best website URL from a custodian entry.

    Sources are checked in priority order:
      1. original_entry.webadres_organisatie
      2. wikidata_enrichment.wikidata_official_website
      3. google_maps_enrichment.website

    Returns the first non-blank value, normalized, or None if no source has
    one. Uses `or {}` rather than `.get(key, {})` so a section that is
    explicitly null in the YAML (key present, value None) does not raise
    AttributeError on the nested lookup.
    """
    sources = (
        ('original_entry', 'webadres_organisatie'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    )
    for section_key, field_key in sources:
        section = entry.get(section_key) or {}
        url = section.get(field_key)
        if url and url.strip():
            return normalize_url(url.strip())

    return None
|
|
|
|
|
|
def normalize_url(url: str) -> str:
    """Normalize URL to ensure it has a scheme.

    Falsy input (empty string / None) is returned unchanged. Otherwise the
    value is stripped, prefixed with https:// when no scheme is present, and
    returned without a trailing slash.
    """
    if not url:
        return url

    cleaned = url.strip()

    # Default to HTTPS when the scheme is missing.
    has_scheme = cleaned.startswith(('http://', 'https://'))
    if not has_scheme:
        cleaned = f'https://{cleaned}'

    # Drop trailing slash for consistency across sources.
    return cleaned.rstrip('/')
|
|
|
|
|
|
def detect_apis_from_urls(urls: list[dict]) -> list[dict]:
    """Detect API endpoints from a list of URLs.

    Each item may be a dict (with 'url'/'title' keys) or a plain string.
    A URL matching any API_ENDPOINT_PATTERNS regex yields one entry with
    its classified API type.
    """
    found = []

    for item in urls:
        item_is_dict = isinstance(item, dict)
        url = item.get('url', '') if item_is_dict else str(item)
        lowered = url.lower()

        if any(re.search(pattern, lowered) for pattern in API_ENDPOINT_PATTERNS):
            found.append({
                'url': url,
                'type': detect_api_type(lowered),
                'title': item.get('title') if item_is_dict else None,
            })

    return found
|
|
|
|
|
|
def detect_api_type(url: str) -> str:
    """Classify an API endpoint URL into a protocol family name.

    Checks are ordered from most to least specific; the first match wins.
    Returns 'Unknown' when no known protocol marker is present.
    """
    lowered = url.lower()

    checks = (
        ('OAI-PMH', ('oai', 'pmh')),
        ('SPARQL', ('sparql',)),
        ('IIIF', ('iiif',)),
        ('SRU', ('sru',)),
        ('OpenSearch', ('opensearch',)),
        ('GraphQL', ('graphql',)),
        ('REST', ('/api/', '/rest/')),
    )
    for label, needles in checks:
        if any(needle in lowered for needle in needles):
            return label

    return 'Unknown'
|
|
|
|
|
|
def normalize_catalog_url(url: str, strip_all_params: bool = True) -> str:
    """Normalize catalog URL by removing query parameters.

    For catalog URLs, we want the canonical base URL without query params,
    as they often contain session-specific or record-specific parameters.

    Args:
        url: The URL to normalize
        strip_all_params: If True, remove ALL query params (default for catalogs)
                          If False, only remove known noisy params

    Returns:
        The normalized URL; the original url is returned unchanged if any
        parsing step raises.
    """
    try:
        parsed = urlparse(url)

        # For catalog URLs, strip all query params to get canonical base URL
        if strip_all_params:
            normalized = urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path.rstrip('/'),
                '',  # No params
                '',  # No query
                ''   # No fragment
            ))
            return normalized

        # --- strip_all_params=False path: keep only meaningful params ---
        # Parse query parameters (blank values dropped entirely).
        query_params = parse_qs(parsed.query, keep_blank_values=False)

        # Filter out noisy parameters
        cleaned_params = {}
        for key, values in query_params.items():
            key_lower = key.lower()
            # Skip if parameter NAME contains any noise substring
            # (e.g. 'utm_' matches utm_source, 'sid' matches jsessionid).
            is_noise = False
            for noise in NOISE_QUERY_PARAMS:
                if noise in key_lower:
                    is_noise = True
                    break

            # Also skip if value contains random-looking strings (session IDs, etc.)
            if not is_noise and values:
                value = values[0]
                # Skip values that look like random session IDs (20+ hex chars)
                if re.match(r'^[a-f0-9]{20,}$', value, re.I):
                    is_noise = True
                # Skip values with encoded JSON containing random seeds
                if 'random' in value.lower():
                    is_noise = True

            if not is_noise:
                cleaned_params[key] = values

        # Reconstruct URL without noisy params
        new_query = urlencode(cleaned_params, doseq=True) if cleaned_params else ''

        normalized = urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path.rstrip('/'),  # Normalize trailing slash
            parsed.params,
            new_query,
            ''  # Remove fragment
        ))

        return normalized
    except Exception:
        # Best-effort: never fail normalization, fall back to the raw URL.
        return url
|
|
|
|
|
|
def detect_catalog_type(url: str) -> tuple[str | None, str | None]:
    """Detect the catalog type from URL path.

    Tries each entry of CATALOG_TYPE_PATTERNS in declaration order and
    returns (type_key, label) for the first whose pattern matches the
    lowercased URL, or (None, None) when nothing matches.
    """
    lowered = url.lower()

    for type_key, config in CATALOG_TYPE_PATTERNS.items():
        if any(re.search(pattern, lowered) for pattern in config['patterns']):
            return type_key, config['label']

    return None, None
|
|
|
|
|
|
def generate_catalog_description(url: str, catalog_type: str | None) -> str | None:
    """Generate a meaningful description based on URL path and catalog type.

    Prefers the canned Dutch description for a recognized catalog type;
    otherwise falls back to a description built from the last URL path
    segment. Returns None when neither yields anything useful.
    """
    known = CATALOG_TYPE_PATTERNS.get(catalog_type) if catalog_type else None
    if known:
        return known['description_nl']

    # Fallback: derive a label from the last non-empty path segment.
    segments = [part for part in urlparse(url).path.split('/') if part]
    if segments:
        label = segments[-1].replace('-', ' ').replace('_', ' ').title()
        if label.lower() not in ('zoeken', 'search', 'index', 'home'):
            return f"Zoekinterface: {label}"

    return None
|
|
|
|
|
|
def detect_catalogs_from_urls(urls: list[dict]) -> list[dict]:
    """Detect catalog/collection portals from URLs.

    Improvements over basic detection:
    1. Normalizes URLs (removes session IDs, random sort params)
    2. Deduplicates by catalog section (keeps one URL per catalog type per domain)
    3. Categorizes by catalog type (beeldbank, genealogie, etc.)
    4. Generates meaningful descriptions

    Returns a list of dicts with keys: url, catalog_type, catalog_type_label,
    title, description.
    """
    catalogs = []
    seen_catalog_types = {}  # Track which catalog types we've seen per domain

    # First pass: collect all matching URLs
    candidate_urls = []
    for url_info in urls:
        # Items may be dicts (url/title/description) or plain strings.
        url = url_info.get('url', '') if isinstance(url_info, dict) else str(url_info)
        url_lower = url.lower()

        # Check if URL matches any catalog pattern
        is_catalog = False
        for pattern in CATALOG_PATTERNS:
            if re.search(pattern, url_lower):
                is_catalog = True
                break

        if is_catalog:
            # Normalize URL (default strips the whole query string)
            normalized_url = normalize_catalog_url(url)
            # Classify from the ORIGINAL url so query-string hints still count.
            catalog_type, catalog_label = detect_catalog_type(url)

            # Calculate path depth (shallower URLs are preferred)
            parsed = urlparse(normalized_url)
            path_parts = [p for p in parsed.path.split('/') if p]
            path_depth = len(path_parts)

            candidate_urls.append({
                'url': normalized_url,
                'original_url': url,
                'url_info': url_info,
                'catalog_type': catalog_type,
                'catalog_label': catalog_label,
                'path_depth': path_depth,
                # NOTE(review): normalized URLs never retain a query string
                # (strip_all_params defaults to True), so this is effectively
                # always False — confirm whether original_url was intended here.
                'has_query': bool(parsed.query),
            })

    # Sort by: catalog_type, then has_query (no query first), then path_depth
    candidate_urls.sort(key=lambda x: (
        x['catalog_type'] or 'zzz',  # Group by type; untyped entries sort last
        x['has_query'],  # Prefer URLs without query params
        x['path_depth'],  # Prefer shallower paths
    ))

    # Second pass: keep only one URL per catalog type per domain
    # (the sort above guarantees the preferred candidate is seen first).
    for candidate in candidate_urls:
        parsed = urlparse(candidate['url'])
        domain = parsed.netloc
        catalog_type = candidate['catalog_type'] or 'generic'

        # Create a key for deduplication: domain + catalog_type
        dedup_key = f"{domain}:{catalog_type}"

        # If we haven't seen this catalog type on this domain, add it
        if dedup_key not in seen_catalog_types:
            seen_catalog_types[dedup_key] = candidate['url']

            # Get or generate description
            url_info = candidate['url_info']
            title = url_info.get('title') if isinstance(url_info, dict) else None
            original_description = url_info.get('description') if isinstance(url_info, dict) else None

            description = original_description
            if not description:
                description = generate_catalog_description(candidate['url'], candidate['catalog_type'])

            # Build catalog entry
            catalog_entry = {
                'url': candidate['url'],
                'catalog_type': candidate['catalog_type'],
                'catalog_type_label': candidate['catalog_label'],
                'title': title,
                'description': description,
            }

            catalogs.append(catalog_entry)

    # Sort by catalog type for consistent ordering (declaration order of
    # CATALOG_TYPE_PATTERNS; unrecognized types sort last, then by URL).
    type_order = list(CATALOG_TYPE_PATTERNS.keys())
    catalogs.sort(key=lambda x: (
        type_order.index(x['catalog_type']) if x['catalog_type'] in type_order else 999,
        x['url']
    ))

    return catalogs
|
|
|
|
|
|
def detect_cms_from_content(content: str) -> list[str]:
    """Detect CMS/platform indicators from page content.

    Returns the matching CMS keys in CMS_INDICATORS declaration order.
    Each CMS can match at most once (first matching indicator wins), so no
    set-based dedup is needed — the previous `list(set(...))` only made the
    result order nondeterministic, which caused spurious diffs in the
    regenerated YAML files.
    """
    if not content:
        return []

    content_lower = content.lower()
    detected = []

    for cms_name, indicators in CMS_INDICATORS.items():
        if any(indicator in content_lower for indicator in indicators):
            detected.append(cms_name)

    return detected
|
|
|
|
|
|
def detect_metadata_standards(content: str) -> list[str]:
    """Detect metadata standards mentioned in content.

    Fix over the original bare substring scan: short tokens such as 'ead',
    'mets' and 'mods' matched inside ordinary words ("read", "already",
    "helmets", ...), producing false positives on nearly every page. A term
    now only matches when its alphanumeric edges are not adjacent to other
    alphanumeric characters; edges that are punctuation (e.g. the ':' in
    'dc:') remain unanchored so 'dc:title' still matches.

    Returns canonical standard names, de-duplicated, in first-seen order.
    """
    if not content:
        return []

    content_lower = content.lower()

    # Indicator substring -> canonical name, checked in order; first hit wins.
    # This mirrors the original elif chain, including its behavior of
    # silently dropping indicators with no mapping entry (e.g. 'dcterms',
    # 'records in contexts') — their canonical name is still produced when
    # the primary token ('dublin core', 'ric-o', ...) appears in content.
    canonical_map = (
        ('dublin', 'Dublin Core'),
        ('marc', 'MARC21'),
        ('ead', 'EAD'),
        ('mets', 'METS'),
        ('mods', 'MODS'),
        ('lido', 'LIDO'),
        ('cidoc', 'CIDOC-CRM'),
        ('schema', 'Schema.org'),
        ('ric', 'RiC-O'),
        ('premis', 'PREMIS'),
        ('bibframe', 'BIBFRAME'),
        ('iiif', 'IIIF'),
    )

    detected = []
    for standard in METADATA_STANDARDS:
        # Anchor only alphanumeric term edges, so 'ead' cannot match "read"
        # while 'dc:' keeps matching 'dc:title'.
        prefix = r'(?<![a-z0-9])' if standard[0].isalnum() else ''
        suffix = r'(?![a-z0-9])' if standard[-1].isalnum() else ''
        if not re.search(prefix + re.escape(standard) + suffix, content_lower):
            continue
        for needle, canonical in canonical_map:
            if needle in standard:
                detected.append(canonical)
                break

    # De-duplicate preserving first-seen order (the original list(set(...))
    # produced a nondeterministic ordering).
    return list(dict.fromkeys(detected))
|
|
|
|
|
|
def create_firecrawl_enrichment(
    url: str,
    scrape_data: dict | None,
    map_data: list[dict] | None,
) -> dict:
    """Create the firecrawl_enrichment section for a custodian entry.

    `success` reflects only whether the homepage scrape returned data; map
    results, when available, are added regardless of scrape outcome.
    """
    enrichment = {
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_url': url,
        'success': scrape_data is not None,
    }

    if scrape_data:
        # Page-level metadata reported by Firecrawl.
        page_meta = scrape_data.get('metadata', {})
        enrichment['page_metadata'] = {
            'title': page_meta.get('title'),
            'description': page_meta.get('description'),
            'language': page_meta.get('language'),
            'status_code': page_meta.get('statusCode'),
        }

        # Analyze the markdown body for CMS and metadata-standard hints.
        body_text = scrape_data.get('markdown', '')
        enrichment['detected_cms'] = detect_cms_from_content(body_text)
        enrichment['detected_standards'] = detect_metadata_standards(body_text)

        page_links = scrape_data.get('links', [])
        if page_links:
            enrichment['links_count'] = len(page_links)

    if map_data:
        enrichment['sitemap_urls_count'] = len(map_data)

        # Detect APIs and catalogs from the mapped URLs.
        api_endpoints = detect_apis_from_urls(map_data)
        if api_endpoints:
            enrichment['detected_api_endpoints'] = api_endpoints

        catalog_entries = detect_catalogs_from_urls(map_data)
        if catalog_entries:
            enrichment['detected_catalog_urls'] = catalog_entries[:10]  # Limit to 10

    return enrichment
|
|
|
|
|
|
def update_provenance(entry: dict, firecrawl_data: dict) -> dict:
    """Record the Firecrawl API as a provenance source on *entry*.

    Replaces any previous 'firecrawl' source list; other source kinds are
    left untouched. Mutates and returns the same entry dict.
    """
    firecrawl_source = {
        'source_type': 'firecrawl_api',
        'fetch_timestamp': firecrawl_data.get('fetch_timestamp'),
        'api_version': 'v2',
        'source_url': firecrawl_data.get('source_url'),
        'claims_extracted': [
            'page_metadata',
            'detected_cms',
            'detected_standards',
            'detected_api_endpoints',
            'detected_catalog_urls',
        ],
    }

    provenance = entry.setdefault('provenance', {})
    sources = provenance.setdefault('sources', {})
    sources['firecrawl'] = [firecrawl_source]

    return entry
|
|
|
|
|
|
def load_checkpoint() -> dict:
    """Load the resumable-processing checkpoint, or a fresh default.

    Returns the parsed checkpoint JSON when the file exists and is readable;
    any read/parse error is logged and treated as "no checkpoint". The
    default structure mirrors what main() builds for non-resume runs.
    """
    if CHECKPOINT_FILE.exists():
        try:
            # Explicit encoding: the checkpoint is always written as UTF-8
            # (see save_checkpoint), independent of the platform locale.
            with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.warning(f"Failed to load checkpoint: {e}")

    return {
        'processed_files': [],
        'last_processed_index': -1,
        'stats': {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'skipped_no_url': 0,
            'skipped_already_enriched': 0,
        }
    }
|
|
|
|
|
|
def save_checkpoint(checkpoint: dict):
    """Persist the checkpoint dict as pretty-printed UTF-8 JSON.

    Failures are logged but never raised, so a checkpoint-write problem
    cannot abort a long batch run. Encoding is pinned to UTF-8 so the file
    round-trips with load_checkpoint() regardless of platform locale.
    """
    try:
        with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as f:
            json.dump(checkpoint, f, indent=2)
    except Exception as e:
        logger.error(f"Failed to save checkpoint: {e}")
|
|
|
|
|
|
def process_custodian(
    filepath: Path,
    client: FirecrawlClient,
    dry_run: bool = False,
    force: bool = False,
) -> dict:
    """Enrich a single custodian YAML file in place.

    Loads the entry, resolves its website URL, scrapes the homepage and maps
    the site via Firecrawl, then writes the firecrawl_enrichment section
    (plus provenance) back into the same file.

    Returns a result dict with keys 'filepath', 'status', 'url', 'error' and,
    when enrichment ran, 'apis_found' / 'catalogs_found'. 'status' is one of:
    'success', 'partial' (scrape failed but the entry was still written),
    'error', 'dry_run', 'skipped_empty', 'skipped_already_enriched',
    'skipped_no_url'.
    """
    result = {
        'filepath': str(filepath),
        'status': 'unknown',
        'url': None,
        'error': None,
    }

    try:
        # Load entry
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            result['status'] = 'skipped_empty'
            return result

        # Check if already enriched (skipped unless --force)
        if not force and entry.get('firecrawl_enrichment'):
            result['status'] = 'skipped_already_enriched'
            return result

        # Get website URL
        url = get_website_url(entry)
        result['url'] = url

        if not url:
            result['status'] = 'skipped_no_url'
            return result

        logger.info(f"Processing {filepath.name}: {url}")

        if dry_run:
            result['status'] = 'dry_run'
            return result

        # Rate limiting: pause before each of the two API calls below.
        time.sleep(REQUEST_DELAY)

        # Scrape homepage
        scrape_data = client.scrape(url)

        # Map site URLs (with smaller limit for efficiency)
        time.sleep(REQUEST_DELAY)
        map_data = client.map_site(url, limit=200)

        # Create enrichment — written even when both calls fail, so the
        # attempt and its timestamp are recorded with success: false.
        firecrawl_enrichment = create_firecrawl_enrichment(url, scrape_data, map_data)

        # Update entry
        entry['firecrawl_enrichment'] = firecrawl_enrichment
        entry = update_provenance(entry, firecrawl_enrichment)

        # Save updated entry; sort_keys=False preserves the original key order.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        result['status'] = 'success' if firecrawl_enrichment.get('success') else 'partial'
        result['apis_found'] = len(firecrawl_enrichment.get('detected_api_endpoints', []))
        result['catalogs_found'] = len(firecrawl_enrichment.get('detected_catalog_urls', []))

    except Exception as e:
        # Best-effort batch processing: record the error and move on.
        logger.error(f"Error processing {filepath}: {e}")
        result['status'] = 'error'
        result['error'] = str(e)

    return result
|
|
|
|
|
|
def main():
    """CLI entry point: batch-enrich Dutch custodian YAML files.

    Parses options, then either processes one file (--file) or iterates the
    NL-*.yaml files in CUSTODIAN_DIR with checkpoint/resume support and a
    summary at the end.
    """
    parser = argparse.ArgumentParser(
        description='Enrich Dutch custodian files with Firecrawl web data'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be enriched without modifying files')
    parser.add_argument('--limit', type=int, default=None,
                        help='Process only first N files')
    parser.add_argument('--start-index', type=int, default=0,
                        help='Start from index N')
    parser.add_argument('--resume', action='store_true',
                        help='Resume from last checkpoint')
    parser.add_argument('--force', action='store_true',
                        help='Re-enrich even if already has firecrawl_enrichment')
    parser.add_argument('--file', type=str, default=None,
                        help='Process a single specific file')

    args = parser.parse_args()

    # Check API key
    if not FIRECRAWL_API_KEY:
        logger.error("FIRECRAWL_API_KEY environment variable not set")
        sys.exit(1)

    # Initialize client
    client = FirecrawlClient(api_key=FIRECRAWL_API_KEY)

    try:
        # Single file mode: process and return (no checkpoints, no stats).
        if args.file:
            filepath = Path(args.file)
            if not filepath.exists():
                logger.error(f"File not found: {filepath}")
                sys.exit(1)

            result = process_custodian(filepath, client, args.dry_run, args.force)
            logger.info(f"Result: {result}")
            return

        # Get list of Dutch custodian files
        files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
        total_files = len(files)
        logger.info(f"Found {total_files} Dutch custodian files")

        # Load checkpoint if resuming
        # NOTE(review): this default duplicates the one in load_checkpoint();
        # keep the two structures in sync.
        checkpoint = load_checkpoint() if args.resume else {
            'processed_files': [],
            'last_processed_index': -1,
            'stats': {
                'total_processed': 0,
                'successful': 0,
                'failed': 0,
                'skipped_no_url': 0,
                'skipped_already_enriched': 0,
            }
        }

        # Determine start index (--resume overrides --start-index)
        start_index = args.start_index
        if args.resume and checkpoint['last_processed_index'] >= 0:
            start_index = checkpoint['last_processed_index'] + 1
            logger.info(f"Resuming from index {start_index}")

        # Determine end index
        # NOTE(review): `--limit 0` is falsy here and thus means "no limit".
        end_index = total_files
        if args.limit:
            end_index = min(start_index + args.limit, total_files)

        logger.info(f"Processing files {start_index} to {end_index - 1}")

        # Process files
        for i, filepath in enumerate(files[start_index:end_index], start=start_index):
            logger.info(f"[{i + 1}/{total_files}] Processing {filepath.name}")

            result = process_custodian(filepath, client, args.dry_run, args.force)

            # Update stats ('partial' and 'dry_run' statuses count only
            # toward total_processed, not any specific bucket).
            checkpoint['stats']['total_processed'] += 1
            if result['status'] == 'success':
                checkpoint['stats']['successful'] += 1
            elif result['status'] == 'error':
                checkpoint['stats']['failed'] += 1
            elif result['status'] == 'skipped_no_url':
                checkpoint['stats']['skipped_no_url'] += 1
            elif result['status'] == 'skipped_already_enriched':
                checkpoint['stats']['skipped_already_enriched'] += 1

            checkpoint['processed_files'].append(str(filepath))
            checkpoint['last_processed_index'] = i

            # Save checkpoint every 10 files
            if (i + 1) % 10 == 0:
                save_checkpoint(checkpoint)
                logger.info(f"Checkpoint saved at index {i}")

            # Log progress
            if result['status'] in ('success', 'partial'):
                logger.info(
                    f" -> {result['status']}: "
                    f"{result.get('apis_found', 0)} APIs, "
                    f"{result.get('catalogs_found', 0)} catalogs found"
                )
            else:
                logger.info(f" -> {result['status']}")

        # Final checkpoint save
        save_checkpoint(checkpoint)

        # Print summary
        logger.info("\n=== Processing Summary ===")
        logger.info(f"Total processed: {checkpoint['stats']['total_processed']}")
        logger.info(f"Successful: {checkpoint['stats']['successful']}")
        logger.info(f"Failed: {checkpoint['stats']['failed']}")
        logger.info(f"Skipped (no URL): {checkpoint['stats']['skipped_no_url']}")
        logger.info(f"Skipped (already enriched): {checkpoint['stats']['skipped_already_enriched']}")

    finally:
        # Always release the HTTP connection pool, even on error/interrupt.
        client.close()
|
|
|
|
|
|
# Script entry point guard: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|