glam/scripts/enrich_dutch_custodians_crawl4ai.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

#!/usr/bin/env python3
"""
Enrich Dutch custodian YAML files with web data using Crawl4AI (free, local).
This script replaces the Firecrawl-based enrichment with Crawl4AI, which:
1. Runs locally using Playwright (no API costs)
2. Extracts links, metadata, and content with XPath provenance
3. Detects APIs, catalogs, and metadata standards
Usage:
python scripts/enrich_dutch_custodians_crawl4ai.py [options]
Options:
--dry-run Show what would be enriched without modifying files
--limit N Process only first N files (for testing)
--start-index N Start from index N (for resuming)
--resume Resume from last checkpoint
--force Re-enrich even if the file already has crawl4ai_enrichment
--file PATH Process a single specific file
"""
import argparse
import asyncio
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import yaml
from dotenv import load_dotenv
from lxml import etree
# Crawl4AI imports
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Load environment variables from .env file
load_dotenv()
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
CHECKPOINT_FILE = CUSTODIAN_DIR / ".crawl4ai_enrichment_checkpoint.json"
# Rate limiting - be nice to websites even though we're local
REQUEST_DELAY = 2.0 # seconds between requests
# Digital platform detection patterns
API_ENDPOINT_PATTERNS = [
r'/oai[-_]?pmh',
r'/api/',
r'/rest/',
r'/sparql',
r'/graphql',
r'/iiif/',
r'/sru',
r'/z39\.50',
r'/opensearch',
]
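# Illustrative examples (assumed URLs, not taken from crawled data):
# 'https://example.org/oai-pmh?verb=Identify' matches r'/oai[-_]?pmh', and
# 'https://example.org/iiif/2/some-image/info.json' matches r'/iiif/'.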
CATALOG_PATTERNS = [
    r'/catalog(?:us|ue|ues|s)?(?:/|\?|$)',
    r'/collecti[eo]n?[s]?(?:/|\?|$)',
    r'/archie(?:f|ven)(?:/|\?|$)',
r'/beeldbank(?:/|\?|$)',
r'/zoeken(?:/|\?|$)',
r'/search(?:/|\?|$)',
r'/discover(?:/|\?|$)',
r'/browse(?:/|\?|$)',
]
# Dutch-specific catalog type detection
CATALOG_TYPE_PATTERNS = {
'beeldbank': {
'patterns': [r'/beeldbank', r'/beeld', r'/images', r'/foto'],
'label': 'Image Collection',
'description_nl': 'Beeldbank met gedigitaliseerde foto\'s, kaarten en afbeeldingen',
},
'genealogie': {
'patterns': [r'/genealogie', r'/stamboom', r'/persons', r'/akten'],
'label': 'Genealogy Records',
'description_nl': 'Genealogische bronnen en persoonsgegevens',
},
'archieven': {
'patterns': [r'/archie[fv]', r'/inventaris', r'/toegangen', r'/finding'],
'label': 'Archive Finding Aids',
'description_nl': 'Archiefinventarissen en toegangen',
},
'collectie': {
'patterns': [r'/collectie', r'/collection', r'/object'],
'label': 'Collection Portal',
'description_nl': 'Collectieportaal met objecten en kunstwerken',
},
'kranten': {
'patterns': [r'/kranten', r'/newspaper', r'/periodiek'],
'label': 'Newspaper Archive',
'description_nl': 'Gedigitaliseerde kranten en periodieken',
},
'kaarten': {
'patterns': [r'/kaart', r'/map', r'/cartogra'],
'label': 'Map Collection',
'description_nl': 'Historische kaarten en cartografisch materiaal',
},
'bibliotheek': {
'patterns': [r'/catalogu', r'/biblio', r'/library', r'/boek'],
'label': 'Library Catalog',
'description_nl': 'Bibliotheekcatalogus',
},
'zoeken': {
'patterns': [r'/zoeken', r'/search', r'/discover', r'/browse'],
'label': 'Search Interface',
'description_nl': 'Algemene zoekinterface',
},
}
CMS_INDICATORS = {
'atlantis': ['atlantis', 'picturae'],
'mais_flexis': ['mais-flexis', 'mais flexis', 'de ree'],
'adlib': ['adlib', 'axiell'],
'collective_access': ['collectiveaccess', 'collective access'],
'archivematica': ['archivematica'],
'archivesspace': ['archivesspace'],
'atom': ['accesstomemory', 'atom'],
'omeka': ['omeka'],
'contentdm': ['contentdm'],
'dspace': ['dspace'],
'islandora': ['islandora'],
'memorix': ['memorix'],
}
# Metadata standards detection patterns with regex word boundaries
METADATA_STANDARDS_PATTERNS = [
(r'\bdublin\s+core\b', 'Dublin Core', True),
(r'\bdc:', 'Dublin Core', True),
(r'\bdcterms\b', 'Dublin Core', True),
(r'\bmarc\s*21\b', 'MARC21', True),
(r'\bmarc21\b', 'MARC21', True),
(r'\bead\b', 'EAD', True),
(r'encoded\s+archival\s+description', 'EAD', True),
(r'\bead\s*2002\b', 'EAD', True),
(r'\bead3\b', 'EAD', True),
(r'\bmets\b', 'METS', True),
(r'metadata\s+encoding\s+and\s+transmission', 'METS', True),
(r'\bmods\b', 'MODS', True),
(r'metadata\s+object\s+description', 'MODS', True),
(r'\blido\b', 'LIDO', True),
(r'lightweight\s+information\s+describing', 'LIDO', True),
(r'\bcidoc[-\s]?crm\b', 'CIDOC-CRM', True),
(r'\bschema\.org\b', 'Schema.org', True),
(r'\bschema:', 'Schema.org', True),
(r'\bric[-\s]?o\b', 'RiC-O', True),
(r'records\s+in\s+contexts', 'RiC-O', True),
(r'\bpremis\b', 'PREMIS', True),
(r'preservation\s+metadata', 'PREMIS', True),
(r'\bbibframe\b', 'BIBFRAME', True),
(r'\biiif\b', 'IIIF', True),
(r'image\s+interoperability\s+framework', 'IIIF', True),
]
# Dutch archive platform domains to detect
DUTCH_ARCHIVE_PLATFORMS = [
'archieven.nl',
'memorix.nl',
'archiefweb.eu',
'atlantisdigitaal.nl',
'picturae.nl',
'mais-flexis.nl',
'delpher.nl',
'geheugen.nl',
]
def get_xpath(element, tree) -> str:
"""Generate XPath for an lxml element."""
parts = []
while element is not None:
parent = element.getparent()
if parent is None:
parts.append(element.tag)
else:
siblings = [c for c in parent if c.tag == element.tag]
if len(siblings) == 1:
parts.append(element.tag)
else:
index = siblings.index(element) + 1
parts.append(f'{element.tag}[{index}]')
element = parent
return '/' + '/'.join(reversed(parts))
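# Illustrative example (hypothetical markup, not executed here): for the second of
# three <a> elements inside <body><div>, get_xpath returns "/html/body/div/a[2]";
# an element that is the only one of its tag in its parent gets no index, e.g.
# "/html/body/p/a".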
def normalize_url(url: str) -> str:
"""Normalize URL by removing noise query parameters."""
if not url:
return url
parsed = urlparse(url)
# Remove common tracking/session parameters
noise_params = ['sort', 'order', 'view', 'mode', 'ss', 'page', 'offset',
'limit', 'random', 'session', 'sid', 'token', 'ref']
if parsed.query:
params = dict(p.split('=', 1) if '=' in p else (p, '')
for p in parsed.query.split('&'))
filtered = {k: v for k, v in params.items()
if not any(k.startswith(n) for n in noise_params + ['utm_', 'fbclid', 'gclid'])}
new_query = '&'.join(f'{k}={v}' if v else k for k, v in sorted(filtered.items()))
return urlunparse(parsed._replace(query=new_query))
return url
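# Illustrative example (hypothetical URL): tracking and navigation parameters are
# dropped while content parameters survive, e.g.
#   normalize_url('https://example.org/zoeken?q=kaarten&page=2&utm_source=nieuwsbrief')
#   -> 'https://example.org/zoeken?q=kaarten'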
def detect_catalog_type(url: str) -> dict | None:
"""Detect catalog type from URL pattern."""
url_lower = url.lower()
for type_key, type_info in CATALOG_TYPE_PATTERNS.items():
for pattern in type_info['patterns']:
if re.search(pattern, url_lower):
return {
'type': type_key,
'label': type_info['label'],
'description_nl': type_info['description_nl'],
}
return None
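# Illustrative example (hypothetical URL): 'https://archief.example.nl/beeldbank/detail/123'
# matches the 'beeldbank' patterns first, so the function returns
# {'type': 'beeldbank', 'label': 'Image Collection', ...}.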
def detect_metadata_standards(content: str) -> list[str]:
"""Detect metadata standards mentioned in content using regex word boundaries."""
if not content:
return []
content_lower = content.lower()
standards_found = set()
for pattern, standard_name, use_regex in METADATA_STANDARDS_PATTERNS:
if use_regex:
if re.search(pattern, content_lower, re.IGNORECASE):
standards_found.add(standard_name)
else:
if pattern.lower() in content_lower:
standards_found.add(standard_name)
return sorted(list(standards_found))
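# Illustrative example (hypothetical page text): for content such as
# "Wij publiceren beschrijvingen als Dublin Core en EAD 2002 via OAI-PMH", the function
# returns ['Dublin Core', 'EAD'] (OAI-PMH is an API protocol and is not in the standards list).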
def detect_cms(content: str) -> str | None:
"""Detect CMS/collection management system from content."""
if not content:
return None
content_lower = content.lower()
for cms_name, indicators in CMS_INDICATORS.items():
for indicator in indicators:
if indicator in content_lower:
return cms_name
return None
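# Illustrative example (hypothetical page text): content mentioning "MAIS-Flexis van
# DE REE archiefsystemen" returns 'mais_flexis'. Note that the first matching indicator
# wins, so a page mentioning both Picturae and Memorix is reported as 'atlantis'.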
def extract_website_url(entry: dict) -> str | None:
"""Extract website URL from custodian entry."""
# Check various possible locations for website
    if entry.get('website'):
        return entry['website']
# Check in enrichment data
for enrichment_key in ['zcbs_enrichment', 'google_maps_enrichment', 'wikidata_enrichment']:
if enrichment_key in entry:
enrichment = entry[enrichment_key]
if isinstance(enrichment, dict):
                if enrichment.get('website'):
                    return enrichment['website']
                if enrichment.get('url'):
                    return enrichment['url']
# Check identifiers
if 'identifiers' in entry:
for identifier in entry.get('identifiers', []):
if isinstance(identifier, dict):
if identifier.get('identifier_scheme') == 'Website':
return identifier.get('identifier_value')
return None
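# Illustrative example (hypothetical entry): a minimal record such as
#   {'name': 'Stadsarchief Voorbeeld',
#    'identifiers': [{'identifier_scheme': 'Website',
#                     'identifier_value': 'https://www.stadsarchiefvoorbeeld.nl'}]}
# yields 'https://www.stadsarchiefvoorbeeld.nl' via the identifiers fallback.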
async def crawl_website(crawler: AsyncWebCrawler, url: str) -> dict:
"""
Crawl a website and extract structured data with XPath provenance.
Returns a dict with:
- success: bool
- title: str
- description: str
- html: str (raw HTML for further processing)
- markdown: str
- links: list of dicts with href, text, xpath
- metadata: dict of og/meta tags
- error: str (if failed)
"""
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
verbose=False,
# Wait for page to fully load
wait_until="networkidle",
page_timeout=30000,
)
try:
result = await crawler.arun(url=url, config=config)
if not result.success:
return {
'success': False,
'error': f'Crawl failed with status {result.status_code}',
'status_code': result.status_code,
}
# Parse HTML with lxml to extract XPaths
links_with_xpath = []
if result.html:
try:
tree = etree.HTML(result.html)
link_elements = tree.xpath('//a[@href]')
for link_el in link_elements:
href = link_el.get('href', '')
text = ''.join(link_el.itertext()).strip()
xpath = get_xpath(link_el, tree)
# Skip empty links and javascript
if href and not href.startswith(('javascript:', '#', 'mailto:', 'tel:')):
links_with_xpath.append({
'href': href,
'text': text[:200] if text else '', # Truncate long text
'xpath': xpath,
})
except Exception as e:
logger.warning(f"Error parsing HTML for XPath extraction: {e}")
# Also include crawl4ai's extracted links for completeness
internal_links = result.links.get('internal', []) if result.links else []
external_links = result.links.get('external', []) if result.links else []
return {
'success': True,
'status_code': result.status_code,
'title': result.metadata.get('title', '') if result.metadata else '',
'description': result.metadata.get('description', '') if result.metadata else '',
'html': result.html,
'markdown': result.markdown.raw_markdown if result.markdown else '',
'links_with_xpath': links_with_xpath,
'internal_links': [l.get('href', '') for l in internal_links if isinstance(l, dict)],
'external_links': [l.get('href', '') for l in external_links if isinstance(l, dict)],
'metadata': result.metadata or {},
}
except Exception as e:
logger.error(f"Error crawling {url}: {e}")
return {
'success': False,
'error': str(e),
}
def analyze_crawl_results(crawl_data: dict, base_url: str) -> dict:
"""
Analyze crawl results to extract APIs, catalogs, and metadata standards.
Returns enrichment dict ready to add to YAML.
"""
enrichment = {
'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
'retrieval_agent': 'crawl4ai',
'source_url': base_url,
'status_code': crawl_data.get('status_code'),
}
if not crawl_data.get('success'):
enrichment['error'] = crawl_data.get('error', 'Unknown error')
return enrichment
# Basic metadata
enrichment['title'] = crawl_data.get('title', '')
enrichment['description'] = crawl_data.get('description', '')
enrichment['links_count'] = len(crawl_data.get('links_with_xpath', []))
# Collect all URLs for analysis
all_urls = set()
links_with_xpath = crawl_data.get('links_with_xpath', [])
for link in links_with_xpath:
href = link.get('href', '')
if href:
# Make absolute URL if relative
if href.startswith('/'):
parsed_base = urlparse(base_url)
href = f"{parsed_base.scheme}://{parsed_base.netloc}{href}"
all_urls.add(href)
# Add internal/external links from crawl4ai
for link in crawl_data.get('internal_links', []):
if link:
all_urls.add(link)
for link in crawl_data.get('external_links', []):
if link:
all_urls.add(link)
# Detect API endpoints
detected_apis = []
for url in all_urls:
url_lower = url.lower()
for pattern in API_ENDPOINT_PATTERNS:
if re.search(pattern, url_lower):
detected_apis.append({
'url': normalize_url(url),
'pattern_matched': pattern,
})
break
if detected_apis:
enrichment['detected_api_endpoints'] = detected_apis
# Detect catalog URLs with type classification
detected_catalogs = []
for url in all_urls:
url_lower = url.lower()
for pattern in CATALOG_PATTERNS:
if re.search(pattern, url_lower):
catalog_entry = {
'url': normalize_url(url),
}
catalog_type = detect_catalog_type(url)
if catalog_type:
catalog_entry['type'] = catalog_type['type']
catalog_entry['label'] = catalog_type['label']
# Find XPath for this link
for link in links_with_xpath:
if link.get('href', '').rstrip('/') == url.rstrip('/') or \
(link.get('href', '').startswith('/') and url.endswith(link.get('href', ''))):
catalog_entry['xpath'] = link.get('xpath')
catalog_entry['link_text'] = link.get('text', '')
break
detected_catalogs.append(catalog_entry)
break
if detected_catalogs:
enrichment['detected_catalog_urls'] = detected_catalogs
# Detect external archive platforms
external_platforms = []
for url in all_urls:
url_lower = url.lower()
for platform in DUTCH_ARCHIVE_PLATFORMS:
if platform in url_lower:
external_platforms.append({
'url': normalize_url(url),
'platform': platform,
})
break
if external_platforms:
enrichment['external_archive_platforms'] = external_platforms
# Detect metadata standards from content
# Handle None values explicitly to avoid string concatenation errors
markdown = crawl_data.get('markdown') or ''
title = crawl_data.get('title') or ''
description = crawl_data.get('description') or ''
content = f"{markdown} {title} {description}"
standards = detect_metadata_standards(content)
if standards:
enrichment['detected_standards'] = standards
# Detect CMS
cms = detect_cms(content)
if cms:
enrichment['detected_cms'] = cms
# Extract OG/meta tags of interest
metadata = crawl_data.get('metadata', {})
og_data = {}
for key in ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name']:
if key in metadata:
og_data[key.replace('og:', '')] = metadata[key]
if og_data:
enrichment['open_graph'] = og_data
return enrichment
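# Sketch of the enrichment block this function can produce (illustrative values for a
# hypothetical site; detection keys are only written when something was found):
#   crawl4ai_enrichment:
#     retrieval_timestamp: '2025-01-01T12:00:00+00:00'
#     retrieval_agent: crawl4ai
#     source_url: https://www.example-archief.nl
#     status_code: 200
#     title: Voorbeeldarchief
#     links_count: 142
#     detected_api_endpoints:
#       - url: https://www.example-archief.nl/api/search
#         pattern_matched: /api/
#     detected_catalog_urls:
#       - url: https://www.example-archief.nl/beeldbank
#         type: beeldbank
#         label: Image Collection
#     detected_standards: [Dublin Core, IIIF]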
def load_checkpoint() -> dict:
"""Load checkpoint from file."""
if CHECKPOINT_FILE.exists():
with open(CHECKPOINT_FILE, 'r') as f:
return json.load(f)
return {}
def save_checkpoint(checkpoint: dict):
"""Save checkpoint to file."""
with open(CHECKPOINT_FILE, 'w') as f:
json.dump(checkpoint, f, indent=2)
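# Sketch of the checkpoint file written by the batch loop in main() (illustrative values):
#   {
#     "last_processed_index": 41,
#     "last_processed_file": "data/custodian/NL-example.yaml",
#     "last_processed_time": "2025-01-01T12:00:00+00:00",
#     "success_count": 40,
#     "error_count": 2
#   }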
async def process_single_file(
crawler: AsyncWebCrawler,
filepath: Path,
dry_run: bool = False,
force: bool = False,
) -> bool:
"""Process a single custodian YAML file."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
entry = yaml.safe_load(f)
if not entry:
logger.warning(f"Empty file: {filepath}")
return False
# Check if already enriched
if 'crawl4ai_enrichment' in entry and not force:
logger.info(f"Skipping {filepath.name}: already has crawl4ai_enrichment")
return True
# Extract website URL
website_url = extract_website_url(entry)
if not website_url:
logger.info(f"Skipping {filepath.name}: no website URL found")
return False
# Ensure URL has protocol
if not website_url.startswith(('http://', 'https://')):
website_url = 'https://' + website_url
logger.info(f"Processing {filepath.name}: {website_url}")
if dry_run:
logger.info(f" -> DRY RUN: would crawl {website_url}")
return True
# Crawl the website
crawl_data = await crawl_website(crawler, website_url)
# Analyze results
enrichment = analyze_crawl_results(crawl_data, website_url)
# Add enrichment to entry
entry['crawl4ai_enrichment'] = enrichment
# Write back to file
with open(filepath, 'w', encoding='utf-8') as f:
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
# Log summary
apis_count = len(enrichment.get('detected_api_endpoints', []))
catalogs_count = len(enrichment.get('detected_catalog_urls', []))
platforms_count = len(enrichment.get('external_archive_platforms', []))
logger.info(f" -> success: {apis_count} APIs, {catalogs_count} catalogs, {platforms_count} external platforms found")
return True
except Exception as e:
logger.error(f"Error processing {filepath}: {e}")
return False
async def main():
parser = argparse.ArgumentParser(description='Enrich Dutch custodians with Crawl4AI')
parser.add_argument('--dry-run', action='store_true', help='Show what would be enriched')
parser.add_argument('--limit', type=int, help='Process only first N files')
parser.add_argument('--start-index', type=int, default=0, help='Start from index N')
parser.add_argument('--resume', action='store_true', help='Resume from last checkpoint')
parser.add_argument('--force', action='store_true', help='Re-enrich even if already enriched')
parser.add_argument('--file', type=str, help='Process a single specific file')
args = parser.parse_args()
# Create logs directory
logs_dir = Path(__file__).parent.parent / "logs"
logs_dir.mkdir(exist_ok=True)
# Add file handler for logging
log_file = logs_dir / f"crawl4ai_enrichment_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
# Single file mode
if args.file:
filepath = Path(args.file)
if not filepath.exists():
logger.error(f"File not found: {filepath}")
sys.exit(1)
async with AsyncWebCrawler() as crawler:
success = await process_single_file(crawler, filepath, args.dry_run, args.force)
sys.exit(0 if success else 1)
# Batch mode
files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
logger.info(f"Found {len(files)} Dutch custodian files")
# Handle resume
start_index = args.start_index
if args.resume:
checkpoint = load_checkpoint()
if 'last_processed_index' in checkpoint:
start_index = checkpoint['last_processed_index'] + 1
logger.info(f"Resuming from index {start_index}")
# Apply limit
end_index = len(files)
if args.limit:
end_index = min(start_index + args.limit, len(files))
logger.info(f"Processing files {start_index} to {end_index - 1}")
# Process files
success_count = 0
error_count = 0
async with AsyncWebCrawler() as crawler:
for i, filepath in enumerate(files[start_index:end_index], start=start_index):
logger.info(f"[{i + 1}/{len(files)}] Processing {filepath.name}")
success = await process_single_file(crawler, filepath, args.dry_run, args.force)
if success:
success_count += 1
else:
error_count += 1
# Save checkpoint
if not args.dry_run:
save_checkpoint({
'last_processed_index': i,
'last_processed_file': str(filepath),
'last_processed_time': datetime.now(timezone.utc).isoformat(),
'success_count': success_count,
'error_count': error_count,
})
# Rate limiting
if i < end_index - 1:
await asyncio.sleep(REQUEST_DELAY)
# Summary
logger.info(f"\n{'='*50}")
logger.info(f"Enrichment complete!")
logger.info(f" Success: {success_count}")
logger.info(f" Errors: {error_count}")
logger.info(f" Log file: {log_file}")
if __name__ == '__main__':
asyncio.run(main())