feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
This commit is contained in:
parent
70c30a52d4
commit
0c36429257
15 changed files with 4881 additions and 11 deletions
371
scripts/batch_crawl4ai_recrawl.py
Normal file
371
scripts/batch_crawl4ai_recrawl.py
Normal file
|
|
@ -0,0 +1,371 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Batch re-crawl failed URLs using crawl4ai (free, local) and transform to digital_platform_v2.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Reads the list of failed crawl URLs
|
||||||
|
2. Uses crawl4ai to fetch content (free, no API limits)
|
||||||
|
3. Transforms results to digital_platform_v2 format
|
||||||
|
4. Updates the custodian YAML files
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/batch_crawl4ai_recrawl.py --limit 100 --start 0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
# Configuration
# NOTE(review): absolute, user-specific paths — consider making these
# configurable (CLI flag or environment variable) before sharing the script.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")

# Platform type detection patterns: regex fragments matched (re.search) against
# the page URL and its internal links. Dict insertion order matters —
# detect_platform_type() returns the FIRST type whose pattern matches, so
# DISCOVERY_PORTAL takes precedence over DIGITAL_ARCHIVE even though both
# contain 'archief'-style patterns.
PLATFORM_PATTERNS = {
    'DISCOVERY_PORTAL': [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/archief', r'/archive',
        r'/beeldbank', r'/images', r'/foto', r'/photo',
    ],
    'DIGITAL_ARCHIVE': [
        r'archieven\.nl', r'archief', r'archive',
        r'/inventaris', r'/inventory', r'/toegang',
    ],
    'EDUCATION': [
        r'/educatie', r'/education', r'/onderwijs', r'/leren',
        r'/scholen', r'/schools', r'/lesmateriaal',
    ],
    'INSTITUTIONAL_WEBSITE': [
        r'/over-ons', r'/about', r'/contact', r'/bezoek',
        r'/visit', r'/openingstijden', r'/hours',
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_platform_type(url: str, links: list[str] | None = None) -> str:
    """Classify a site by regex-matching its URL and internal links.

    Returns the first PLATFORM_PATTERNS key whose patterns match any
    candidate URL (dict insertion order decides precedence); falls back
    to 'INSTITUTIONAL_WEBSITE' when nothing matches.
    """
    candidates = [url.lower()]
    candidates.extend(link.lower() for link in (links or []))

    for kind, patterns in PLATFORM_PATTERNS.items():
        if any(re.search(pat, cand) for pat in patterns for cand in candidates):
            return kind

    return 'INSTITUTIONAL_WEBSITE'
|
||||||
|
|
||||||
|
|
||||||
|
def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Return up to 10 same-site links that look like collection/catalog pages.

    Same-site is a loose substring test on netloc in either direction, which
    also admits relative links (their netloc is empty). Order of *links* is
    preserved; duplicates are dropped.
    """
    patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]
    own_domain = urlparse(base_url).netloc
    found: list[str] = []

    for candidate in links:
        try:
            netloc = urlparse(candidate).netloc
            if own_domain not in netloc and netloc not in own_domain:
                continue  # off-site link
            lowered = candidate.lower()
            if any(re.search(pat, lowered) for pat in patterns) and candidate not in found:
                found.append(candidate)
        except Exception:
            continue  # unparseable URL — skip silently

    return found[:10]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict]:
    """Collect up to 5 links that point at known external heritage platforms.

    Skips links on the crawled site itself (loose substring match on netloc,
    'www.' stripped) and records each external domain at most once.
    """
    known_platforms = {
        'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'},
        'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'},
        'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'},
        'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'},
        'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'},
        'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'},
    }
    own_domain = urlparse(base_url).netloc
    results: list[dict] = []
    seen: set[str] = set()

    for candidate in links:
        try:
            host = urlparse(candidate).netloc.replace('www.', '')
            # Same-site (or relative: empty host) — not auxiliary.
            if own_domain in host or host in own_domain:
                continue
            if host in seen:
                continue
            for needle, info in known_platforms.items():
                if needle in host:
                    seen.add(host)
                    results.append({
                        'platform_name': info['name'],
                        'platform_url': candidate,
                        'platform_type': info['type'],
                        'integration_type': 'external_aggregator',
                    })
                    break
        except Exception:
            continue  # unparseable URL — skip silently

    return results[:5]
|
||||||
|
|
||||||
|
|
||||||
|
def is_generic_title(title: str) -> bool:
    """Return True when *title* is too generic to serve as a platform name.

    Generic means: empty, shorter than 3 characters, or a boilerplate word
    ('home', 'welkom', ...) alone or followed by a ' -' / ' |' separator.
    """
    if not title:
        return True

    boilerplate = (
        'home', 'homepage', 'welkom', 'welcome', 'startpagina',
        'index', 'main', 'website', 'webpagina', 'homepagina',
    )
    normalized = title.lower().strip()
    looks_generic = any(
        normalized == word
        or normalized.startswith(f"{word} -")
        or normalized.startswith(f"{word} |")
        for word in boilerplate
    )
    return looks_generic or len(title) < 3
|
||||||
|
|
||||||
|
|
||||||
|
def transform_to_platform_v2(crawl_result, source_url: str, org_name: str) -> dict[str, Any]:
    """Transform a crawl4ai result into the digital_platform_v2 structure.

    Args:
        crawl_result: crawl4ai result object; this code reads its
            ``metadata`` (dict), ``links`` (dict with an 'internal' list of
            {'href': ...} dicts), and ``status_code`` attributes.
        source_url: the URL that was crawled; becomes ``platform_url`` and
            seeds the platform id and type detection.
        org_name: fallback platform title when page metadata only offers
            generic titles.

    Returns:
        dict with 'transformation_metadata' and 'primary_platform', plus
        optional 'collection_urls', 'auxiliary_platforms', and
        'navigation_links' keys when those were found.
    """
    metadata = crawl_result.metadata or {}

    # Get internal links (hrefs of same-site links reported by crawl4ai).
    internal_links = []
    if crawl_result.links:
        internal_links = [l.get('href', '') for l in crawl_result.links.get('internal', []) if l.get('href')]

    # Extract title, checking for generic titles. Candidates are ordered by
    # preference; the ' - ' / ' | ' splits strip site-name suffixes.
    # NOTE(review): if metadata has 'title': None, .get('title', '') returns
    # None and .split() would raise — assumes crawl4ai never emits that.
    candidate_titles = [
        metadata.get('og:title'),
        metadata.get('title', '').split(' - ')[0].strip(),
        metadata.get('title', '').split(' | ')[0].strip(),
        metadata.get('og:site_name'),
    ]

    title = org_name  # Default fallback
    for candidate in candidate_titles:
        if candidate and not is_generic_title(candidate):
            title = candidate
            break

    # Generate platform ID, e.g. 'primary_website_stadsarchief_nl'.
    domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_')
    platform_id = f"primary_website_{domain}"

    # Detect platform type from the URL and internal links.
    platform_type = detect_platform_type(source_url, internal_links)

    # Extract collection URLs (same-site catalog/collection pages).
    collection_urls = extract_collection_urls(internal_links, source_url)

    # Extract auxiliary platforms (known external aggregators/portals).
    auxiliary_platforms = extract_auxiliary_platforms(internal_links, source_url)

    # Build digital_platform_v2 structure.
    platform_v2: dict[str, Any] = {
        'transformation_metadata': {
            'transformed_from': 'crawl4ai_recrawl',
            'transformation_date': datetime.now(timezone.utc).isoformat(),
            'transformation_version': '2.0',
            'source_status_code': crawl_result.status_code,
        },
        'primary_platform': {
            'platform_id': platform_id,
            # Append ' Website' unless the title already mentions it.
            'platform_name': f"{title} Website" if 'website' not in title.lower() else title,
            'platform_url': source_url,
            'platform_type': platform_type,
            'description': metadata.get('description') or metadata.get('og:description', ''),
            'language': metadata.get('language', 'nl'),
            'og_image': metadata.get('og:image'),
            'favicon': metadata.get('favicon'),
        },
    }

    if collection_urls:
        platform_v2['primary_platform']['collection_urls'] = collection_urls

    if auxiliary_platforms:
        platform_v2['auxiliary_platforms'] = auxiliary_platforms

    if internal_links:
        platform_v2['navigation_links'] = internal_links[:20]

    return platform_v2
|
||||||
|
|
||||||
|
|
||||||
|
def update_custodian_file(filepath: Path, platform_v2: dict) -> bool:
    """Write *platform_v2* into the custodian YAML file at *filepath*.

    Adds/overwrites the ``digital_platform_v2`` section and, when a
    ``crawl4ai_enrichment`` section already exists, stamps it with the
    recrawl tool and date.

    Returns True on success, False on any error; errors are printed rather
    than raised so a batch run continues with the next file.
    """
    try:
        # Explicit UTF-8 on both reads and writes: yaml.dump(allow_unicode=True)
        # emits raw Unicode (Dutch names, diacritics), which fails under a
        # non-UTF-8 default locale encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            data = {}  # empty file -> start a fresh document

        data['digital_platform_v2'] = platform_v2

        # Record that this custodian was re-crawled with crawl4ai v2.
        if 'crawl4ai_enrichment' in data:
            data['crawl4ai_enrichment']['recrawled_with'] = 'crawl4ai_v2'
            data['crawl4ai_enrichment']['recrawl_date'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True

    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_failed_urls() -> list[tuple[str, str]]:
    """Parse FAILED_URLS_FILE into (custodian filename, url) pairs.

    Expects one tab-separated pair per line; lines without a tab are
    silently skipped.
    """
    pairs: list[tuple[str, str]] = []
    with open(FAILED_URLS_FILE, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if '\t' not in stripped:
                continue
            name, _, target = stripped.partition('\t')
            pairs.append((name, target))
    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def get_org_name(filepath: Path) -> str:
    """Best-effort organization name from a custodian YAML file.

    Tries, in order: ``original_entry.organisatie``, ``custodian_name``
    (emic_name, then preferred_name), then a top-level ``name``. Falls back
    to the last '-'-separated token of the filename stem; any read/parse
    error falls back to the bare stem.
    """
    try:
        with open(filepath, 'r') as src:
            data = yaml.safe_load(src)

        if data:
            if 'original_entry' in data and data['original_entry'].get('organisatie'):
                return data['original_entry']['organisatie']
            if 'custodian_name' in data:
                naming = data['custodian_name']
                return naming.get('emic_name', '') or naming.get('preferred_name', '')
            if 'name' in data:
                return data['name']

        # No usable name field: derive one from the filename.
        fallback = filepath.stem
        segments = fallback.split('-')
        return segments[-1] if segments else fallback

    except Exception:
        return filepath.stem
|
||||||
|
|
||||||
|
|
||||||
|
async def scrape_single_url(crawler: AsyncWebCrawler, url: str) -> Any:
    """Fetch *url* with crawl4ai; return the result object, or None on failure.

    Both crawl-level failures (result.success is False) and raised exceptions
    are reported on stdout and mapped to None so callers can treat them alike.
    """
    try:
        outcome = await crawler.arun(url, verbose=False)
        if not outcome.success:
            print(f" Crawl failed: {outcome.error_message}")
            return None
        return outcome
    except Exception as exc:
        print(f" Exception: {exc}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def main_async(args):
    """Drive the batch recrawl: load failed URLs, scrape each, update YAML.

    *args* is the argparse namespace from main() (start, limit, dry_run,
    delay). Skips files that are missing or already enriched; prints a
    progress line every 50 URLs and a final summary.
    """
    all_urls = load_failed_urls()
    print(f"Loaded {len(all_urls)} failed URLs")

    # Slice the work window from --start / --limit (0 = no limit).
    if args.limit > 0:
        urls_to_process = all_urls[args.start:args.start + args.limit]
    else:
        urls_to_process = all_urls[args.start:]

    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})")

    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]")
        for filename, url in urls_to_process[:10]:
            # NOTE(review): '(unknown)' looks like a lost placeholder — the
            # unpacked `filename` is never printed. Confirm intended output.
            print(f" Would scrape: (unknown) -> {url}")
        # NOTE(review): prints a negative count when fewer than 10 URLs remain.
        print(f" ... and {len(urls_to_process) - 10} more")
        return

    success_count = 0
    fail_count = 0
    skip_count = 0

    # One shared crawler for the whole batch (context manager handles cleanup).
    async with AsyncWebCrawler(verbose=False) as crawler:
        for i, (filename, url) in enumerate(urls_to_process):
            filepath = CUSTODIAN_DIR / filename

            # NOTE(review): '(unknown)' — same lost-placeholder suspicion as above.
            print(f"\n[{i+1}/{len(urls_to_process)}] (unknown)")
            print(f" URL: {url}")

            if not filepath.exists():
                print(f" SKIP: File not found")
                skip_count += 1
                continue

            # Check if already has digital_platform_v2 — cheap substring test
            # avoids a full YAML parse per file.
            with open(filepath, 'r') as f:
                content = f.read()
            if 'digital_platform_v2:' in content:
                print(f" SKIP: Already has digital_platform_v2")
                skip_count += 1
                continue

            org_name = get_org_name(filepath)

            result = await scrape_single_url(crawler, url)

            if result:
                platform_v2 = transform_to_platform_v2(result, url, org_name)

                if update_custodian_file(filepath, platform_v2):
                    success_count += 1
                    print(f" SUCCESS: {platform_v2['primary_platform']['platform_name']}")
                else:
                    fail_count += 1
            else:
                fail_count += 1
                print(f" FAILED: Could not scrape URL")

            # Small delay to be polite to the target servers.
            await asyncio.sleep(args.delay)

            if (i + 1) % 50 == 0:
                print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, skip={skip_count}, fail={fail_count}) ===\n")

    print(f"\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: parse arguments and run the async batch recrawl."""
    parser = argparse.ArgumentParser(description='Batch re-crawl failed URLs with crawl4ai')
    parser.add_argument('--start', type=int, default=0, help='Starting index')
    parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--delay', type=float, default=0.5, help='Delay between requests in seconds')

    # Hand the parsed namespace straight to the async driver.
    asyncio.run(main_async(parser.parse_args()))


if __name__ == '__main__':
    main()
|
||||||
434
scripts/batch_firecrawl_recrawl.py
Normal file
434
scripts/batch_firecrawl_recrawl.py
Normal file
|
|
@ -0,0 +1,434 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Batch re-crawl failed URLs using Firecrawl and transform to digital_platform_v2.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Reads the list of failed crawl URLs
|
||||||
|
2. Uses Firecrawl batch_scrape or individual scrape to fetch content
|
||||||
|
3. Transforms results to digital_platform_v2 format
|
||||||
|
4. Updates the custodian YAML files
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/batch_firecrawl_recrawl.py --batch-size 50 --start 0
|
||||||
|
|
||||||
|
Firecrawl API reference: https://docs.firecrawl.dev/api-reference/endpoint/scrape
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Configuration
# NOTE(review): absolute, user-specific paths — consider making these
# configurable before sharing the script.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")
# Firecrawl credentials/endpoint; the key must be supplied via the
# environment — main() exits early when it is empty.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY", "")
FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v1"

# Platform type detection patterns: regex fragments matched (re.search)
# against the page URL and its links. Dict insertion order matters —
# detect_platform_type() returns the FIRST matching type.
PLATFORM_PATTERNS = {
    'DISCOVERY_PORTAL': [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/archief', r'/archive',
        r'/beeldbank', r'/images', r'/foto', r'/photo',
    ],
    'DIGITAL_ARCHIVE': [
        r'archieven\.nl', r'archief', r'archive',
        r'/inventaris', r'/inventory', r'/toegang',
    ],
    'EDUCATION': [
        r'/educatie', r'/education', r'/onderwijs', r'/leren',
        r'/scholen', r'/schools', r'/lesmateriaal',
    ],
    'INSTITUTIONAL_WEBSITE': [
        r'/over-ons', r'/about', r'/contact', r'/bezoek',
        r'/visit', r'/openingstijden', r'/hours',
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_platform_type(url: str, links: list[str] | None = None) -> str:
    """Pick a platform type by matching PLATFORM_PATTERNS against URLs.

    The page URL plus every extracted link is a candidate; the first
    pattern hit (in dict insertion order) wins. Defaults to
    'INSTITUTIONAL_WEBSITE'.
    """
    haystacks = [u.lower() for u in [url, *(links or [])]]

    for kind, patterns in PLATFORM_PATTERNS.items():
        for pat in patterns:
            if any(re.search(pat, hay) for hay in haystacks):
                return kind

    return 'INSTITUTIONAL_WEBSITE'
|
||||||
|
|
||||||
|
|
||||||
|
def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Return up to 10 same-site links that look like collection/catalog pages.

    Same-site is a loose netloc substring test in either direction, which
    also admits relative links (empty netloc). Input order is preserved and
    duplicates are dropped.
    """
    collection_res = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]
    own_domain = urlparse(base_url).netloc
    matches: list[str] = []

    for link in links:
        try:
            link_domain = urlparse(link).netloc
            same_site = own_domain in link_domain or link_domain in own_domain
            if not same_site:
                continue
            lowered = link.lower()
            for rx in collection_res:
                if re.search(rx, lowered):
                    if link not in matches:
                        matches.append(link)
                    break
        except Exception:
            continue  # unparseable URL — skip silently

    return matches[:10]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict]:
    """Collect up to 5 links pointing at known external heritage platforms.

    Links on the crawled site itself are skipped (loose netloc substring
    match, 'www.' stripped), and each external domain is recorded once.
    """
    known = {
        'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'},
        'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'},
        'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'},
        'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'},
        'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'},
        'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'},
        'archiefweb.eu': {'name': 'Archiefweb', 'type': 'DIGITAL_ARCHIVE'},
    }
    own_domain = urlparse(base_url).netloc
    found: list[dict] = []
    seen: set[str] = set()

    for link in links:
        try:
            host = urlparse(link).netloc.replace('www.', '')
        except Exception:
            continue  # unparseable URL — skip silently

        # Same-site (or relative: empty host) — not auxiliary.
        if own_domain in host or host in own_domain:
            continue
        if host in seen:
            continue

        for needle, info in known.items():
            if needle in host:
                seen.add(host)
                found.append({
                    'platform_name': info['name'],
                    'platform_url': link,
                    'platform_type': info['type'],
                    'integration_type': 'external_aggregator',
                })
                break

    return found[:5]
|
||||||
|
|
||||||
|
|
||||||
|
def is_generic_title(title: str) -> bool:
    """Check if a title is too generic to use as platform name.

    A title is generic when it is empty, shorter than 3 characters, or a
    boilerplate word ('home', 'welkom', ...) alone or followed by a
    ' -' / ' |' separator.
    """
    generic_patterns = [
        'home', 'homepage', 'welkom', 'welcome', 'startpagina',
        'index', 'main', 'website', 'webpagina', 'web page',
    ]
    if not title:
        return True
    title_lower = title.lower().strip()
    # Check if title is just one of the generic patterns.
    for pattern in generic_patterns:
        # Fixed: the '-' branch used exact equality (== f"{pattern} -"),
        # which almost never matched after strip(); use startswith, matching
        # the '|' branch here and the sibling batch_crawl4ai_recrawl.py.
        if title_lower == pattern or title_lower.startswith(f"{pattern} -") or title_lower.startswith(f"{pattern} |"):
            return True
    return len(title) < 3
|
||||||
|
|
||||||
|
|
||||||
|
def transform_to_platform_v2(scrape_result: dict, source_url: str, org_name: str) -> dict[str, Any]:
    """Transform a Firecrawl scrape result into the digital_platform_v2 structure.

    Args:
        scrape_result: the 'data' payload from Firecrawl's /scrape response;
            this code reads its 'metadata' dict, 'links' list, and
            'markdown' string.
        source_url: the URL that was scraped; becomes ``platform_url`` and
            seeds the platform id and type detection.
        org_name: fallback platform title when page metadata only offers
            generic titles.

    Returns:
        dict with 'transformation_metadata' and 'primary_platform', plus
        optional 'collection_urls', 'auxiliary_platforms', and
        'navigation_links' keys when those were found.
    """
    metadata = scrape_result.get('metadata', {})
    links = scrape_result.get('links', [])
    # NOTE(review): markdown is read but never used below — dead variable?
    markdown = scrape_result.get('markdown', '')

    # Extract title from metadata, checking for generic titles. Both the
    # camelCase (Firecrawl) and colon-form (raw OpenGraph) keys are tried;
    # the ' - ' / ' | ' splits strip site-name suffixes.
    candidate_titles = [
        metadata.get('ogTitle'),
        metadata.get('title', '').split(' - ')[0].strip(),
        metadata.get('title', '').split(' | ')[0].strip(),
        metadata.get('og:title'),
        metadata.get('ogSiteName'),
        metadata.get('og:site_name'),
    ]

    # Find first non-generic title
    title = org_name  # Default fallback
    for candidate in candidate_titles:
        if candidate and not is_generic_title(candidate):
            title = candidate
            break

    # Generate platform ID, e.g. 'primary_website_stadsarchief_nl'.
    domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_')
    platform_id = f"primary_website_{domain}"

    # Detect platform type
    platform_type = detect_platform_type(source_url, links)

    # Extract collection URLs
    collection_urls = extract_collection_urls(links, source_url)

    # Extract auxiliary platforms
    auxiliary_platforms = extract_auxiliary_platforms(links, source_url)

    # Build digital_platform_v2 structure
    platform_v2 = {
        'transformation_metadata': {
            'transformed_from': 'firecrawl_scrape',
            'transformation_date': datetime.now(timezone.utc).isoformat(),
            'transformation_version': '2.0',
            'source_status_code': metadata.get('statusCode', 200),
        },
        'primary_platform': {
            'platform_id': platform_id,
            # Append ' Website' unless the title already mentions it.
            'platform_name': f"{title} Website" if 'website' not in title.lower() else title,
            'platform_url': source_url,
            'platform_type': platform_type,
            'description': metadata.get('description') or metadata.get('ogDescription', ''),
            'language': metadata.get('language', 'nl'),
            'og_image': metadata.get('ogImage') or metadata.get('og:image'),
            'favicon': metadata.get('favicon'),
        },
    }

    # Add collection URLs if found
    if collection_urls:
        platform_v2['primary_platform']['collection_urls'] = collection_urls

    # Add auxiliary platforms if found
    if auxiliary_platforms:
        platform_v2['auxiliary_platforms'] = auxiliary_platforms

    # Add internal navigation links (sample).
    # NOTE(review): this substring test (netloc in netloc) admits relative
    # links (empty netloc) and differs from extract_collection_urls's
    # two-way check — confirm that is intended.
    internal_links = [
        l for l in links
        if urlparse(l).netloc in urlparse(source_url).netloc
    ][:20]
    if internal_links:
        platform_v2['navigation_links'] = internal_links

    return platform_v2
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_single_url(url: str, client: httpx.Client, max_retries: int = 3) -> dict | None:
    """Scrape a single URL using Firecrawl API with retry on rate limit.

    Args:
        url: the page to scrape.
        client: httpx client pre-configured with the Firecrawl auth headers.
        max_retries: total attempts; HTTP 429 and transport exceptions are
            retried, other HTTP errors abort immediately.

    Returns:
        The Firecrawl 'data' payload (dict) on success, None on any failure.
    """
    for attempt in range(max_retries):
        try:
            response = client.post(
                f"{FIRECRAWL_BASE_URL}/scrape",
                json={
                    'url': url,
                    'formats': ['markdown', 'links'],
                    'onlyMainContent': True,
                },
                timeout=60.0,
            )

            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    return data.get('data', {})
                # A 200 with success=False falls through to the generic
                # error print below ("Error 200: ...") and returns None.

            # Handle rate limiting (429) with linear backoff: 15s, 30s, 45s.
            if response.status_code == 429:
                wait_time = 15 * (attempt + 1)  # 15s, 30s, 45s
                print(f" Rate limited, waiting {wait_time}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(wait_time)
                continue

            # Any other status is treated as fatal for this URL.
            print(f" Error {response.status_code}: {response.text[:200]}")
            return None

        except Exception as e:
            # Network/transport errors get a short fixed backoff before retry.
            print(f" Exception: {e}")
            if attempt < max_retries - 1:
                time.sleep(5)
                continue
            return None

    # Only reached when every attempt ended in a 429 retry.
    print(f" Max retries exceeded")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def update_custodian_file(filepath: Path, platform_v2: dict) -> bool:
    """Write *platform_v2* into the custodian YAML file at *filepath*.

    Adds/overwrites the ``digital_platform_v2`` section and, when a
    ``crawl4ai_enrichment`` section already exists, stamps it with the
    recrawl tool and date.

    Returns True on success, False on any error; errors are printed rather
    than raised so a batch run continues with the next file.
    """
    try:
        # Explicit UTF-8 on both reads and writes: yaml.dump(allow_unicode=True)
        # emits raw Unicode (Dutch names, diacritics), which fails under a
        # non-UTF-8 default locale encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if data is None:
            data = {}  # empty file -> start a fresh document

        # Add digital_platform_v2 section
        data['digital_platform_v2'] = platform_v2

        # Update crawl4ai_enrichment status
        if 'crawl4ai_enrichment' in data:
            data['crawl4ai_enrichment']['recrawled_with'] = 'firecrawl'
            data['crawl4ai_enrichment']['recrawl_date'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return True

    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_failed_urls() -> list[tuple[str, str]]:
    """Parse FAILED_URLS_FILE into (custodian filename, url) pairs.

    Expects one tab-separated pair per line; lines without a tab are
    silently skipped.
    """
    with open(FAILED_URLS_FILE, 'r') as handle:
        stripped = [line.strip() for line in handle]
    return [tuple(entry.split('\t', 1)) for entry in stripped if '\t' in entry]
|
||||||
|
|
||||||
|
|
||||||
|
def get_org_name(filepath: Path) -> str:
    """Best-effort organization name from a custodian YAML file.

    Tries, in order: ``original_entry.organisatie``, ``custodian_name``
    (emic_name, then preferred_name), then a top-level ``name``. Falls back
    to the last '-'-separated token of the filename stem; any read/parse
    error falls back to the bare stem.
    """
    try:
        with open(filepath, 'r') as handle:
            doc = yaml.safe_load(handle)

        # Try the known name fields, most authoritative first.
        if doc:
            if 'original_entry' in doc and doc['original_entry'].get('organisatie'):
                return doc['original_entry']['organisatie']
            if 'custodian_name' in doc:
                cn = doc['custodian_name']
                return cn.get('emic_name', '') or cn.get('preferred_name', '')
            if 'name' in doc:
                return doc['name']

        # Fallback: derive a name from the filename.
        tokens = filepath.stem.split('-')
        return tokens[-1] if tokens else filepath.stem

    except Exception:
        return filepath.stem
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: batch re-crawl failed URLs via the Firecrawl API.

    Loads the failed-URL list, slices it by --start/--limit, scrapes each
    URL, transforms the result, and writes it back into the custodian YAML
    file. Requires FIRECRAWL_API_KEY in the environment.
    """
    parser = argparse.ArgumentParser(description='Batch re-crawl failed URLs with Firecrawl')
    # NOTE(review): --batch-size is accepted but never used below.
    parser.add_argument('--batch-size', type=int, default=50, help='Number of URLs per batch')
    parser.add_argument('--start', type=int, default=0, help='Starting index')
    parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--delay', type=float, default=6.0, help='Delay between requests in seconds (default 6 for rate limits)')
    args = parser.parse_args()

    if not FIRECRAWL_API_KEY:
        print("Error: FIRECRAWL_API_KEY environment variable not set")
        sys.exit(1)

    # Load URLs
    all_urls = load_failed_urls()
    print(f"Loaded {len(all_urls)} failed URLs")

    # Slice based on start and limit (0 = no limit).
    if args.limit > 0:
        urls_to_process = all_urls[args.start:args.start + args.limit]
    else:
        urls_to_process = all_urls[args.start:]

    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})")

    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]")
        for filename, url in urls_to_process[:10]:
            # NOTE(review): '(unknown)' looks like a lost placeholder — the
            # unpacked `filename` is never printed. Confirm intended output.
            print(f" Would scrape: (unknown) -> {url}")
        # NOTE(review): prints a negative count when fewer than 10 URLs remain.
        print(f" ... and {len(urls_to_process) - 10} more")
        return

    # Create HTTP client pre-authorized for the Firecrawl API.
    client = httpx.Client(
        headers={
            'Authorization': f'Bearer {FIRECRAWL_API_KEY}',
            'Content-Type': 'application/json',
        }
    )

    success_count = 0
    fail_count = 0
    # NOTE(review): skipped files are not counted separately here, unlike
    # the crawl4ai variant of this script which tracks skip_count.

    try:
        for i, (filename, url) in enumerate(urls_to_process):
            filepath = CUSTODIAN_DIR / filename

            # NOTE(review): '(unknown)' — same lost-placeholder suspicion as above.
            print(f"\n[{i+1}/{len(urls_to_process)}] (unknown)")
            print(f" URL: {url}")

            if not filepath.exists():
                print(f" SKIP: File not found")
                continue

            # Check if already has digital_platform_v2 — cheap substring
            # test avoids a full YAML parse per file.
            with open(filepath, 'r') as f:
                content = f.read()
            if 'digital_platform_v2:' in content:
                print(f" SKIP: Already has digital_platform_v2")
                continue

            # Get org name for platform naming
            org_name = get_org_name(filepath)

            # Scrape URL
            result = scrape_single_url(url, client)

            if result:
                # Transform to platform_v2
                platform_v2 = transform_to_platform_v2(result, url, org_name)

                # Update file
                if update_custodian_file(filepath, platform_v2):
                    success_count += 1
                    print(f" SUCCESS: {platform_v2['primary_platform']['platform_name']}")
                else:
                    fail_count += 1
            else:
                fail_count += 1
                print(f" FAILED: Could not scrape URL")

            # Rate limiting between requests (Firecrawl quota).
            time.sleep(args.delay)

            # Progress update every 50 URLs
            if (i + 1) % 50 == 0:
                print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, fail={fail_count}) ===\n")

    finally:
        client.close()  # always release the HTTP connection pool

    print(f"\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")


if __name__ == '__main__':
    main()
|
||||||
488
scripts/batch_httpx_scrape.py
Normal file
488
scripts/batch_httpx_scrape.py
Normal file
|
|
@ -0,0 +1,488 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Batch web scraper using httpx + BeautifulSoup for digital_platform_v2 enrichment.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Reads the list of failed crawl URLs
|
||||||
|
2. Uses httpx to fetch HTML content directly (no browser, no external API)
|
||||||
|
3. Uses BeautifulSoup to parse and extract metadata
|
||||||
|
4. Transforms results to digital_platform_v2 format
|
||||||
|
5. Updates the custodian YAML files
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/batch_httpx_scrape.py --limit 10
|
||||||
|
python scripts/batch_httpx_scrape.py --start 100 --limit 50
|
||||||
|
python scripts/batch_httpx_scrape.py --dry-run
|
||||||
|
|
||||||
|
No API keys or external services required!
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import yaml
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
||||||
|
FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt")
|
||||||
|
|
||||||
|
# User agent to mimic a real browser
|
||||||
|
USER_AGENT = (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Platform type detection patterns
|
||||||
|
PLATFORM_PATTERNS: dict[str, list[str]] = {
|
||||||
|
'DISCOVERY_PORTAL': [
|
||||||
|
r'/collectie', r'/collection', r'/catalogus', r'/catalog',
|
||||||
|
r'/zoeken', r'/search', r'/archief', r'/archive',
|
||||||
|
r'/beeldbank', r'/images', r'/foto', r'/photo',
|
||||||
|
],
|
||||||
|
'DIGITAL_ARCHIVE': [
|
||||||
|
r'archieven\.nl', r'archief', r'archive',
|
||||||
|
r'/inventaris', r'/inventory', r'/toegang',
|
||||||
|
],
|
||||||
|
'EDUCATION': [
|
||||||
|
r'/educatie', r'/education', r'/onderwijs', r'/leren',
|
||||||
|
r'/scholen', r'/schools', r'/lesmateriaal',
|
||||||
|
],
|
||||||
|
'INSTITUTIONAL_WEBSITE': [
|
||||||
|
r'/over-ons', r'/about', r'/contact', r'/bezoek',
|
||||||
|
r'/visit', r'/openingstijden', r'/hours',
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_platform_type(url: str, links: list[str] | None = None) -> str:
|
||||||
|
"""Detect the platform type based on URL patterns and extracted links."""
|
||||||
|
url_lower = url.lower()
|
||||||
|
all_urls = [url_lower] + [link.lower() for link in (links or [])]
|
||||||
|
|
||||||
|
for platform_type, patterns in PLATFORM_PATTERNS.items():
|
||||||
|
for pattern in patterns:
|
||||||
|
for check_url in all_urls:
|
||||||
|
if re.search(pattern, check_url):
|
||||||
|
return platform_type
|
||||||
|
|
||||||
|
return 'INSTITUTIONAL_WEBSITE'
|
||||||
|
|
||||||
|
|
||||||
|
def extract_collection_urls(links: list[str], base_url: str) -> list[str]:
    """Return up to 10 same-domain links that look like collection/catalog pages.

    A link qualifies when its netloc overlaps the base URL's netloc and its
    path matches one of the Dutch/English collection patterns. Order of the
    input list is preserved; duplicates are dropped.
    """
    patterns = [
        r'/collectie', r'/collection', r'/catalogus', r'/catalog',
        r'/zoeken', r'/search', r'/beeldbank', r'/inventaris',
        r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen',
    ]

    base_domain = urlparse(base_url).netloc
    found: list[str] = []

    for candidate in links:
        try:
            netloc = urlparse(candidate).netloc
            # Substring test in both directions tolerates www./subdomain variants.
            if not (base_domain in netloc or netloc in base_domain):
                continue
            lowered = candidate.lower()
            if any(re.search(p, lowered) for p in patterns) and candidate not in found:
                found.append(candidate)
        except Exception:
            continue

    return found[:10]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict[str, str]]:
    """Return up to 5 known external heritage platforms linked from the page.

    Matches outbound links against a fixed table of Dutch aggregators and
    portals; each external domain is reported at most once.
    """
    known_platforms: dict[str, dict[str, str]] = {
        'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'},
        'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'},
        'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'},
        'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'},
        'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'},
        'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'},
        'archiefweb.eu': {'name': 'Archiefweb', 'type': 'DIGITAL_ARCHIVE'},
    }

    base_domain = urlparse(base_url).netloc
    results: list[dict[str, str]] = []
    seen: set[str] = set()

    for candidate in links:
        try:
            domain = urlparse(candidate).netloc.replace('www.', '')

            # Internal links are not auxiliary platforms.
            if base_domain in domain or domain in base_domain:
                continue

            for needle, info in known_platforms.items():
                if needle in domain and domain not in seen:
                    seen.add(domain)
                    results.append({
                        'platform_name': info['name'],
                        'platform_url': candidate,
                        'platform_type': info['type'],
                        'integration_type': 'external_aggregator',
                    })
                    break
        except Exception:
            continue

    return results[:5]
|
||||||
|
|
||||||
|
|
||||||
|
def is_generic_title(title: str | None) -> bool:
|
||||||
|
"""Check if a title is too generic to use as platform name."""
|
||||||
|
generic_patterns = [
|
||||||
|
'home', 'homepage', 'welkom', 'welcome', 'startpagina',
|
||||||
|
'index', 'main', 'website', 'webpagina', 'web page',
|
||||||
|
]
|
||||||
|
if not title:
|
||||||
|
return True
|
||||||
|
title_lower = title.lower().strip()
|
||||||
|
for pattern in generic_patterns:
|
||||||
|
if title_lower == pattern or title_lower == f"{pattern} -" or title_lower.startswith(f"{pattern} |"):
|
||||||
|
return True
|
||||||
|
return len(title) < 3
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_with_httpx(url: str, client: httpx.Client, timeout: float = 30.0) -> dict[str, Any] | None:
    """Fetch *url* with httpx and parse page metadata with BeautifulSoup.

    Returns a dict that either contains the extracted metadata
    ('status_code', 'final_url', 'title', optional 'description', 'og_title',
    'og_image', 'og_site_name', 'language', 'favicon', and up to 100 'links')
    or, on any failure, an 'error' message plus a 'status_code' (None when no
    HTTP response was received). Callers detect failure via the 'error' key.
    """
    try:
        response = client.get(url, timeout=timeout, follow_redirects=True)

        # Non-200 responses are reported as errors, not parsed.
        if response.status_code != 200:
            return {'error': f'HTTP {response.status_code}', 'status_code': response.status_code}

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # 'final_url' records the post-redirect URL actually fetched.
        metadata: dict[str, Any] = {
            'status_code': response.status_code,
            'final_url': str(response.url),
        }

        # Title
        title_tag = soup.find('title')
        metadata['title'] = title_tag.get_text(strip=True) if title_tag else None

        # Meta tags: str() coercion guards against bs4 returning list values
        # for multi-valued attributes.
        for meta in soup.find_all('meta'):
            name = str(meta.get('name', '')).lower()
            prop = str(meta.get('property', '')).lower()
            content = str(meta.get('content', ''))

            if name == 'description' or prop == 'og:description':
                # og:description overrides a plain meta description; a plain
                # description is only kept if nothing was stored yet.
                if 'description' not in metadata or prop == 'og:description':
                    metadata['description'] = content
            elif prop == 'og:title':
                metadata['og_title'] = content
            elif prop == 'og:image':
                # Resolve relative image URLs against the requested page.
                metadata['og_image'] = urljoin(url, content) if content else None
            elif prop == 'og:site_name':
                metadata['og_site_name'] = content
            elif name == 'language' or str(meta.get('http-equiv', '')).lower() == 'content-language':
                # Keep only the primary subtag of e.g. "nl-NL,en".
                metadata['language'] = content.split(',')[0].split('-')[0]

        # Detect language from html tag (takes precedence over meta tags).
        html_tag = soup.find('html')
        if html_tag:
            lang_attr = html_tag.get('lang')
            if lang_attr:
                lang_str = str(lang_attr) if not isinstance(lang_attr, list) else str(lang_attr[0])
                metadata['language'] = lang_str.split('-')[0]

        # Favicon: first <link> whose rel contains "icon". bs4 may return the
        # rel attribute as a list, a string, or None — normalize to a string.
        for link in soup.find_all('link'):
            rel = link.get('rel')
            if rel is None:
                rel = []
            if isinstance(rel, list):
                rel_str = ' '.join(str(r) for r in rel)
            else:
                rel_str = str(rel)
            if 'icon' in rel_str.lower():
                href = link.get('href')
                if href:
                    metadata['favicon'] = urljoin(url, str(href))
                    break

        # Extract links: absolute and root-relative hrefs only, resolved
        # against the page URL, de-duplicated, input order preserved.
        links: list[str] = []
        for a in soup.find_all('a', href=True):
            href = str(a['href'])
            if href.startswith('http') or href.startswith('/'):
                full_url = urljoin(url, href)
                if full_url not in links:
                    links.append(full_url)

        metadata['links'] = links[:100]  # Limit to 100 links

        return metadata

    # Map transport failures to error dicts so the caller never sees raises.
    except httpx.TimeoutException:
        return {'error': 'Timeout', 'status_code': None}
    except httpx.ConnectError as e:
        return {'error': f'Connection error: {e}', 'status_code': None}
    except httpx.HTTPError as e:
        return {'error': f'HTTP error: {e}', 'status_code': None}
    except Exception as e:
        return {'error': f'Exception: {e}', 'status_code': None}
|
||||||
|
|
||||||
|
|
||||||
|
def transform_to_platform_v2(scrape_result: dict[str, Any], source_url: str, org_name: str) -> dict[str, Any]:
    """Build a digital_platform_v2 dict from a scrape result.

    Picks the best available title, derives a platform id from the domain,
    classifies the platform type, and attaches collection / auxiliary /
    navigation link sections when present.
    """
    page_links: list[str] = scrape_result.get('links', [])
    raw_title = scrape_result.get('title', '') or ''

    # Title preference: og:title, og:site_name, then the page title cut at a
    # " - " or " | " separator; the organisation name is the last resort.
    candidates: list[str | None] = [
        scrape_result.get('og_title'),
        scrape_result.get('og_site_name'),
        raw_title.split(' - ')[0].strip() if raw_title else None,
        raw_title.split(' | ')[0].strip() if raw_title else None,
    ]
    title = next(
        (c for c in candidates if c and not is_generic_title(c)),
        org_name,
    )

    # Platform id derives from the domain, e.g. "primary_website_example_nl".
    domain_slug = urlparse(source_url).netloc.replace('www.', '').replace('.', '_')

    # Avoid "X Website Website" when the title already mentions "website".
    platform_name = title if 'website' in title.lower() else f"{title} Website"

    result: dict[str, Any] = {
        'transformation_metadata': {
            'transformed_from': 'httpx_beautifulsoup',
            'transformation_date': datetime.now(timezone.utc).isoformat(),
            'transformation_version': '2.1',
            'source_status_code': scrape_result.get('status_code', 200),
        },
        'primary_platform': {
            'platform_id': f"primary_website_{domain_slug}",
            'platform_name': platform_name,
            'platform_url': scrape_result.get('final_url', source_url),
            'platform_type': detect_platform_type(source_url, page_links),
            'description': scrape_result.get('description', ''),
            'language': scrape_result.get('language', 'nl'),
            'og_image': scrape_result.get('og_image'),
            'favicon': scrape_result.get('favicon'),
        },
    }

    collection_urls = extract_collection_urls(page_links, source_url)
    if collection_urls:
        result['primary_platform']['collection_urls'] = collection_urls

    auxiliary = extract_auxiliary_platforms(page_links, source_url)
    if auxiliary:
        result['auxiliary_platforms'] = auxiliary

    # Keep a sample of up to 20 same-domain links for navigation context.
    base_domain = urlparse(source_url).netloc
    internal = [link for link in page_links if base_domain in urlparse(link).netloc][:20]
    if internal:
        result['navigation_links'] = internal

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def update_custodian_file(filepath: Path, platform_v2: dict[str, Any]) -> bool:
    """Write *platform_v2* into the 'digital_platform_v2' key of a custodian YAML.

    Returns True on success; on any failure prints a message and returns False.
    """
    try:
        with open(filepath, 'r') as fh:
            doc = yaml.safe_load(fh)
        # An empty YAML file parses to None; start from a fresh mapping then.
        if doc is None:
            doc = {}

        doc['digital_platform_v2'] = platform_v2

        with open(filepath, 'w') as fh:
            yaml.dump(doc, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
        return True
    except Exception as e:
        print(f" Error updating {filepath}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_failed_urls() -> list[tuple[str, str]]:
    """Read FAILED_URLS_FILE and return (filename, url) pairs.

    Each useful line is tab-separated "custodian-filename<TAB>url"; lines
    without a tab are silently skipped.
    """
    pairs: list[tuple[str, str]] = []
    with open(FAILED_URLS_FILE, 'r') as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if '\t' not in stripped:
                continue
            name, target = stripped.split('\t', 1)
            pairs.append((name, target))
    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def get_org_name(filepath: Path) -> str:
    """Extract the organisation name from a custodian YAML file.

    Tries, in order: original_entry.organisatie, custodian_name
    (emic_name then preferred_name), then a top-level 'name'. Falls back
    to the last hyphen-separated part of the file stem; on any error the
    bare stem is returned.
    """
    try:
        with open(filepath, 'r') as fh:
            doc = yaml.safe_load(fh)

        if doc:
            if 'original_entry' in doc and doc['original_entry'].get('organisatie'):
                return str(doc['original_entry']['organisatie'])
            if 'custodian_name' in doc:
                naming = doc['custodian_name']
                return str(naming.get('emic_name', '') or naming.get('preferred_name', ''))
            if 'name' in doc:
                return str(doc['name'])

        # Fallback: the filename stem's last hyphen-separated component.
        stem = filepath.stem
        pieces = stem.split('-')
        return pieces[-1] if pieces else stem

    except Exception:
        return filepath.stem
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: scrape failed URLs with httpx and update custodian YAMLs.

    Iterates the (filename, url) pairs from FAILED_URLS_FILE, scrapes each URL,
    transforms the result to digital_platform_v2, and writes it into the
    corresponding custodian file. Prints progress every 50 URLs and a final
    success/skip/fail tally.
    """
    parser = argparse.ArgumentParser(description='Batch web scraper using httpx + BeautifulSoup')
    parser.add_argument('--start', type=int, default=0, help='Starting index')
    parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default 1)')
    parser.add_argument('--timeout', type=float, default=30.0, help='Request timeout in seconds (default 30)')
    # BUG FIX: action='store_true' combined with default=True made the flag
    # impossible to disable. BooleanOptionalAction keeps --skip-existing
    # working and adds --no-skip-existing to turn the check off.
    parser.add_argument('--skip-existing', action=argparse.BooleanOptionalAction, default=True,
                        help='Skip files that already have digital_platform_v2')
    args = parser.parse_args()

    # NOTE: the in-function BeautifulSoup import check that used to live here
    # was dead code — bs4 is imported at module level, so main() is
    # unreachable when it is missing.

    # Load URLs
    all_urls = load_failed_urls()
    print(f"Loaded {len(all_urls)} failed URLs from {FAILED_URLS_FILE}")

    # Slice based on start and limit
    if args.limit > 0:
        urls_to_process = all_urls[args.start:args.start + args.limit]
    else:
        urls_to_process = all_urls[args.start:]

    print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})")

    if args.dry_run:
        print("\n[DRY RUN MODE - No changes will be made]")
        # BUG FIX: the filename was missing from this message.
        for filename, url in urls_to_process[:10]:
            print(f"  Would scrape: {filename} -> {url}")
        if len(urls_to_process) > 10:
            print(f"  ... and {len(urls_to_process) - 10} more")
        return

    # Create HTTP client with browser-like headers (some sites block default UAs)
    client = httpx.Client(
        headers={
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'nl,en-US;q=0.9,en;q=0.8',
        },
        follow_redirects=True,
        timeout=args.timeout,
    )

    success_count = 0
    skip_count = 0
    fail_count = 0

    try:
        for i, (filename, url) in enumerate(urls_to_process):
            filepath = CUSTODIAN_DIR / filename

            # BUG FIX: the filename was missing from this message.
            print(f"\n[{i+1}/{len(urls_to_process)}] {filename}")
            print(f"  URL: {url}")

            if not filepath.exists():
                print(f"  SKIP: File not found")
                skip_count += 1
                continue

            # Cheap substring check avoids a full YAML parse per file.
            if args.skip_existing:
                with open(filepath, 'r') as f:
                    content = f.read()
                if 'digital_platform_v2:' in content:
                    print(f"  SKIP: Already has digital_platform_v2")
                    skip_count += 1
                    continue

            # Get org name for platform naming
            org_name = get_org_name(filepath)

            # Scrape URL
            result = scrape_with_httpx(url, client, timeout=args.timeout)

            if result and 'error' not in result:
                # Transform to platform_v2 and persist it
                platform_v2 = transform_to_platform_v2(result, url, org_name)

                if update_custodian_file(filepath, platform_v2):
                    success_count += 1
                    platform_name = platform_v2['primary_platform']['platform_name']
                    print(f"  SUCCESS: {platform_name}")
                else:
                    fail_count += 1
            else:
                fail_count += 1
                error_msg = result.get('error', 'Unknown error') if result else 'No result'
                print(f"  FAILED: {error_msg}")

            # Rate limiting — be polite to the target servers
            if args.delay > 0:
                time.sleep(args.delay)

            # Progress update every 50 URLs
            if (i + 1) % 50 == 0:
                print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, skip={skip_count}, fail={fail_count}) ===\n")

    finally:
        client.close()

    print(f"\n=== Final Results ===")
    print(f"Success: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Failed: {fail_count}")
    print(f"Total: {len(urls_to_process)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
213
scripts/detect_name_mismatch.py
Normal file
213
scripts/detect_name_mismatch.py
Normal file
|
|
@ -0,0 +1,213 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Detect name mismatches in LinkedIn entity profiles.
|
||||||
|
|
||||||
|
Compares the LinkedIn URL slug with the assigned name to find:
|
||||||
|
1. Profiles where the name doesn't match the slug at all
|
||||||
|
2. Patterns of repeated wrong names (like "Simon Kemper")
|
||||||
|
3. Other potential filler/hallucinated names
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from urllib.parse import unquote
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
def normalize_name(name: str) -> str:
    """Normalize a name for fuzzy comparison.

    URL-decodes, strips diacritics (via NFD decomposition), lowercases,
    drops trailing digit/hyphen/underscore runs, and collapses separators
    and whitespace to single spaces.
    """
    if not name:
        return ""

    decoded = unquote(name)
    # NFD splits accented characters into base + combining mark ('Mn'),
    # which we then drop to remove diacritics.
    decomposed = unicodedata.normalize('NFD', decoded)
    ascii_ish = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    lowered = ascii_ish.lower()
    # Strip trailing numeric/separator suffixes like "-2" or "_123".
    trimmed = re.sub(r'[-_\d]+$', '', lowered)
    spaced = re.sub(r'[-_]+', ' ', trimmed)
    return ' '.join(spaced.split())
|
||||||
|
|
||||||
|
def extract_name_from_slug(slug: str) -> str:
    """Derive a normalized human name from a LinkedIn slug / entity filename."""
    decoded = unquote(slug)
    # Strip the export timestamp suffix, e.g. "_20251214T115050Z.json".
    decoded = re.sub(r'_\d{8}T\d{6}Z\.json$', '', decoded)
    # Strip trailing hex IDs (6+ chars) and plain numeric suffixes.
    decoded = re.sub(r'[-_][\da-f]{6,}$', '', decoded)
    decoded = re.sub(r'[-_]\d+$', '', decoded)
    return normalize_name(decoded)
|
||||||
|
|
||||||
|
def names_match(slug_name: str, profile_name: str) -> bool:
    """Return True when the slug-derived name plausibly matches the profile name.

    Accepts an exact normalized match, a >=50% word overlap (when the slug
    has at least two words), or — as a weak fallback — a matching first name.
    """
    if not slug_name or not profile_name:
        return False

    slug_norm = normalize_name(slug_name)
    profile_norm = normalize_name(profile_name)

    if slug_norm == profile_norm:
        return True

    slug_words = set(slug_norm.split())
    profile_words = set(profile_norm.split())

    # With two or more slug words, accept when at least half appear in the profile.
    if len(slug_words) >= 2:
        if len(slug_words & profile_words) >= len(slug_words) * 0.5:
            return True

    # Last resort: a matching first name counts as a (weak) match.
    slug_parts = slug_norm.split()
    profile_parts = profile_norm.split()
    if slug_parts and profile_parts and slug_parts[0] == profile_parts[0]:
        return True

    return False
|
||||||
|
|
||||||
|
def analyze_entity_files(entity_dir: Path):
    """Scan all entity JSON files in *entity_dir* for slug/profile name mismatches.

    Returns a dict with:
        total_files:    number of JSON files examined
        fallback_files: files whose extraction_method is 'fallback_basic'
        mismatches:     records for files whose profile name does not match
                        the name implied by the filename slug
        name_counter:   Counter of profile-name frequencies (filler-name signal)
        files_by_name:  mapping of profile name -> list of filenames
    """
    mismatches = []
    name_counter = Counter()
    files_by_name = defaultdict(list)
    total_files = 0
    fallback_files = 0

    for filepath in entity_dir.glob("*.json"):
        total_files += 1
        filename = filepath.name

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            # BUG FIX: the message previously omitted which file failed to parse.
            print(f"Error reading {filepath}: {e}")
            continue

        # The profile name lives under one of two keys depending on the
        # extraction path that produced the file.
        profile_name = None
        if 'profile_data' in data and 'name' in data['profile_data']:
            profile_name = data['profile_data']['name']
        elif 'source_staff_info' in data and 'name' in data['source_staff_info']:
            profile_name = data['source_staff_info']['name']

        if not profile_name:
            continue

        # Track all names for frequency analysis (repeated names suggest filler data).
        name_counter[profile_name] += 1
        files_by_name[profile_name].append(filename)

        # Check if this is a fallback file
        extraction_method = data.get('extraction_metadata', {}).get('extraction_method', '')
        if extraction_method == 'fallback_basic':
            fallback_files += 1

        # Extract name from slug and compare with the assigned profile name
        slug_name = extract_name_from_slug(filename)

        if not names_match(slug_name, profile_name):
            mismatches.append({
                'filename': filename,
                'slug_name': slug_name,
                'profile_name': profile_name,
                'extraction_method': extraction_method,
                'linkedin_url': data.get('extraction_metadata', {}).get('linkedin_url', '')
            })

    return {
        'total_files': total_files,
        'fallback_files': fallback_files,
        'mismatches': mismatches,
        'name_counter': name_counter,
        'files_by_name': files_by_name
    }
|
||||||
|
|
||||||
|
def main():
    """Run the mismatch analysis and emit console, CSV, and JSON reports."""
    import csv  # Local import: only needed for the CSV report writer below.

    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()

    results = analyze_entity_files(entity_dir)

    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(results['mismatches'])}")
    print()

    # Names appearing suspiciously often are likely filler/hallucinated names.
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [(name, count) for name, count in results['name_counter'].most_common(50) if count > 5]

    for name, count in frequent_names:
        # Count how many of these occurrences were also flagged as mismatches.
        mismatch_count = sum(1 for m in results['mismatches'] if m['profile_name'] == name)
        print(f"  '{name}': {count} occurrences ({mismatch_count} are mismatches)")

    print()
    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)

    # Group mismatches by profile_name to surface repeated wrong names.
    mismatch_by_name = defaultdict(list)
    for m in results['mismatches']:
        mismatch_by_name[m['profile_name']].append(m)

    # Sort by how many slugs each wrong name was assigned to.
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))

    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f"  Slug: {item['slug_name']}")
            print(f"    File: {item['filename']}")
            print(f"    Method: {item['extraction_method']}")
            print()

    # BUG FIX: the CSV was previously written with hand-rolled f-string
    # quoting, which produces a malformed file for names containing '"',
    # commas, or newlines. Use the stdlib csv module for correct escaping.
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'slug_name', 'profile_name', 'extraction_method', 'linkedin_url'])
        for m in results['mismatches']:
            writer.writerow([m['filename'], m['slug_name'], m['profile_name'],
                             m['extraction_method'], m['linkedin_url']])

    print(f"\nDetailed report saved to: {csv_path}")

    # Also output JSON for programmatic use
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(results['mismatches']),
            'mismatches_by_name': {name: len(items) for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count) for name, count in results['name_counter'].most_common(100)],
            'mismatches': results['mismatches']
        }, f, indent=2, ensure_ascii=False)

    print(f"JSON report saved to: {json_path}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
666
scripts/enrich_dutch_custodians_crawl4ai.py
Executable file
666
scripts/enrich_dutch_custodians_crawl4ai.py
Executable file
|
|
@ -0,0 +1,666 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Enrich Dutch custodian YAML files with web data using Crawl4AI (free, local).
|
||||||
|
|
||||||
|
This script replaces the Firecrawl-based enrichment with Crawl4AI which:
|
||||||
|
1. Runs locally using Playwright (no API costs)
|
||||||
|
2. Extracts links, metadata, and content with XPath provenance
|
||||||
|
3. Detects APIs, catalogs, and metadata standards
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/enrich_dutch_custodians_crawl4ai.py [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--dry-run Show what would be enriched without modifying files
|
||||||
|
--limit N Process only first N files (for testing)
|
||||||
|
--start-index N Start from index N (for resuming)
|
||||||
|
--resume Resume from last checkpoint
|
||||||
|
--force Re-enrich even if already has crawl4ai_enrichment
|
||||||
|
--file PATH Process a single specific file
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# Crawl4AI imports
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
# Load environment variables from .env file
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
||||||
|
CHECKPOINT_FILE = CUSTODIAN_DIR / ".crawl4ai_enrichment_checkpoint.json"
|
||||||
|
|
||||||
|
# Rate limiting - be nice to websites even though we're local
|
||||||
|
REQUEST_DELAY = 2.0 # seconds between requests
|
||||||
|
|
||||||
|
# Digital platform detection patterns
|
||||||
|
API_ENDPOINT_PATTERNS = [
|
||||||
|
r'/oai[-_]?pmh',
|
||||||
|
r'/api/',
|
||||||
|
r'/rest/',
|
||||||
|
r'/sparql',
|
||||||
|
r'/graphql',
|
||||||
|
r'/iiif/',
|
||||||
|
r'/sru',
|
||||||
|
r'/z39\.50',
|
||||||
|
r'/opensearch',
|
||||||
|
]
|
||||||
|
|
||||||
|
CATALOG_PATTERNS = [
|
||||||
|
r'/catalogu[es]?(?:/|\?|$)',
|
||||||
|
r'/collecti[eo]n?[s]?(?:/|\?|$)',
|
||||||
|
r'/archie[fv](?:/|\?|$)',
|
||||||
|
r'/beeldbank(?:/|\?|$)',
|
||||||
|
r'/zoeken(?:/|\?|$)',
|
||||||
|
r'/search(?:/|\?|$)',
|
||||||
|
r'/discover(?:/|\?|$)',
|
||||||
|
r'/browse(?:/|\?|$)',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Dutch-specific catalog type detection
|
||||||
|
CATALOG_TYPE_PATTERNS = {
|
||||||
|
'beeldbank': {
|
||||||
|
'patterns': [r'/beeldbank', r'/beeld', r'/images', r'/foto'],
|
||||||
|
'label': 'Image Collection',
|
||||||
|
'description_nl': 'Beeldbank met gedigitaliseerde foto\'s, kaarten en afbeeldingen',
|
||||||
|
},
|
||||||
|
'genealogie': {
|
||||||
|
'patterns': [r'/genealogie', r'/stamboom', r'/persons', r'/akten'],
|
||||||
|
'label': 'Genealogy Records',
|
||||||
|
'description_nl': 'Genealogische bronnen en persoonsgegevens',
|
||||||
|
},
|
||||||
|
'archieven': {
|
||||||
|
'patterns': [r'/archie[fv]', r'/inventaris', r'/toegangen', r'/finding'],
|
||||||
|
'label': 'Archive Finding Aids',
|
||||||
|
'description_nl': 'Archiefinventarissen en toegangen',
|
||||||
|
},
|
||||||
|
'collectie': {
|
||||||
|
'patterns': [r'/collectie', r'/collection', r'/object'],
|
||||||
|
'label': 'Collection Portal',
|
||||||
|
'description_nl': 'Collectieportaal met objecten en kunstwerken',
|
||||||
|
},
|
||||||
|
'kranten': {
|
||||||
|
'patterns': [r'/kranten', r'/newspaper', r'/periodiek'],
|
||||||
|
'label': 'Newspaper Archive',
|
||||||
|
'description_nl': 'Gedigitaliseerde kranten en periodieken',
|
||||||
|
},
|
||||||
|
'kaarten': {
|
||||||
|
'patterns': [r'/kaart', r'/map', r'/cartogra'],
|
||||||
|
'label': 'Map Collection',
|
||||||
|
'description_nl': 'Historische kaarten en cartografisch materiaal',
|
||||||
|
},
|
||||||
|
'bibliotheek': {
|
||||||
|
'patterns': [r'/catalogu', r'/biblio', r'/library', r'/boek'],
|
||||||
|
'label': 'Library Catalog',
|
||||||
|
'description_nl': 'Bibliotheekcatalogus',
|
||||||
|
},
|
||||||
|
'zoeken': {
|
||||||
|
'patterns': [r'/zoeken', r'/search', r'/discover', r'/browse'],
|
||||||
|
'label': 'Search Interface',
|
||||||
|
'description_nl': 'Algemene zoekinterface',
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
CMS_INDICATORS = {
|
||||||
|
'atlantis': ['atlantis', 'picturae'],
|
||||||
|
'mais_flexis': ['mais-flexis', 'mais flexis', 'de ree'],
|
||||||
|
'adlib': ['adlib', 'axiell'],
|
||||||
|
'collective_access': ['collectiveaccess', 'collective access'],
|
||||||
|
'archivematica': ['archivematica'],
|
||||||
|
'archivesspace': ['archivesspace'],
|
||||||
|
'atom': ['accesstomemory', 'atom'],
|
||||||
|
'omeka': ['omeka'],
|
||||||
|
'contentdm': ['contentdm'],
|
||||||
|
'dspace': ['dspace'],
|
||||||
|
'islandora': ['islandora'],
|
||||||
|
'memorix': ['memorix'],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Metadata standards detection patterns with regex word boundaries
|
||||||
|
METADATA_STANDARDS_PATTERNS = [
|
||||||
|
(r'\bdublin\s+core\b', 'Dublin Core', True),
|
||||||
|
(r'\bdc:', 'Dublin Core', True),
|
||||||
|
(r'\bdcterms\b', 'Dublin Core', True),
|
||||||
|
(r'\bmarc\s*21\b', 'MARC21', True),
|
||||||
|
(r'\bmarc21\b', 'MARC21', True),
|
||||||
|
(r'\bead\b', 'EAD', True),
|
||||||
|
(r'encoded\s+archival\s+description', 'EAD', True),
|
||||||
|
(r'\bead\s*2002\b', 'EAD', True),
|
||||||
|
(r'\bead3\b', 'EAD', True),
|
||||||
|
(r'\bmets\b', 'METS', True),
|
||||||
|
(r'metadata\s+encoding\s+and\s+transmission', 'METS', True),
|
||||||
|
(r'\bmods\b', 'MODS', True),
|
||||||
|
(r'metadata\s+object\s+description', 'MODS', True),
|
||||||
|
(r'\blido\b', 'LIDO', True),
|
||||||
|
(r'lightweight\s+information\s+describing', 'LIDO', True),
|
||||||
|
(r'\bcidoc[-\s]?crm\b', 'CIDOC-CRM', True),
|
||||||
|
(r'\bschema\.org\b', 'Schema.org', True),
|
||||||
|
(r'\bschema:', 'Schema.org', True),
|
||||||
|
(r'\bric[-\s]?o\b', 'RiC-O', True),
|
||||||
|
(r'records\s+in\s+contexts', 'RiC-O', True),
|
||||||
|
(r'\bpremis\b', 'PREMIS', True),
|
||||||
|
(r'preservation\s+metadata', 'PREMIS', True),
|
||||||
|
(r'\bbibframe\b', 'BIBFRAME', True),
|
||||||
|
(r'\biiif\b', 'IIIF', True),
|
||||||
|
(r'image\s+interoperability\s+framework', 'IIIF', True),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Dutch archive platform domains to detect
|
||||||
|
DUTCH_ARCHIVE_PLATFORMS = [
|
||||||
|
'archieven.nl',
|
||||||
|
'memorix.nl',
|
||||||
|
'archiefweb.eu',
|
||||||
|
'atlantisdigitaal.nl',
|
||||||
|
'picturae.nl',
|
||||||
|
'mais-flexis.nl',
|
||||||
|
'delpher.nl',
|
||||||
|
'geheugen.nl',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_xpath(element, tree) -> str:
    """Build an absolute XPath expression locating *element* in its tree.

    Walks from the element up to the root, adding a positional predicate
    (``tag[n]``) only when the parent has more than one child with the
    same tag. The *tree* argument is unused but kept for interface
    compatibility with callers.
    """
    segments = []
    node = element
    while node is not None:
        parent = node.getparent()
        if parent is None:
            # Root element: no siblings, plain tag name.
            segments.append(node.tag)
        else:
            same_tag = [child for child in parent if child.tag == node.tag]
            if len(same_tag) > 1:
                position = same_tag.index(node) + 1
                segments.append(f'{node.tag}[{position}]')
            else:
                segments.append(node.tag)
        node = parent
    segments.reverse()
    return '/' + '/'.join(segments)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url(url: str) -> str:
    """Strip tracking/session query parameters and canonicalise order.

    A query key is dropped when it starts with any known noise prefix
    (pagination, sorting, session tokens, ``utm_*``, click IDs); the
    surviving parameters are re-emitted in sorted key order. URLs that
    are empty or have no query string are returned unchanged.
    """
    if not url:
        return url

    parsed = urlparse(url)
    if not parsed.query:
        return url

    # Prefixes of parameters that carry no identity information.
    noise_prefixes = ['sort', 'order', 'view', 'mode', 'ss', 'page', 'offset',
                      'limit', 'random', 'session', 'sid', 'token', 'ref',
                      'utm_', 'fbclid', 'gclid']

    kept = {}
    for raw_pair in parsed.query.split('&'):
        key, _, value = raw_pair.partition('=')
        if not any(key.startswith(prefix) for prefix in noise_prefixes):
            kept[key] = value

    rebuilt = '&'.join(f'{k}={v}' if v else k for k, v in sorted(kept.items()))
    return urlunparse(parsed._replace(query=rebuilt))
|
||||||
|
|
||||||
|
|
||||||
|
def detect_catalog_type(url: str) -> dict | None:
    """Classify a catalog URL into one of the known Dutch catalog types.

    Checks the URL (lower-cased) against each pattern group in
    CATALOG_TYPE_PATTERNS, in declaration order, and returns a dict with
    ``type``, ``label`` and ``description_nl`` for the first group that
    matches, or None when nothing matches.
    """
    lowered = url.lower()
    for type_key, type_info in CATALOG_TYPE_PATTERNS.items():
        if any(re.search(pattern, lowered) for pattern in type_info['patterns']):
            return {
                'type': type_key,
                'label': type_info['label'],
                'description_nl': type_info['description_nl'],
            }
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def detect_metadata_standards(content: str) -> list[str]:
    """Return the sorted set of metadata standards referenced in *content*.

    Each entry in METADATA_STANDARDS_PATTERNS is a (pattern, standard,
    use_regex) triple: regex patterns are matched case-insensitively
    (with word boundaries where the pattern defines them), plain
    patterns are matched as substrings. Empty content yields [].
    """
    if not content:
        return []

    haystack = content.lower()
    found: set[str] = set()
    for pattern, standard_name, is_regex in METADATA_STANDARDS_PATTERNS:
        if is_regex:
            if re.search(pattern, haystack, re.IGNORECASE):
                found.add(standard_name)
        elif pattern.lower() in haystack:
            found.add(standard_name)

    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_cms(content: str) -> str | None:
    """Identify a collection-management system mentioned in page content.

    Scans the lower-cased content for each CMS's indicator substrings
    (CMS_INDICATORS, in declaration order) and returns the first CMS key
    with a hit, or None when no known system is recognised.
    """
    if not content:
        return None

    haystack = content.lower()
    for cms_name, markers in CMS_INDICATORS.items():
        if any(marker in haystack for marker in markers):
            return cms_name
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_website_url(entry: dict) -> str | None:
|
||||||
|
"""Extract website URL from custodian entry."""
|
||||||
|
# Check various possible locations for website
|
||||||
|
if 'website' in entry:
|
||||||
|
return entry['website']
|
||||||
|
|
||||||
|
# Check in enrichment data
|
||||||
|
for enrichment_key in ['zcbs_enrichment', 'google_maps_enrichment', 'wikidata_enrichment']:
|
||||||
|
if enrichment_key in entry:
|
||||||
|
enrichment = entry[enrichment_key]
|
||||||
|
if isinstance(enrichment, dict):
|
||||||
|
if 'website' in enrichment:
|
||||||
|
return enrichment['website']
|
||||||
|
if 'url' in enrichment:
|
||||||
|
return enrichment['url']
|
||||||
|
|
||||||
|
# Check identifiers
|
||||||
|
if 'identifiers' in entry:
|
||||||
|
for identifier in entry.get('identifiers', []):
|
||||||
|
if isinstance(identifier, dict):
|
||||||
|
if identifier.get('identifier_scheme') == 'Website':
|
||||||
|
return identifier.get('identifier_value')
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def crawl_website(crawler: AsyncWebCrawler, url: str) -> dict:
    """
    Crawl a website and extract structured data with XPath provenance.

    Runs a single crawl4ai fetch with caching bypassed, then re-parses
    the raw HTML with lxml so every extracted link carries the XPath of
    the anchor it came from (crawl4ai's own link lists have no
    provenance).

    Returns a dict with:
    - success: bool
    - title: str
    - description: str
    - html: str (raw HTML for further processing)
    - markdown: str
    - links: list of dicts with href, text, xpath
    - metadata: dict of og/meta tags
    - error: str (if failed)
    """
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=False,
        # Wait for page to fully load
        wait_until="networkidle",
        page_timeout=30000,
    )

    try:
        result = await crawler.arun(url=url, config=config)

        if not result.success:
            return {
                'success': False,
                'error': f'Crawl failed with status {result.status_code}',
                'status_code': result.status_code,
            }

        # Parse HTML with lxml to extract XPaths
        # XPath extraction is best-effort: a malformed page logs a warning
        # and the crawl still succeeds with links_with_xpath == [].
        links_with_xpath = []
        if result.html:
            try:
                tree = etree.HTML(result.html)
                link_elements = tree.xpath('//a[@href]')

                for link_el in link_elements:
                    href = link_el.get('href', '')
                    text = ''.join(link_el.itertext()).strip()
                    xpath = get_xpath(link_el, tree)

                    # Skip empty links and javascript
                    if href and not href.startswith(('javascript:', '#', 'mailto:', 'tel:')):
                        links_with_xpath.append({
                            'href': href,
                            'text': text[:200] if text else '',  # Truncate long text
                            'xpath': xpath,
                        })
            except Exception as e:
                logger.warning(f"Error parsing HTML for XPath extraction: {e}")

        # Also include crawl4ai's extracted links for completeness
        internal_links = result.links.get('internal', []) if result.links else []
        external_links = result.links.get('external', []) if result.links else []

        return {
            'success': True,
            'status_code': result.status_code,
            'title': result.metadata.get('title', '') if result.metadata else '',
            'description': result.metadata.get('description', '') if result.metadata else '',
            'html': result.html,
            'markdown': result.markdown.raw_markdown if result.markdown else '',
            'links_with_xpath': links_with_xpath,
            # crawl4ai link entries are dicts; non-dict entries are dropped.
            'internal_links': [l.get('href', '') for l in internal_links if isinstance(l, dict)],
            'external_links': [l.get('href', '') for l in external_links if isinstance(l, dict)],
            'metadata': result.metadata or {},
        }

    except Exception as e:
        # Network/browser failures are reported to the caller as a
        # failed-crawl dict rather than raised, so batch runs continue.
        logger.error(f"Error crawling {url}: {e}")
        return {
            'success': False,
            'error': str(e),
        }
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_crawl_results(crawl_data: dict, base_url: str) -> dict:
    """
    Analyze crawl results to extract APIs, catalogs, and metadata standards.

    Takes the dict produced by crawl_website() and derives an enrichment
    record: detected API endpoints, catalog URLs (with type labels and
    XPath provenance where available), known external archive platforms,
    metadata standards, a CMS guess, and Open Graph tags. Keys are only
    added when something was actually detected.

    Returns enrichment dict ready to add to YAML.
    """
    enrichment = {
        'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
        'retrieval_agent': 'crawl4ai',
        'source_url': base_url,
        'status_code': crawl_data.get('status_code'),
    }

    # A failed crawl still produces a (minimal) enrichment record so the
    # YAML file records that an attempt was made.
    if not crawl_data.get('success'):
        enrichment['error'] = crawl_data.get('error', 'Unknown error')
        return enrichment

    # Basic metadata
    enrichment['title'] = crawl_data.get('title', '')
    enrichment['description'] = crawl_data.get('description', '')
    enrichment['links_count'] = len(crawl_data.get('links_with_xpath', []))

    # Collect all URLs for analysis
    all_urls = set()
    links_with_xpath = crawl_data.get('links_with_xpath', [])

    for link in links_with_xpath:
        href = link.get('href', '')
        if href:
            # Make absolute URL if relative
            if href.startswith('/'):
                parsed_base = urlparse(base_url)
                href = f"{parsed_base.scheme}://{parsed_base.netloc}{href}"
            all_urls.add(href)

    # Add internal/external links from crawl4ai
    for link in crawl_data.get('internal_links', []):
        if link:
            all_urls.add(link)
    for link in crawl_data.get('external_links', []):
        if link:
            all_urls.add(link)

    # Detect API endpoints
    # Each URL is tagged with at most one matching pattern (first wins).
    detected_apis = []
    for url in all_urls:
        url_lower = url.lower()
        for pattern in API_ENDPOINT_PATTERNS:
            if re.search(pattern, url_lower):
                detected_apis.append({
                    'url': normalize_url(url),
                    'pattern_matched': pattern,
                })
                break

    if detected_apis:
        enrichment['detected_api_endpoints'] = detected_apis

    # Detect catalog URLs with type classification
    detected_catalogs = []
    for url in all_urls:
        url_lower = url.lower()
        for pattern in CATALOG_PATTERNS:
            if re.search(pattern, url_lower):
                catalog_entry = {
                    'url': normalize_url(url),
                }
                catalog_type = detect_catalog_type(url)
                if catalog_type:
                    catalog_entry['type'] = catalog_type['type']
                    catalog_entry['label'] = catalog_type['label']

                # Find XPath for this link
                # Heuristic match: exact href (modulo trailing slash) or a
                # relative href that the absolute URL ends with.
                for link in links_with_xpath:
                    if link.get('href', '').rstrip('/') == url.rstrip('/') or \
                       (link.get('href', '').startswith('/') and url.endswith(link.get('href', ''))):
                        catalog_entry['xpath'] = link.get('xpath')
                        catalog_entry['link_text'] = link.get('text', '')
                        break

                detected_catalogs.append(catalog_entry)
                break

    if detected_catalogs:
        enrichment['detected_catalog_urls'] = detected_catalogs

    # Detect external archive platforms
    external_platforms = []
    for url in all_urls:
        url_lower = url.lower()
        for platform in DUTCH_ARCHIVE_PLATFORMS:
            if platform in url_lower:
                external_platforms.append({
                    'url': normalize_url(url),
                    'platform': platform,
                })
                break

    if external_platforms:
        enrichment['external_archive_platforms'] = external_platforms

    # Detect metadata standards from content
    # Handle None values explicitly to avoid string concatenation errors
    markdown = crawl_data.get('markdown') or ''
    title = crawl_data.get('title') or ''
    description = crawl_data.get('description') or ''
    content = f"{markdown} {title} {description}"
    standards = detect_metadata_standards(content)
    if standards:
        enrichment['detected_standards'] = standards

    # Detect CMS
    cms = detect_cms(content)
    if cms:
        enrichment['detected_cms'] = cms

    # Extract OG/meta tags of interest
    metadata = crawl_data.get('metadata', {})
    og_data = {}
    for key in ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name']:
        if key in metadata:
            og_data[key.replace('og:', '')] = metadata[key]
    if og_data:
        enrichment['open_graph'] = og_data

    return enrichment
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint() -> dict:
    """Return the saved progress checkpoint, or an empty dict if none exists."""
    if not CHECKPOINT_FILE.exists():
        return {}
    with open(CHECKPOINT_FILE, 'r') as handle:
        return json.load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(checkpoint: dict):
    """Persist the progress checkpoint so an interrupted run can resume."""
    with open(CHECKPOINT_FILE, 'w') as handle:
        json.dump(checkpoint, handle, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
async def process_single_file(
    crawler: AsyncWebCrawler,
    filepath: Path,
    dry_run: bool = False,
    force: bool = False,
) -> bool:
    """Process a single custodian YAML file.

    Loads the entry, finds its website URL, crawls it, analyzes the
    result, and writes the enrichment back into the same YAML file under
    the ``crawl4ai_enrichment`` key.

    Returns True on success or benign skip (already enriched, dry run),
    False when the file is empty, has no URL, or processing raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        if not entry:
            logger.warning(f"Empty file: {filepath}")
            return False

        # Check if already enriched
        if 'crawl4ai_enrichment' in entry and not force:
            logger.info(f"Skipping {filepath.name}: already has crawl4ai_enrichment")
            return True

        # Extract website URL
        website_url = extract_website_url(entry)
        if not website_url:
            logger.info(f"Skipping {filepath.name}: no website URL found")
            return False

        # Ensure URL has protocol
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url

        logger.info(f"Processing {filepath.name}: {website_url}")

        if dry_run:
            logger.info(f" -> DRY RUN: would crawl {website_url}")
            return True

        # Crawl the website
        crawl_data = await crawl_website(crawler, website_url)

        # Analyze results
        # Note: a failed crawl still yields an enrichment dict (with an
        # 'error' key), which is written back and counted as success here.
        enrichment = analyze_crawl_results(crawl_data, website_url)

        # Add enrichment to entry
        entry['crawl4ai_enrichment'] = enrichment

        # Write back to file
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        # Log summary
        apis_count = len(enrichment.get('detected_api_endpoints', []))
        catalogs_count = len(enrichment.get('detected_catalog_urls', []))
        platforms_count = len(enrichment.get('external_archive_platforms', []))
        logger.info(f" -> success: {apis_count} APIs, {catalogs_count} catalogs, {platforms_count} external platforms found")

        return True

    except Exception as e:
        # Keep the batch running: log the failure and report it via the
        # return value instead of raising.
        logger.error(f"Error processing {filepath}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """CLI entry point: enrich Dutch custodian YAML files via Crawl4AI.

    Supports a single-file mode (--file) and a batch mode over all
    NL-*.yaml files with start-index/limit windows, checkpoint-based
    resume, and per-request rate limiting.
    """
    parser = argparse.ArgumentParser(description='Enrich Dutch custodians with Crawl4AI')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be enriched')
    parser.add_argument('--limit', type=int, help='Process only first N files')
    parser.add_argument('--start-index', type=int, default=0, help='Start from index N')
    parser.add_argument('--resume', action='store_true', help='Resume from last checkpoint')
    parser.add_argument('--force', action='store_true', help='Re-enrich even if already enriched')
    parser.add_argument('--file', type=str, help='Process a single specific file')
    args = parser.parse_args()

    # Create logs directory
    logs_dir = Path(__file__).parent.parent / "logs"
    logs_dir.mkdir(exist_ok=True)

    # Add file handler for logging
    log_file = logs_dir / f"crawl4ai_enrichment_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    # Single file mode
    if args.file:
        filepath = Path(args.file)
        if not filepath.exists():
            logger.error(f"File not found: {filepath}")
            sys.exit(1)

        async with AsyncWebCrawler() as crawler:
            success = await process_single_file(crawler, filepath, args.dry_run, args.force)
            # Exit code mirrors the per-file result for shell scripting.
            sys.exit(0 if success else 1)

    # Batch mode
    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    logger.info(f"Found {len(files)} Dutch custodian files")

    # Handle resume
    # --start-index is overridden by the checkpoint when --resume is given.
    start_index = args.start_index
    if args.resume:
        checkpoint = load_checkpoint()
        if 'last_processed_index' in checkpoint:
            start_index = checkpoint['last_processed_index'] + 1
            logger.info(f"Resuming from index {start_index}")

    # Apply limit
    end_index = len(files)
    if args.limit:
        end_index = min(start_index + args.limit, len(files))

    logger.info(f"Processing files {start_index} to {end_index - 1}")

    # Process files
    success_count = 0
    error_count = 0

    async with AsyncWebCrawler() as crawler:
        for i, filepath in enumerate(files[start_index:end_index], start=start_index):
            logger.info(f"[{i + 1}/{len(files)}] Processing {filepath.name}")

            success = await process_single_file(crawler, filepath, args.dry_run, args.force)

            if success:
                success_count += 1
            else:
                error_count += 1

            # Save checkpoint
            # Written after every file so a crash loses at most one entry.
            if not args.dry_run:
                save_checkpoint({
                    'last_processed_index': i,
                    'last_processed_file': str(filepath),
                    'last_processed_time': datetime.now(timezone.utc).isoformat(),
                    'success_count': success_count,
                    'error_count': error_count,
                })

            # Rate limiting
            # Skipped after the final file of the window.
            if i < end_index - 1:
                await asyncio.sleep(REQUEST_DELAY)

    # Summary
    logger.info(f"\n{'='*50}")
    logger.info(f"Enrichment complete!")
    logger.info(f" Success: {success_count}")
    logger.info(f" Errors: {error_count}")
    logger.info(f" Log file: {log_file}")


if __name__ == '__main__':
    asyncio.run(main())
|
||||||
281
scripts/fix_collision_victims.py
Normal file
281
scripts/fix_collision_victims.py
Normal file
|
|
@ -0,0 +1,281 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fix GHCID collision victim files.
|
||||||
|
|
||||||
|
These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml)
|
||||||
|
indicating they were collision victims whose internal GHCID was incorrectly set
|
||||||
|
to their collision partner's GHCID instead of getting their own unique GHCID.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Reads the institution's real name from original_entry.organisatie
|
||||||
|
2. Generates a proper name suffix from that name
|
||||||
|
3. Creates a new unique GHCID with the proper suffix
|
||||||
|
4. Regenerates all GHCID-derived identifiers (UUID, numeric)
|
||||||
|
5. Updates the file with correct identifiers
|
||||||
|
6. Renames the file to match the new GHCID
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import unicodedata
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# GHCID namespace for UUID generation
|
||||||
|
GHCID_NAMESPACE = uuid.NAMESPACE_URL
|
||||||
|
GHCID_URL_PREFIX = "https://glam.registry/"
|
||||||
|
|
||||||
|
# Skip words for abbreviation generation (Dutch and common)
|
||||||
|
SKIP_WORDS = {
|
||||||
|
'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
|
||||||
|
's', 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder',
|
||||||
|
'door', 'en', 'of', 'stichting', 'vereniging', 'foundation', 'the', 'a',
|
||||||
|
'an', 'of', 'in', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as',
|
||||||
|
'museum', 'bibliotheek', 'archief', 'collectie'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_diacritics(text: str) -> str:
    """Strip combining accent marks, e.g. 'Musée' -> 'Musee'.

    Decomposes to NFD so each accented letter becomes a base letter plus
    combining marks, then drops every combining mark (category 'Mn').
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
|
||||||
|
|
||||||
|
|
||||||
|
def generate_name_suffix(native_name: str) -> str:
    """Convert a native-language institution name to a snake_case suffix.

    Examples:
        "Biblionet Drenthe POI" -> "biblionet_drenthe_poi"
        "Fries Verzetsmuseum" -> "fries_verzetsmuseum"
        "Musée d'Orsay" -> "musee_dorsay"
    """
    # Accent-free, lower-cased starting point.
    text = normalize_diacritics(native_name).lower()
    # Punctuation that should vanish rather than become a separator.
    text = re.sub(r"[''`\",.:;!?()[\]{}]", '', text)
    # Whitespace and hyphens collapse into single underscores.
    text = re.sub(r'[\s\-]+', '_', text)
    # Anything else non-alphanumeric is discarded.
    text = re.sub(r'[^a-z0-9_]', '', text)
    # Collapse underscore runs and trim the ends.
    return re.sub(r'_+', '_', text).strip('_')
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ghcid_uuid(ghcid: str) -> str:
    """Derive the deterministic UUID v5 for a GHCID from its registry URL."""
    registry_name = f"{GHCID_URL_PREFIX}{ghcid}"
    return str(uuid.uuid5(GHCID_NAMESPACE, registry_name))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Generate a second deterministic UUID from a GHCID, in a distinct
    'sha256/'-prefixed URL namespace.

    NOTE(review): despite the function name, this uses uuid5, which
    hashes with SHA-1 — not SHA-256, and it is not a UUID v8. Only the
    namespace string differs from generate_ghcid_uuid. Changing the
    algorithm now would invalidate all persisted identifiers, so the
    misleading name is documented here instead of being fixed.
    """
    return str(uuid.uuid5(GHCID_NAMESPACE, f"{GHCID_URL_PREFIX}sha256/{ghcid}"))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_ghcid_numeric(ghcid: str) -> int:
    """Derive a stable 64-bit integer ID from a GHCID.

    Uses the first 16 hex digits (64 bits) of the SHA-256 digest of the
    GHCID string, so the mapping is deterministic across runs.
    """
    digest = hashlib.sha256(ghcid.encode()).hexdigest()
    return int(digest[:16], 16)
|
||||||
|
|
||||||
|
|
||||||
|
def fix_collision_victim(file_path: Path, dry_run: bool = False) -> Optional[Path]:
    """Fix a single collision victim file.

    A collision victim has a trailing-dash filename and may carry its
    collision partner's GHCID internally. This regenerates a unique
    GHCID from the institution's own name, rewrites all GHCID-derived
    identifiers (UUID, numeric), records the old GHCID in history, and
    renames the file to match.

    Args:
        file_path: Path to the collision victim YAML file
        dry_run: If True, only print what would be done

    Returns:
        New file path after renaming, or None if skipped/failed
    """
    print(f"\n{'='*80}")
    print(f"Processing: {file_path.name}")
    print(f"{'='*80}")

    # Read file
    try:
        with open(file_path) as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" ERROR: Could not read file: {e}")
        return None

    if data is None:
        print(f" SKIP: File is empty or invalid")
        return None

    # Get institution name
    # The original CSV name is the only trustworthy identity source here.
    org_name = data.get('original_entry', {}).get('organisatie')
    if not org_name:
        print(f" ERROR: No organisatie found in original_entry")
        return None

    print(f" Institution: {org_name}")

    # Get current GHCID info
    ghcid_data = data.get('ghcid', {})
    old_ghcid = ghcid_data.get('ghcid_current', '')
    print(f" Old GHCID: {old_ghcid}")

    # Extract base GHCID from filename (remove trailing dash)
    base_ghcid = file_path.stem.rstrip('-')
    print(f" Base GHCID: {base_ghcid}")

    # Generate new name suffix from institution name
    name_suffix = generate_name_suffix(org_name)
    print(f" Name suffix: {name_suffix}")

    # Create new GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    print(f" New GHCID: {new_ghcid}")

    # Check if this would be the same as old (only filename is wrong)
    if new_ghcid == old_ghcid:
        expected_filename = f"{new_ghcid}.yaml"
        if file_path.name != expected_filename:
            print(f" GHCID correct, but filename wrong - needs rename only")
            if dry_run:
                print(f" DRY RUN: Would rename to {expected_filename}")
                return None

            new_file_path = file_path.parent / expected_filename
            if new_file_path.exists():
                print(f" ERROR: Target file already exists: {new_file_path.name}")
                return None

            shutil.move(str(file_path), str(new_file_path))
            print(f" Renamed: {file_path.name} → {new_file_path.name}")
            return new_file_path
        else:
            print(f" SKIP: GHCID and filename both correct")
            return None

    # Generate new identifiers
    # All derived IDs must be regenerated together from the new GHCID.
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    print(f" New UUID: {new_uuid}")
    print(f" New numeric: {new_numeric}")

    if dry_run:
        print(f" DRY RUN: Would update file and rename to {new_ghcid}.yaml")
        return None

    # Update GHCID section
    timestamp = datetime.now(timezone.utc).isoformat()

    # Preserve old GHCID in history
    ghcid_history = ghcid_data.get('ghcid_history', [])

    # Add history entry for the fix
    ghcid_history.append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0),
        'valid_from': ghcid_data.get('generated_at', ''),
        'valid_to': timestamp,
        'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'"
    })

    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': new_uuid_sha256,
        'ghcid_numeric': new_numeric,
        'generated_at': timestamp,
        'ghcid_history': ghcid_history
    }

    # Update identifiers list
    # Only GHCID-derived schemes are rewritten; others pass through untouched.
    identifiers = data.get('identifiers', [])
    updated_identifiers = []
    for ident in identifiers:
        scheme = ident.get('identifier_scheme', '')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
            ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}"
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
        updated_identifiers.append(ident)
    data['identifiers'] = updated_identifiers

    # Write updated data back to file
    # NOTE: content is written before the rename; a crash between the two
    # leaves a corrected file under the old (trailing-dash) name.
    with open(file_path, 'w') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f" Updated file content")

    # Rename file to match new GHCID
    new_file_path = file_path.parent / f"{new_ghcid}.yaml"

    if new_file_path.exists():
        print(f" ERROR: Target file already exists: {new_file_path.name}")
        return None

    shutil.move(str(file_path), str(new_file_path))
    print(f" Renamed: {file_path.name} → {new_file_path.name}")

    return new_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: repair every GHCID collision victim file (or one named file)."""
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    # Either a single explicitly named file, or every collision victim —
    # recognisable by the trailing dash before the .yaml extension.
    files = [Path(args.file)] if args.file else sorted(custodian_dir.glob('NL-*-.yaml'))

    print(f"Found {len(files)} collision victim file(s)")

    fixed = skipped = errors = 0

    for victim in files:
        outcome = fix_collision_victim(victim, dry_run=args.dry_run)
        if outcome:
            fixed += 1
            continue
        if outcome is not None:
            continue
        # None means the fixer declined; distinguish truly empty files
        # (deletion candidates) from ordinary skips.
        if victim.stat().st_size == 0:
            print(f"\n EMPTY FILE: {victim.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    print(f"\n{'='*80}")
    print(f"SUMMARY")
    print(f"{'='*80}")
    print(f" Fixed: {fixed}")
    print(f" Skipped: {skipped}")
    print(f" Errors/Empty: {errors}")


if __name__ == '__main__':
    main()
|
||||||
140
scripts/fix_generic_platform_names.py
Executable file
140
scripts/fix_generic_platform_names.py
Executable file
|
|
@ -0,0 +1,140 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fix generic platform names ('Home Website', 'Homepage Website') by using
|
||||||
|
the organisatie field from original_entry.
|
||||||
|
|
||||||
|
Also filters invalid platform types (ONLINEMARKETING, ONLINEBRANDING).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# Custom YAML representer to preserve formatting
|
||||||
|
def str_representer(dumper, data):
    """Represent strings for YAML, using literal block style ('|') for multi-line values."""
    style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=style)
|
||||||
|
|
||||||
|
yaml.add_representer(str, str_representer)
|
||||||
|
|
||||||
|
GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'}
|
||||||
|
INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'}
|
||||||
|
|
||||||
|
def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix one custodian YAML file in place.

    Applies two data-quality repairs to the ``digital_platform_v2`` section:

    1. Replaces a generic platform name (e.g. 'Home Website') with
       '<organisation> Website', taking the organisation name from
       original_entry.organisatie, the museum register enrichment, or the
       Wikidata enrichment — in that order of preference.
    2. Removes invalid platform type codes (ONLINEMARKETING, ...), falling
       back to ['INSTITUTIONAL_WEBSITE'] if nothing valid remains.

    Args:
        filepath: Path to the custodian YAML file.
        dry_run: When True, compute stats but do not write the file.

    Returns:
        Stats dict with keys 'name_fixed', 'types_fixed', 'old_name',
        'new_name' and 'removed_types'.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'removed_types': []
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return stats

    # BUGFIX: a YAML key with no value loads as None, so sections must be
    # guarded before being treated as mappings ('x' in None raises TypeError).
    dpv2 = data.get('digital_platform_v2')
    if not isinstance(dpv2, dict):
        return stats

    modified = False

    # Fix 1: Generic platform names — take the first non-empty organisation
    # name available (falsy values fall through, matching the fast variant).
    current_name = dpv2.get('platform_name', '')
    if current_name in GENERIC_NAMES:
        org_name = (
            (data.get('original_entry') or {}).get('organisatie')
            or (data.get('museum_register_enrichment') or {}).get('museum_name')
            or (data.get('wikidata_enrichment') or {}).get('wikidata_label_nl')
        )

        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name
            modified = True

    # Fix 2: Invalid platform types
    if 'platform_type' in dpv2 and isinstance(dpv2['platform_type'], list):
        original_types = dpv2['platform_type'].copy()
        filtered_types = [t for t in original_types if t not in INVALID_TYPES]

        if len(filtered_types) < len(original_types):
            stats['removed_types'] = [t for t in original_types if t in INVALID_TYPES]
            stats['types_fixed'] = True
            dpv2['platform_type'] = filtered_types if filtered_types else ['INSTITUTIONAL_WEBSITE']
            modified = True

    # Record when (and for names, why) the file was touched so audits can
    # tell automated quality fixes apart from original transformation output.
    if modified:
        if '_transformation_metadata' not in dpv2:
            dpv2['_transformation_metadata'] = {}
        dpv2['_transformation_metadata']['quality_fix_date'] = datetime.now(timezone.utc).isoformat()
        if stats['name_fixed']:
            dpv2['_transformation_metadata']['name_source'] = 'organisatie_field'

        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: scan the custodian directory and fix generic names/types."""
    import argparse
    parser = argparse.ArgumentParser(description='Fix generic platform names')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without modifying files')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian', help='Path to custodian files')
    args = parser.parse_args()

    custodian_path = Path(args.path)
    prefix = '[DRY RUN] ' if args.dry_run else ''

    name_fix_count = 0
    type_fix_count = 0
    modified_count = 0

    print(f"{prefix}Scanning {custodian_path}...")
    print()

    for filepath in sorted(custodian_path.glob('NL-*.yaml')):
        stats = fix_file(filepath, dry_run=args.dry_run)

        if not (stats['name_fixed'] or stats['types_fixed']):
            continue
        modified_count += 1

        if stats['name_fixed']:
            name_fix_count += 1
            print(f"✓ {filepath.name}")
            print(f" Name: '{stats['old_name']}' → '{stats['new_name']}'")

        if stats['types_fixed']:
            type_fix_count += 1
            print(f" Removed types: {stats['removed_types']}")

    print()
    print("=" * 60)
    print(f"{prefix}Summary:")
    print(f" Files with name fixed: {name_fix_count}")
    print(f" Files with types fixed: {type_fix_count}")
    print(f" Total files modified: {modified_count}")

    if args.dry_run:
        print()
        print("Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()
|
||||||
97
scripts/fix_generic_platform_names_fast.py
Executable file
97
scripts/fix_generic_platform_names_fast.py
Executable file
|
|
@ -0,0 +1,97 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fast fix for generic platform names - processes only files from stdin or file list.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'}
|
||||||
|
INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'}
|
||||||
|
|
||||||
|
def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Apply name/type quality fixes to one custodian file; return a stats dict."""
    stats = {'name_fixed': False, 'types_fixed': False, 'old_name': None, 'new_name': None, 'removed_types': []}

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    data = yaml.safe_load(content)

    if not data or 'digital_platform_v2' not in data:
        return stats

    dpv2 = data['digital_platform_v2']
    modified = False

    # Generic names: derive a real name from the first enrichment source
    # that carries a non-empty value.
    current_name = dpv2.get('platform_name', '')
    if current_name in GENERIC_NAMES:
        org_name = None
        for section, key in (
            ('original_entry', 'organisatie'),
            ('museum_register_enrichment', 'museum_name'),
            ('wikidata_enrichment', 'wikidata_label_nl'),
        ):
            if section in data and data[section].get(key):
                org_name = data[section][key]
                break

        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name
            modified = True

    # Invalid types: drop blacklisted codes, keeping at least one type.
    platform_types = dpv2.get('platform_type')
    if isinstance(platform_types, list):
        kept = [t for t in platform_types if t not in INVALID_TYPES]
        dropped = [t for t in platform_types if t in INVALID_TYPES]
        if dropped:
            stats['removed_types'] = dropped
            stats['types_fixed'] = True
            dpv2['platform_type'] = kept or ['INSTITUTIONAL_WEBSITE']
            modified = True

    if modified:
        # Stamp the fix so later audits can identify automated changes.
        if '_transformation_metadata' not in dpv2:
            dpv2['_transformation_metadata'] = {}
        dpv2['_transformation_metadata']['quality_fix_date'] = datetime.now(timezone.utc).isoformat()

        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fix files named on stdin or in a list file passed as argv[1]."""
    dry_run = '--dry-run' in sys.argv
    # First positional argument (if any) is a file containing one path per line.
    file_list = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith('--') else None

    if file_list:
        with open(file_list) as f:
            lines = f.readlines()
    else:
        lines = sys.stdin.readlines()
    files = [Path(line.strip()) for line in lines if line.strip()]

    fixed_names = 0
    fixed_types = 0

    for filepath in files:
        if not filepath.exists():
            continue
        stats = fix_file(filepath, dry_run=dry_run)

        if not (stats['name_fixed'] or stats['types_fixed']):
            continue
        if stats['name_fixed']:
            fixed_names += 1
            print(f"✓ {filepath.name}: '{stats['old_name']}' → '{stats['new_name']}'")
        if stats['types_fixed']:
            fixed_types += 1
            print(f"  Removed: {stats['removed_types']}")

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Fixed: {fixed_names} names, {fixed_types} type lists")


if __name__ == '__main__':
    main()
|
||||||
523
scripts/fix_ghcid_type.py
Normal file
523
scripts/fix_ghcid_type.py
Normal file
|
|
@ -0,0 +1,523 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fix GHCID type codes in Dutch custodian files.
|
||||||
|
|
||||||
|
This script corrects GHCID type codes (position 4) for files where the
|
||||||
|
type was incorrectly assigned. Common corrections:
|
||||||
|
- U→M: Unknown should be Museum
|
||||||
|
- U→I: Unknown should be Intangible Heritage
|
||||||
|
- U→T: Unknown should be Taste/Smell Heritage
|
||||||
|
- X→I: Mixed should be Intangible Heritage (single type)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Dry run (preview changes)
|
||||||
|
python scripts/fix_ghcid_type.py --dry-run
|
||||||
|
|
||||||
|
# Apply fixes
|
||||||
|
python scripts/fix_ghcid_type.py
|
||||||
|
|
||||||
|
# Process specific correction type only
|
||||||
|
python scripts/fix_ghcid_type.py --correction U-to-I --dry-run
|
||||||
|
python scripts/fix_ghcid_type.py --correction U-to-M
|
||||||
|
|
||||||
|
# Process a single file
|
||||||
|
python scripts/fix_ghcid_type.py --file data/custodian/NL-DR-FRE-U-FCFE.yaml --new-type I
|
||||||
|
|
||||||
|
Author: GLAM Data Quality Team
|
||||||
|
Date: 2025-12-14
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# GHCID namespace for UUID v5 generation (same as DNS namespace per project spec)
|
||||||
|
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
||||||
|
|
||||||
|
# Type code corrections: filename pattern -> new type code
|
||||||
|
# These are determined by analyzing original_entry.type_organisatie in each file
|
||||||
|
#
|
||||||
|
# Current U-type breakdown (173 files):
|
||||||
|
# - 143 files: type_organisatie: museum → should be M
|
||||||
|
# - 14 files: type_organisatie: intangible_heritage_custodian → should be I
|
||||||
|
# - 7 files: type_organisatie: unknown → keep as U (correct)
|
||||||
|
#
|
||||||
|
# Current X-type files (2 files):
|
||||||
|
# - Both are intangible_heritage_custodian → should be I (single type, not mixed)
|
||||||
|
#
|
||||||
|
TYPE_CORRECTIONS = {
|
||||||
|
# U→I: Intangible heritage custodians incorrectly marked as Unknown (14 files)
|
||||||
|
"U-to-I": {
|
||||||
|
"files": [
|
||||||
|
"NL-DR-FRE-U-FCFE.yaml",
|
||||||
|
"NL-GE-TIE-U-BO.yaml",
|
||||||
|
"NL-LI-VAL-U-C.yaml",
|
||||||
|
"NL-NH-AMS-U-C.yaml",
|
||||||
|
"NL-NH-ASS-U-HA.yaml",
|
||||||
|
"NL-NH-SAN-U-HSO.yaml",
|
||||||
|
"NL-OV-GEN-U-GB.yaml",
|
||||||
|
"NL-OV-GEN-U-GMS.yaml",
|
||||||
|
"NL-OV-OMM-U-EO.yaml",
|
||||||
|
"NL-OV-SAA-U-BS.yaml",
|
||||||
|
"NL-ZH-BOD-U-GB.yaml",
|
||||||
|
"NL-ZH-GOU-U-BI.yaml",
|
||||||
|
"NL-ZH-HIL-U-HHO.yaml",
|
||||||
|
"NL-ZH-LIS-U-HLO.yaml",
|
||||||
|
],
|
||||||
|
"old_type": "U",
|
||||||
|
"new_type": "I",
|
||||||
|
"reason": "Type corrected: intangible_heritage_custodian should use type I (Intangible Heritage), not U (Unknown)",
|
||||||
|
},
|
||||||
|
# X→I: Mixed type should be Intangible (single primary type) (2 files)
|
||||||
|
"X-to-I": {
|
||||||
|
"files": [
|
||||||
|
"NL-OV-KAL-X-BW.yaml",
|
||||||
|
"NL-GE-HAT-X-IGR.yaml",
|
||||||
|
],
|
||||||
|
"old_type": "X",
|
||||||
|
"new_type": "I",
|
||||||
|
"reason": "Type corrected: intangible_heritage_custodian should use type I (Intangible Heritage), not X (Mixed)",
|
||||||
|
},
|
||||||
|
# U→M: Museums incorrectly marked as Unknown (143 files)
|
||||||
|
# Use --auto-detect-museums flag to populate this list dynamically
|
||||||
|
"U-to-M": {
|
||||||
|
"files": [
|
||||||
|
# Auto-detected by checking original_entry.type_organisatie == "museum"
|
||||||
|
# Run with: python scripts/fix_ghcid_type.py --auto-detect-museums --dry-run
|
||||||
|
],
|
||||||
|
"old_type": "U",
|
||||||
|
"new_type": "M",
|
||||||
|
"reason": "Type corrected: museum should use type M (Museum), not U (Unknown)",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def generate_uuid_v5(ghcid_string: str) -> str:
    """Derive the deterministic, namespaced UUID v5 for a GHCID string."""
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Derive a UUID v8 whose bytes come from the SHA-256 of the GHCID string."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])      # a UUID is exactly 16 bytes wide
    raw[6] = (raw[6] & 0x0F) | 0x80   # stamp the version nibble to 8 (custom)
    raw[8] = (raw[8] & 0x3F) | 0x80   # stamp the RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_numeric_id(ghcid_string: str) -> int:
    """Map a GHCID string to a stable 64-bit unsigned integer via SHA-256."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    # The leading 8 hash bytes, read big-endian, give a uniform 64-bit ID.
    return int.from_bytes(digest[:8], byteorder='big')
|
||||||
|
|
||||||
|
|
||||||
|
def fix_ghcid_type(ghcid: str, old_type: str, new_type: str) -> str:
    """
    Return *ghcid* with its type code replaced.

    GHCID format: CC-RR-CCC-T-ABBREV[-suffix]; the type code is the
    fourth dash-separated segment (0-indexed position 3).

    Raises:
        ValueError: if the GHCID has too few segments, or its current
            type code is not *old_type*.

    Examples:
        NL-DR-FRE-U-FCFE → NL-DR-FRE-I-FCFE
        NL-OV-KAL-X-BW → NL-OV-KAL-I-BW
    """
    segments = ghcid.split('-')
    if len(segments) < 5:
        raise ValueError(f"Invalid GHCID format: {ghcid}")

    found = segments[3]
    if found != old_type:
        raise ValueError(f"Expected type '{old_type}' but found '{found}' in GHCID: {ghcid}")

    return '-'.join(segments[:3] + [new_type] + segments[4:])
|
||||||
|
|
||||||
|
|
||||||
|
def process_file(
    file_path: Path,
    old_type: str,
    new_type: str,
    reason: str,
    dry_run: bool = True
) -> Optional[dict]:
    """
    Fix the GHCID type code in a single custodian YAML file.

    Rewrites ghcid_current and all identifiers derived from it (UUID v5,
    UUID v8/SHA-256, 64-bit numeric), prepends a ghcid_history entry that
    documents the change, keeps original_entry.type consistent, writes the
    file back, and renames it to match the new GHCID.

    Args:
        file_path: Custodian YAML file to process.
        old_type: Type code the GHCID is expected to carry now.
        new_type: Type code to substitute at position 4.
        reason: Human-readable justification stored in ghcid_history.
        dry_run: When True, compute and return the change without touching
            the file.

    Returns:
        dict describing the change (old/new GHCID, UUIDs, numeric IDs and,
        after a rename, 'new_file'), or None if nothing changed or the file
        could not be processed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {file_path}: {e}")
        return None

    if not data or 'ghcid' not in data:
        print(f" Warning: No ghcid section in {file_path}")
        return None

    ghcid_section = data.get('ghcid', {})
    old_ghcid = ghcid_section.get('ghcid_current', '')

    if not old_ghcid:
        print(f" Warning: No ghcid_current in {file_path}")
        return None

    # Check if the type matches what we expect to fix
    parts = old_ghcid.split('-')
    if len(parts) < 5:
        print(f" Warning: Invalid GHCID format in {file_path}: {old_ghcid}")
        return None

    current_type = parts[3]
    if current_type != old_type:
        print(f" Skipping {file_path}: type is '{current_type}', expected '{old_type}'")
        return None

    # Fix the GHCID
    try:
        new_ghcid = fix_ghcid_type(old_ghcid, old_type, new_type)
    except ValueError as e:
        print(f" Error: {e}")
        return None

    if new_ghcid == old_ghcid:
        return None

    # Generate new identifiers (all deterministically derived from the GHCID)
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(file_path),
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_type': old_type,
        'new_type': new_type,
        'old_uuid': ghcid_section.get('ghcid_uuid', ''),
        'new_uuid': new_uuid_v5,
        'old_numeric': ghcid_section.get('ghcid_numeric', 0),
        'new_numeric': new_numeric,
    }

    if dry_run:
        return change_info

    # Update ghcid section
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric
    # Keep ghcid_original as-is (for historical reference)

    # Add history entry for the fix
    ghcid_history = ghcid_section.get('ghcid_history', [])

    # Add new entry at the beginning
    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': reason,
    }

    # Mark previous entry as superseded
    if ghcid_history:
        if 'valid_to' not in ghcid_history[0] or ghcid_history[0]['valid_to'] is None:
            ghcid_history[0]['valid_to'] = timestamp_now
        ghcid_history[0]['superseded_by'] = new_ghcid

    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Update identifiers section
    identifiers = data.get('identifiers', [])
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
            # BUGFIX: also refresh any resolvable URL embedding the old
            # GHCID, so it does not go stale after the type change
            # (matches the behavior of fix_collision_victims.py).
            if isinstance(ident.get('identifier_url'), str):
                ident['identifier_url'] = ident['identifier_url'].replace(old_ghcid, new_ghcid)
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
            ident['identifier_url'] = f"urn:uuid:{new_uuid_v5}"
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
            ident['identifier_url'] = f"urn:uuid:{new_uuid_v8}"
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Also update original_entry.type if present (to keep consistency)
    if 'original_entry' in data and 'type' in data['original_entry']:
        # Update type list to use new type
        current_types = data['original_entry']['type']
        if isinstance(current_types, list):
            # Replace old type with new type in the list
            data['original_entry']['type'] = [
                new_type if t == old_type else t for t in current_types
            ]

    # Write updated file
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match new GHCID
    old_filename = file_path.name
    new_filename = f"{new_ghcid}.yaml"

    if old_filename != new_filename:
        new_file_path = file_path.parent / new_filename
        if new_file_path.exists():
            print(f" Warning: Target file already exists: {new_file_path}")
            # Don't rename if target exists
        else:
            shutil.move(str(file_path), str(new_file_path))
            change_info['new_file'] = str(new_file_path)

    return change_info
|
||||||
|
|
||||||
|
|
||||||
|
def find_files_for_correction(
    custodian_dir: Path,
    correction_key: str
) -> list[Path]:
    """Resolve the existing file paths listed for one correction batch."""
    correction = TYPE_CORRECTIONS.get(correction_key)
    if not correction:
        print(f"Unknown correction type: {correction_key}")
        return []

    resolved = []
    for filename in correction['files']:
        candidate = custodian_dir / filename
        if candidate.exists():
            resolved.append(candidate)
        else:
            # Missing files are reported but do not abort the batch.
            print(f" Warning: File not found: {candidate}")

    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def auto_detect_museum_files(custodian_dir: Path) -> list[Path]:
    """
    Find Dutch custodian files whose GHCID type U should become M.

    A file qualifies when its filename carries type code U (Unknown) and
    its original_entry.type_organisatie declares it a museum.
    """
    matches = []

    # Only Dutch files with Unknown type are candidates.
    for candidate in custodian_dir.glob("NL-*-U-*.yaml"):
        try:
            with open(candidate, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if not data:
                continue

            declared = data.get('original_entry', {}).get('type_organisatie', '')
            if declared.lower() == 'museum':
                matches.append(candidate)
        except Exception:
            # Unreadable or oddly shaped files are silently skipped; they
            # are handled by other data-quality tooling.
            continue

    return matches
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point for the GHCID type-correction script.

    Supports three modes:
      * ``--file`` + ``--new-type``: fix one explicitly named file;
      * ``--auto-detect-museums``: scan for U-typed files whose
        original_entry says "museum" and convert them U→M;
      * batch (default): apply the file lists hard-coded in
        TYPE_CORRECTIONS, optionally narrowed with ``--correction``.

    Returns a process exit code (0 success, 1 usage/setup error).
    """
    parser = argparse.ArgumentParser(
        description="Fix GHCID type codes in Dutch custodian files"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying files'
    )
    parser.add_argument(
        '--correction',
        choices=['U-to-I', 'U-to-M', 'X-to-I', 'all'],
        default='all',
        help='Which correction type to apply (default: all)'
    )
    parser.add_argument(
        '--file',
        type=str,
        help='Process a single file instead of batch'
    )
    parser.add_argument(
        '--new-type',
        type=str,
        help='New type code when processing single file'
    )
    parser.add_argument(
        '--auto-detect-museums',
        action='store_true',
        help='Auto-detect museum files based on type_organisatie field'
    )
    parser.add_argument(
        '--custodian-dir',
        type=str,
        default='data/custodian',
        help='Path to custodian directory (default: data/custodian)'
    )

    args = parser.parse_args()

    # Find project root (where data/ directory is) — the script lives in
    # scripts/, so the project root is one level up.
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    custodian_dir = project_root / args.custodian_dir

    if not custodian_dir.exists():
        print(f"Error: Custodian directory not found: {custodian_dir}")
        return 1

    print(f"GHCID Type Correction Script")
    print(f"{'=' * 50}")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'APPLY CHANGES'}")
    print(f"Custodian directory: {custodian_dir}")
    print()

    # Collected change-info dicts returned by process_file.
    all_changes = []

    # Single file mode
    if args.file:
        if not args.new_type:
            print("Error: --new-type is required when using --file")
            return 1

        file_path = Path(args.file)
        if not file_path.is_absolute():
            # Relative paths are resolved against the project root, not CWD.
            file_path = project_root / file_path

        if not file_path.exists():
            print(f"Error: File not found: {file_path}")
            return 1

        # Detect old type from filename (4th dash-separated segment of the stem)
        parts = file_path.stem.split('-')
        if len(parts) >= 4:
            old_type = parts[3]
        else:
            print(f"Error: Cannot determine type from filename: {file_path}")
            return 1

        reason = f"Type corrected: {old_type} → {args.new_type} (manual correction)"

        print(f"Processing single file: {file_path}")
        change = process_file(file_path, old_type, args.new_type, reason, args.dry_run)
        if change:
            all_changes.append(change)

    # Auto-detect museum files
    elif args.auto_detect_museums:
        print("Auto-detecting museum files...")
        museum_files = auto_detect_museum_files(custodian_dir)
        print(f"Found {len(museum_files)} museum files with type U")

        # Update the U-to-M correction with detected files
        # NOTE: this mutates the module-level TYPE_CORRECTIONS in place.
        TYPE_CORRECTIONS['U-to-M']['files'] = [f.name for f in museum_files]

        # Process them
        correction = TYPE_CORRECTIONS['U-to-M']
        for file_path in museum_files:
            change = process_file(
                file_path,
                correction['old_type'],
                correction['new_type'],
                correction['reason'],
                args.dry_run
            )
            if change:
                all_changes.append(change)

    # Batch mode
    else:
        corrections_to_apply = []

        if args.correction == 'all':
            corrections_to_apply = list(TYPE_CORRECTIONS.keys())
        else:
            corrections_to_apply = [args.correction]

        for correction_key in corrections_to_apply:
            correction = TYPE_CORRECTIONS[correction_key]

            if not correction['files']:
                # e.g. U-to-M is empty unless --auto-detect-museums ran.
                print(f"\nSkipping {correction_key}: no files specified")
                continue

            print(f"\nProcessing {correction_key}:")
            print(f" {correction['old_type']} → {correction['new_type']}")
            print(f" Files: {len(correction['files'])}")

            files = find_files_for_correction(custodian_dir, correction_key)

            for file_path in files:
                change = process_file(
                    file_path,
                    correction['old_type'],
                    correction['new_type'],
                    correction['reason'],
                    args.dry_run
                )
                if change:
                    all_changes.append(change)

    # Summary
    print(f"\n{'=' * 50}")
    print(f"SUMMARY")
    print(f"{'=' * 50}")

    if not all_changes:
        print("No changes needed or no matching files found.")
        return 0

    print(f"Total changes: {len(all_changes)}")
    print()

    # Group by type change (e.g. "U→I") for a compact report.
    by_type_change = {}
    for change in all_changes:
        key = f"{change['old_type']}→{change['new_type']}"
        if key not in by_type_change:
            by_type_change[key] = []
        by_type_change[key].append(change)

    for key, changes in sorted(by_type_change.items()):
        print(f"\n{key}: {len(changes)} files")
        for change in changes:
            print(f" {change['old_ghcid']} → {change['new_ghcid']}")
            if 'new_file' in change:
                print(f" Renamed to: {Path(change['new_file']).name}")

    if args.dry_run:
        print(f"\n{'=' * 50}")
        print("DRY RUN - No files were modified.")
        print("Run without --dry-run to apply changes.")
    else:
        print(f"\n{'=' * 50}")
        print(f"Successfully updated {len(all_changes)} files.")

    return 0


if __name__ == '__main__':
    exit(main())
|
||||||
269
scripts/fix_simon_kemper_contamination.py
Normal file
269
scripts/fix_simon_kemper_contamination.py
Normal file
|
|
@ -0,0 +1,269 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fix Simon Kemper contamination in entity profiles.
|
||||||
|
|
||||||
|
For entries where:
|
||||||
|
1. Name is "Simon Kemper"
|
||||||
|
2. But the LinkedIn slug clearly indicates a different person
|
||||||
|
|
||||||
|
We derive the correct name from the slug and update the profile.
|
||||||
|
|
||||||
|
IMPORTANT: Per Rule 21 (Data Fabrication Prohibition) - if we cannot reliably
|
||||||
|
derive the name from the slug, we mark it as "Unknown" rather than guessing.
|
||||||
|
Compound slugs without hyphens (like "jponjee") cannot be reliably parsed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import unquote
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
|
||||||
|
def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without separators.

    A "compound" slug fuses several name parts with no hyphen between them
    (e.g. 'jponjee' could be 'J. Ponjee' or 'J Ponjee'); such slugs cannot be
    split into name parts reliably, so callers must treat the name as unknown.

    Returns True for slugs like:
    - 'jponjee' (no hyphens, all lowercase)
    - 'sharellyemanuelson'
    - 'addieroelofsen'
    - 'adheliap'

    Returns False for slugs like:
    - 'willem-blok' (has hyphens)
    - 'jan-van-den-borre' (has hyphens)
    - 'miriam-h' (has hyphens, even if short)
    - 'olivi%C3%AB-7153658' (has hyphens after URL decoding)

    Bug fix: the previous implementation stripped the trailing numeric/hex
    profile ID *before* testing for hyphens, so a mononym slug with an ID
    suffix such as 'olivi%C3%AB-7153658' was wrongly reported as compound —
    contradicting the examples documented here and in slug_to_name().
    The hyphen test now runs on the decoded slug as-is.
    """
    # Decode URL encoding first (e.g. %C3%AB -> ë) so any percent-encoded
    # separators are visible to the test below.
    decoded = unquote(slug)

    # A slug containing at least one hyphen has explicit name-part separators
    # and can be parsed; only a fully fused slug is unparseable.
    return '-' not in decoded
|
||||||
|
|
||||||
|
|
||||||
|
def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.

    Returns:
        tuple: (name, is_reliable) where:
        - name: The derived name or "Unknown"
        - is_reliable: True if we're confident in the derivation

    Examples:
        'willem-blok-b6a46648' -> ('Willem Blok', True)
        'dave-van-den-nieuwenhof-4446b3146' -> ('Dave van den Nieuwenhof', True)
        'olivi%C3%AB-7153658' -> ('Olivië', True)
        'jponjee' -> ('Unknown', False)  # Compound slug, cannot parse reliably
        'sharellyemanuelson' -> ('Unknown', False)  # Compound slug
    """
    # Normalize percent-encoding (e.g. %C3%AB -> ë).
    decoded = unquote(slug)

    # Drop the trailing LinkedIn profile ID (hex like '-b6a46648' or a long
    # numeric suffix like '-7153658').
    stripped = re.sub(r'[-_][\da-f]{6,}$', '', decoded)
    stripped = re.sub(r'[-_]\d{5,}$', '', stripped)

    # Compound slugs cannot be split reliably — per Rule 21 we refuse to guess.
    if is_compound_slug(decoded):
        return ("Unknown", False)

    # Break on hyphens and drop any empty fragments.
    tokens = [tok for tok in stripped.split('-') if tok]
    if not tokens:
        return ("Unknown", False)

    # Dutch name particles stay lowercase unless they lead the name.
    particles = {'van', 'de', 'den', 'der', 'het', 't', "'t"}
    words = [
        tok.lower() if (idx > 0 and tok.lower() in particles) else tok.capitalize()
        for idx, tok in enumerate(tokens)
    ]
    name = ' '.join(words)

    # Reject degenerate one-character results.
    if len(name) < 2:
        return ("Unknown", False)

    return (name, True)
|
||||||
|
|
||||||
|
def fix_contaminated_files(entity_dir: Path, dry_run: bool = True):
    """Find and fix Simon Kemper contaminated files.

    Scans every ``*.json`` file in *entity_dir*, detects profiles whose name
    field was overwritten with "Simon Kemper", derives the correct name from
    the LinkedIn slug via ``slug_to_name``, and (unless *dry_run*) rewrites
    the file in place with corrected names plus an audit note.

    Only processes files where name is ACTUALLY "Simon Kemper" (contaminated).
    Skips files where name was already corrected or was never contaminated.

    Args:
        entity_dir: Directory containing per-person entity JSON files.
        dry_run: When True (default), nothing is written; results are only
            collected for reporting.

    Returns:
        tuple: (contaminated_list, fixed_list, unreliable_list)
        - contaminated_list: entries whose name could be derived reliably
        - fixed_list: filenames actually rewritten (empty in dry-run mode)
        - unreliable_list: entries set to "Unknown" (compound slugs)
    """

    contaminated = []
    fixed = []
    unreliable = []  # Files where we couldn't reliably derive the name

    for filepath in entity_dir.glob("*.json"):
        # Unreadable or malformed files are skipped silently — this is a
        # best-effort batch repair, not a validator.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError):
            continue

        # Check if this is a Simon Kemper contamination
        profile_name = data.get('profile_data', {}).get('name', '')
        source_name = data.get('source_staff_info', {}).get('name', '')

        # ONLY process files where the name is ACTUALLY "Simon Kemper"
        if profile_name != 'Simon Kemper' and source_name != 'Simon Kemper':
            continue

        # Get the slug from filename or URL
        # NOTE(review): `filename` is captured but never used below — the slug
        # is taken from the URL only.
        filename = filepath.name
        linkedin_url = data.get('extraction_metadata', {}).get('linkedin_url', '')

        # Extract slug from URL (last path segment after /in/)
        slug_match = re.search(r'/in/([^/]+)/?$', linkedin_url)
        if not slug_match:
            continue

        slug = slug_match.group(1)

        # Check if this is truly contamination (slug doesn't match simon kemper)
        slug_lower = slug.lower().replace('%', '')
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            # This is the real Simon Kemper, skip
            continue

        # Derive correct name from slug
        correct_name, is_reliable = slug_to_name(slug)

        # Report entry describing what was found and what will be written.
        entry = {
            'file': filepath.name,
            'slug': slug,
            'profile_name': profile_name,
            'source_name': source_name,
            'contaminated_field': 'profile_data.name' if profile_name == 'Simon Kemper' else 'source_staff_info.name',
            'correct_name': correct_name,
            'is_reliable': is_reliable,
            'headline': data.get('profile_data', {}).get('headline', ''),
            'custodian': data.get('affiliations', [{}])[0].get('custodian_name', '') if data.get('affiliations') else ''
        }

        if is_reliable:
            contaminated.append(entry)
        else:
            unreliable.append(entry)

        if not dry_run:
            # Fix the data — both name fields get the derived value (or
            # "Unknown"), regardless of which one was contaminated.
            if 'profile_data' in data:
                data['profile_data']['name'] = correct_name
            if 'source_staff_info' in data:
                data['source_staff_info']['name'] = correct_name

            # Add fix metadata
            if 'extraction_metadata' not in data:
                data['extraction_metadata'] = {}

            if is_reliable:
                fix_note = f"Name corrected from 'Simon Kemper' (contamination) to '{correct_name}' (derived from slug) on {datetime.now(timezone.utc).isoformat()}"
            else:
                fix_note = f"Name set to 'Unknown' (was 'Simon Kemper' contamination). Original slug: {slug}. Compound slug cannot be reliably parsed. Fixed on {datetime.now(timezone.utc).isoformat()}"
                # Also preserve slug in a dedicated field for future reference
                data['extraction_metadata']['original_slug'] = slug

            # Append the audit note rather than replacing any existing notes.
            existing_notes = data['extraction_metadata'].get('notes', '')
            if existing_notes:
                data['extraction_metadata']['notes'] = f"{existing_notes} | {fix_note}"
            else:
                data['extraction_metadata']['notes'] = fix_note

            # Write back
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            fixed.append(filepath.name)

    return contaminated, fixed, unreliable
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: scan for contaminated profiles and report/fix them.

    Runs in dry-run mode by default; pass --fix to actually rewrite files.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination')
    parser.add_argument('--fix', action='store_true', help='Actually fix files (default: dry run)')
    args = parser.parse_args()

    # NOTE(review): hard-coded absolute path — assumes this machine's layout.
    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    contaminated, fixed, unreliable = fix_contaminated_files(entity_dir, dry_run=dry_run)

    # Section 1: slugs with hyphens — names derived with confidence.
    print(f"\n{'='*40}")
    print(f"RELIABLY PARSEABLE ({len(contaminated)} files)")
    print(f"{'='*40}")
    print("These slugs have hyphens and can be reliably converted to names:\n")

    for c in contaminated:
        print(f" File: {c['file']}")
        print(f" Slug: {c['slug']}")
        print(f" Contaminated: {c['contaminated_field']} = 'Simon Kemper'")
        print(f" Correct name: '{c['correct_name']}'")
        headline = c['headline']
        # Truncate long headlines for readable console output.
        print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
        print(f" Custodian: {c['custodian']}")
        print()

    # Section 2: compound slugs — per Rule 21 these become "Unknown".
    if unreliable:
        print(f"\n{'='*40}")
        print(f"COMPOUND SLUGS - SET TO 'Unknown' ({len(unreliable)} files)")
        print(f"{'='*40}")
        print("These slugs have no hyphens and cannot be reliably parsed.")
        print("Per Rule 21: Names will be set to 'Unknown' (no hallucination).\n")

        for u in unreliable:
            print(f" File: {u['file']}")
            print(f" Slug: {u['slug']}")
            print(f" Contaminated: {u['contaminated_field']} = 'Simon Kemper'")
            print(f" Will be set to: 'Unknown' (slug preserved in metadata)")
            headline = u['headline']
            print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
            print(f" Custodian: {u['custodian']}")
            print()

    # Final tally.
    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixable: {len(contaminated)}")
    print(f" Set to 'Unknown': {len(unreliable)}")
    print(f" Total: {len(contaminated) + len(unreliable)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixed)} files")
    else:
        print(f"\n⚠️ DRY RUN - No files modified. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -99,6 +99,62 @@ NON_HERITAGE_KEYWORDS = [
|
||||||
'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Organizations that are explicitly NOT heritage institutions
|
||||||
|
# These should never be classified as heritage-relevant
|
||||||
|
NON_HERITAGE_ORGANIZATIONS = [
|
||||||
|
# Banks & Financial
|
||||||
|
'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
|
||||||
|
# Security companies
|
||||||
|
'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
|
||||||
|
# Police/Government (non-cultural)
|
||||||
|
'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
|
||||||
|
# Political parties
|
||||||
|
'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
|
||||||
|
'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
|
||||||
|
# Tech companies (non-heritage)
|
||||||
|
'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
|
||||||
|
'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
|
||||||
|
'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
|
||||||
|
# Telecom
|
||||||
|
'kpn', 'vodafone', 't-mobile', 'ziggo',
|
||||||
|
# Postal / Logistics
|
||||||
|
'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
|
||||||
|
# Healthcare
|
||||||
|
'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
|
||||||
|
# Retail
|
||||||
|
'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
|
||||||
|
# Consulting / Professional services
|
||||||
|
'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
|
||||||
|
'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
|
||||||
|
# Recruitment / HR
|
||||||
|
'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
|
||||||
|
# Energy / Utilities
|
||||||
|
'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
|
||||||
|
# Transport
|
||||||
|
'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
|
||||||
|
# Other
|
||||||
|
'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Heritage organization keywords - organizations that ARE heritage institutions
|
||||||
|
# Used to validate that 'D' (Digital) roles are actually at heritage orgs
|
||||||
|
HERITAGE_ORGANIZATION_KEYWORDS = [
|
||||||
|
# Archives
|
||||||
|
'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
|
||||||
|
'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
|
||||||
|
# Museums
|
||||||
|
'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
||||||
|
'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
|
||||||
|
# Libraries
|
||||||
|
'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
|
||||||
|
# Film/AV heritage
|
||||||
|
'eye film', 'filmmuseum', 'eye ', 'sound and vision',
|
||||||
|
# Heritage platforms
|
||||||
|
'erfgoed', 'heritage', 'cultural', 'cultureel',
|
||||||
|
# Research institutes (heritage-focused)
|
||||||
|
'knaw', 'humanities cluster', 'meertens', 'huygens',
|
||||||
|
]
|
||||||
|
|
||||||
# Lines that indicate LinkedIn UI noise (to skip entirely)
|
# Lines that indicate LinkedIn UI noise (to skip entirely)
|
||||||
NOISE_EXACT = {
|
NOISE_EXACT = {
|
||||||
'0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
|
'0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging',
|
||||||
|
|
@ -276,16 +332,35 @@ def is_location_line(line: str) -> bool:
|
||||||
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
||||||
"""
|
"""
|
||||||
Detect if a headline is heritage-relevant and what type.
|
Detect if a headline is heritage-relevant and what type.
|
||||||
|
|
||||||
|
Two-stage classification:
|
||||||
|
1. Check if organization is explicitly non-heritage (blocklist)
|
||||||
|
2. Check if role/organization matches heritage patterns
|
||||||
|
|
||||||
|
For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
|
||||||
"""
|
"""
|
||||||
headline_lower = headline.lower()
|
headline_lower = headline.lower()
|
||||||
|
|
||||||
# Check for non-heritage indicators
|
# Stage 1: Check for non-heritage organizations (blocklist)
|
||||||
|
for org in NON_HERITAGE_ORGANIZATIONS:
|
||||||
|
if org.lower() in headline_lower:
|
||||||
|
return (False, None)
|
||||||
|
|
||||||
|
# Stage 2: Check for non-heritage role indicators
|
||||||
for keyword in NON_HERITAGE_KEYWORDS:
|
for keyword in NON_HERITAGE_KEYWORDS:
|
||||||
if keyword.lower() in headline_lower:
|
if keyword.lower() in headline_lower:
|
||||||
return (False, None)
|
return (False, None)
|
||||||
|
|
||||||
|
# Stage 3: Check if this is a heritage organization
|
||||||
|
is_heritage_org = False
|
||||||
|
for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS:
|
||||||
|
if org_keyword.lower() in headline_lower:
|
||||||
|
is_heritage_org = True
|
||||||
|
break
|
||||||
|
|
||||||
# Check heritage keywords by type (order matters - more specific first)
|
# Check heritage keywords by type (order matters - more specific first)
|
||||||
type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C']
|
# 'D' (Digital) is checked last and requires heritage org validation
|
||||||
|
type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from here
|
||||||
|
|
||||||
for heritage_type in type_order:
|
for heritage_type in type_order:
|
||||||
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
||||||
|
|
@ -293,7 +368,15 @@ def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
||||||
if keyword.lower() in headline_lower:
|
if keyword.lower() in headline_lower:
|
||||||
return (True, heritage_type)
|
return (True, heritage_type)
|
||||||
|
|
||||||
# Generic heritage terms
|
# Special handling for 'D' (Digital) - ONLY if at a heritage organization
|
||||||
|
# This prevents generic IT workers from being classified as heritage-relevant
|
||||||
|
if is_heritage_org:
|
||||||
|
digital_keywords = HERITAGE_KEYWORDS.get('D', [])
|
||||||
|
for keyword in digital_keywords:
|
||||||
|
if keyword.lower() in headline_lower:
|
||||||
|
return (True, 'D')
|
||||||
|
|
||||||
|
# Generic heritage terms (without specific type)
|
||||||
generic_heritage = [
|
generic_heritage = [
|
||||||
'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
|
'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural',
|
||||||
'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
|
'film', 'cinema', 'media', 'arts', 'kunst', 'creative',
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,62 @@ NON_HERITAGE_KEYWORDS = [
|
||||||
'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Organizations that are explicitly NOT heritage institutions
|
||||||
|
# These should never be classified as heritage-relevant
|
||||||
|
NON_HERITAGE_ORGANIZATIONS = [
|
||||||
|
# Banks & Financial
|
||||||
|
'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
|
||||||
|
# Security companies
|
||||||
|
'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
|
||||||
|
# Police/Government (non-cultural)
|
||||||
|
'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
|
||||||
|
# Political parties
|
||||||
|
'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
|
||||||
|
'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
|
||||||
|
# Tech companies (non-heritage)
|
||||||
|
'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
|
||||||
|
'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
|
||||||
|
'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
|
||||||
|
# Telecom
|
||||||
|
'kpn', 'vodafone', 't-mobile', 'ziggo',
|
||||||
|
# Postal / Logistics
|
||||||
|
'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
|
||||||
|
# Healthcare
|
||||||
|
'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
|
||||||
|
# Retail
|
||||||
|
'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
|
||||||
|
# Consulting / Professional services
|
||||||
|
'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
|
||||||
|
'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
|
||||||
|
# Recruitment / HR
|
||||||
|
'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
|
||||||
|
# Energy / Utilities
|
||||||
|
'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
|
||||||
|
# Transport
|
||||||
|
'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
|
||||||
|
# Other
|
||||||
|
'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Heritage organization keywords - organizations that ARE heritage institutions
|
||||||
|
# Used to validate that 'D' (Digital) roles are actually at heritage orgs
|
||||||
|
HERITAGE_ORGANIZATION_KEYWORDS = [
|
||||||
|
# Archives
|
||||||
|
'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
|
||||||
|
'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
|
||||||
|
# Museums
|
||||||
|
'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
|
||||||
|
'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
|
||||||
|
# Libraries
|
||||||
|
'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
|
||||||
|
# Film/AV heritage
|
||||||
|
'eye film', 'filmmuseum', 'eye ', 'sound and vision',
|
||||||
|
# Heritage platforms
|
||||||
|
'erfgoed', 'heritage', 'cultural', 'cultureel',
|
||||||
|
# Research institutes (heritage-focused)
|
||||||
|
'knaw', 'humanities cluster', 'meertens', 'huygens',
|
||||||
|
]
|
||||||
|
|
||||||
# LinkedIn status phrases that pollute name fields (extracted from img alt text)
|
# LinkedIn status phrases that pollute name fields (extracted from img alt text)
|
||||||
# These should be removed from names and stored as metadata
|
# These should be removed from names and stored as metadata
|
||||||
LINKEDIN_STATUS_PHRASES = [
|
LINKEDIN_STATUS_PHRASES = [
|
||||||
|
|
@ -168,8 +224,8 @@ class LinkedInProfileCardParser(HTMLParser):
|
||||||
|
|
||||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||||
attrs_dict = dict(attrs)
|
attrs_dict = dict(attrs)
|
||||||
attr_id = attrs_dict.get('id', '')
|
attr_id = attrs_dict.get('id') or ''
|
||||||
attr_class = attrs_dict.get('class', '')
|
attr_class = attrs_dict.get('class') or ''
|
||||||
|
|
||||||
# Detect profile card start - can be on <a> tag (regular) OR <img> tag (anonymous)
|
# Detect profile card start - can be on <a> tag (regular) OR <img> tag (anonymous)
|
||||||
if 'org-people-profile-card__profile-image' in attr_id:
|
if 'org-people-profile-card__profile-image' in attr_id:
|
||||||
|
|
@ -367,28 +423,58 @@ class LinkedInProfileCardParser(HTMLParser):
|
||||||
|
|
||||||
|
|
||||||
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]:
|
||||||
"""Detect if a headline is heritage-relevant and what type."""
|
"""
|
||||||
|
Detect if a headline is heritage-relevant and what type.
|
||||||
|
|
||||||
|
Two-stage classification:
|
||||||
|
1. Check if organization is explicitly non-heritage (blocklist)
|
||||||
|
2. Check if role/organization matches heritage patterns
|
||||||
|
|
||||||
|
For 'D' (Digital) type, require BOTH a tech role AND a heritage organization.
|
||||||
|
This prevents generic IT workers at banks/police from being classified as heritage.
|
||||||
|
"""
|
||||||
if not headline:
|
if not headline:
|
||||||
return (False, None)
|
return (False, None)
|
||||||
|
|
||||||
headline_lower = headline.lower()
|
headline_lower = headline.lower()
|
||||||
|
|
||||||
# Check non-heritage first
|
# Stage 1: Check for non-heritage organizations (blocklist)
|
||||||
|
for org in NON_HERITAGE_ORGANIZATIONS:
|
||||||
|
if org.lower() in headline_lower:
|
||||||
|
return (False, None)
|
||||||
|
|
||||||
|
# Stage 2: Check for non-heritage role indicators
|
||||||
for keyword in NON_HERITAGE_KEYWORDS:
|
for keyword in NON_HERITAGE_KEYWORDS:
|
||||||
if keyword.lower() in headline_lower:
|
if keyword.lower() in headline_lower:
|
||||||
return (False, None)
|
return (False, None)
|
||||||
|
|
||||||
# Check heritage keywords by type
|
# Stage 3: Check if this is a heritage organization
|
||||||
type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C']
|
is_heritage_org = False
|
||||||
|
for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS:
|
||||||
|
if org_keyword.lower() in headline_lower:
|
||||||
|
is_heritage_org = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# Check heritage keywords by type (order matters - more specific first)
|
||||||
|
# 'D' (Digital) is checked last and requires heritage org validation
|
||||||
|
type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from main loop
|
||||||
|
|
||||||
for heritage_type in type_order:
|
for heritage_type in type_order:
|
||||||
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
keywords = HERITAGE_KEYWORDS.get(heritage_type, [])
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
if keyword.lower() in headline_lower:
|
if keyword.lower() in headline_lower:
|
||||||
return (True, heritage_type)
|
return (True, heritage_type)
|
||||||
|
|
||||||
# Generic heritage terms
|
# Special handling for 'D' (Digital) - ONLY if at a heritage organization
|
||||||
|
if is_heritage_org:
|
||||||
|
digital_keywords = HERITAGE_KEYWORDS.get('D', [])
|
||||||
|
for keyword in digital_keywords:
|
||||||
|
if keyword.lower() in headline_lower:
|
||||||
|
return (True, 'D')
|
||||||
|
|
||||||
|
# Generic heritage terms (without specific type)
|
||||||
generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
|
generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
|
||||||
'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation']
|
'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection']
|
||||||
for keyword in generic:
|
for keyword in generic:
|
||||||
if keyword in headline_lower:
|
if keyword in headline_lower:
|
||||||
return (True, None)
|
return (True, None)
|
||||||
|
|
|
||||||
445
scripts/scan_dutch_data_quality.py
Normal file
445
scripts/scan_dutch_data_quality.py
Normal file
|
|
@ -0,0 +1,445 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Comprehensive data quality scan for Dutch custodian YAML files.
|
||||||
|
Identifies issues like wrong GHCID types, missing web claims, Google Maps mismatches, etc.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
||||||
|
|
||||||
|
# Issue categories
|
||||||
|
issues = defaultdict(list)
|
||||||
|
|
||||||
|
def extract_ghcid_type(filename):
    """Extract type code from GHCID filename (e.g. NL-ZH-ZOE-A-SAZS -> A).

    Returns None when the filename does not start with the expected
    ``NL-<province>-<place>-<type>-`` pattern.
    """
    m = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    if m is None:
        return None
    return m.group(1)
|
||||||
|
|
||||||
|
def get_expected_type(data):
    """Determine expected GHCID type code from original_entry or other fields.

    Resolution order:
      1. ``original_entry.type`` — first element when it is a non-empty list;
         a bare string is now accepted as well (previously it was silently
         ignored and the function fell through to the next field).
      2. ``original_entry.type_organisatie`` — mapped through a small
         Dutch/English keyword table.

    Returns:
        The one-letter type code ('A', 'L', 'M', 'G'), or None when no type
        can be determined (including when original_entry is absent or None —
        the previous version raised TypeError on an explicit None value).
    """
    oe = data.get('original_entry')
    if not oe:
        return None

    types = oe.get('type')
    if types:
        if isinstance(types, list):
            return types[0]
        if isinstance(types, str):
            # Robustness fix: some records store a single string, not a list.
            return types

    type_org = oe.get('type_organisatie')
    if type_org:
        # Keyword table covering both English and Dutch organization labels.
        type_map = {
            'archive': 'A', 'archief': 'A',
            'library': 'L', 'bibliotheek': 'L',
            'museum': 'M',
            'gallery': 'G', 'galerie': 'G',
        }
        return type_map.get(type_org.lower(), None)

    return None
|
||||||
|
|
||||||
|
def check_google_maps_mismatch(data, filename):
    """Flag records whose Google Maps name barely overlaps the org name.

    Compares the word sets of the Google Maps place name and the original
    organization name (stopwords removed). Returns a mismatch dict when the
    overlap ratio falls below 0.3, otherwise None.
    """
    if 'google_maps_enrichment' not in data:
        return None

    gm = data['google_maps_enrichment']
    gm_name = gm.get('name', '')

    # Resolve the organization name: original entry first, then an optional
    # custodian_name claim overrides it.
    org_name = ''
    if 'original_entry' in data:
        org_name = data['original_entry'].get('organisatie', '')
    if 'custodian_name' in data:
        cn = data['custodian_name']
        if isinstance(cn, dict):
            org_name = cn.get('claim_value', org_name)

    if not gm_name or not org_name:
        return None

    # Tokenize and drop filler words common to Dutch institution names.
    stopwords = {'de', 'het', 'van', 'en', 'in', 'te', 'der', 'voor', 'stichting', 'vereniging'}
    gm_words = set(gm_name.lower().split()) - stopwords
    org_words = set(org_name.lower().split()) - stopwords

    if not gm_words or not org_words:
        return None

    shared = gm_words & org_words
    similarity = len(shared) / max(len(gm_words), len(org_words))

    if similarity >= 0.3:
        return None

    return {
        'google_name': gm_name,
        'org_name': org_name,
        'similarity': round(similarity, 2)
    }
|
||||||
|
|
||||||
|
def check_absolute_paths(data, filename):
    """Check for absolute paths that should be relative.

    Serializes the record to YAML and searches it for known machine-specific
    path prefixes. Returns the list of offending prefixes, or None when the
    record is clean.
    """
    dumped = yaml.dump(data, default_flow_style=False)

    # Regex patterns for machine-specific path prefixes (macOS volumes, the
    # author's home directory, Linux mounts, Windows drives).
    patterns = [
        r'/Volumes/KINGSTON/',
        r'/Users/kempersc/',
        r'/mnt/',
        r'C:\\',
        r'D:\\'
    ]

    hits = [pat.rstrip('/\\') for pat in patterns if re.search(pat, dumped)]
    return hits if hits else None
|
||||||
|
|
||||||
|
def check_web_claims(data, filename):
    """Check web claims quality.

    Returns a list of issue tags ('no_web_claims', 'empty_claims',
    'no_verified_claims', 'claims_missing_xpath:N'), or None when no
    problems are found.
    """
    if 'web_claims' not in data:
        return ['no_web_claims']

    problems = []
    wc = data['web_claims']

    # The claims list must exist and be non-empty.
    if not wc.get('claims', []):
        problems.append('empty_claims')

    if 'verified_claims' not in wc:
        problems.append('no_verified_claims')
    else:
        vc = wc['verified_claims']
        if isinstance(vc, dict):
            # Every verified claim should carry XPath provenance.
            missing = sum(
                1 for claim in vc.get('claims', [])
                if isinstance(claim, dict) and 'xpath' not in claim
            )
            if missing:
                problems.append(f'claims_missing_xpath:{missing}')

    return problems if problems else None
|
||||||
|
|
||||||
|
def check_coordinates(data, filename):
    """Check for coordinate issues.

    Returns a list of issue tags ('no_location', 'missing_coordinates',
    'coordinates_outside_netherlands', 'has_coordinate_correction'),
    or None when the location looks fine.
    """
    if 'location' not in data:
        return ['no_location']

    problems = []
    loc = data['location']
    lat = loc.get('latitude')
    lon = loc.get('longitude')

    if lat is None or lon is None:
        problems.append('missing_coordinates')
    elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3):
        # Rough Netherlands bounding box
        problems.append('coordinates_outside_netherlands')

    # A recorded previous_coordinates entry means the coordinates from Google
    # Maps were corrected at some point — worth surfacing for review.
    if 'coordinate_provenance' in loc:
        if 'previous_coordinates' in loc['coordinate_provenance']:
            problems.append('has_coordinate_correction')

    return problems if problems else None
|
||||||
|
|
||||||
|
def check_digital_platforms(data, filename):
    """Check digital platform presence on a custodian record.

    Returns ['no_digital_platforms'] when the key is absent,
    ['empty_digital_platforms'] when present but empty, or None when at
    least one platform exists.

    Fix: the original folded "missing key" and "empty list" into a single
    truthiness test, which made the subsequent ``len(platforms) == 0``
    branch unreachable — 'empty_digital_platforms' could never be
    reported. The two cases are now distinguished as clearly intended.
    """
    if 'digital_platforms' not in data:
        return ['no_digital_platforms']
    if not data['digital_platforms']:
        return ['empty_digital_platforms']
    return None
|
||||||
|
|
||||||
|
def check_identifiers(data, filename):
    """Verify that the record carries the expected identifier schemes.

    Returns a list of issue codes ('no_identifiers', 'no_isil',
    'no_ghcid'), or None when both ISIL and GHCID identifiers exist.
    """
    if 'identifiers' not in data:
        return ['no_identifiers']

    # Collect the schemes of every well-formed identifier entry.
    schemes = {
        entry.get('identifier_scheme')
        for entry in data['identifiers']
        if isinstance(entry, dict)
    }

    missing = [
        issue_code
        for scheme, issue_code in (('ISIL', 'no_isil'), ('GHCID', 'no_ghcid'))
        if scheme not in schemes
    ]
    return missing or None
|
||||||
|
|
||||||
|
def check_wikidata(data, filename):
    """Classify the Wikidata enrichment state of a record.

    Returns None when enrichment succeeded ('SUCCESS' or 'ENRICHED'),
    otherwise a short issue string describing why the record lacks
    usable Wikidata data.
    """
    if 'wikidata_enrichment' not in data:
        return 'no_wikidata_enrichment'

    status = data['wikidata_enrichment'].get('status', '')

    if status in ('SUCCESS', 'ENRICHED'):
        return None
    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    # Any other status (including empty) is surfaced verbatim for triage.
    return f'wikidata_status:{status}'
|
||||||
|
|
||||||
|
def check_url(data, filename):
    """Report URL problems for a custodian record.

    Issue codes: 'no_url' (missing/empty), 'http_not_https' (insecure
    scheme), 'has_url_correction' (the stored URL was corrected before).
    Returns a list of codes, or None when the URL looks fine.
    """
    problems = []
    website = data.get('url', '')

    if not website:
        problems.append('no_url')
    elif website.startswith('http://'):
        # Plain HTTP is flagged; institutional sites are expected on HTTPS.
        problems.append('http_not_https')

    # A url_correction block indicates the original URL was wrong at import.
    if 'url_correction' in data:
        problems.append('has_url_correction')

    return problems or None
|
||||||
|
|
||||||
|
def scan_file(filepath):
    """Scan a single custodian YAML file and collect every detected issue.

    Runs the full battery of checks (GHCID type, Google Maps name match,
    absolute paths, web claims, coordinates, digital platforms,
    identifiers, Wikidata, URL) and returns a dict keyed by issue
    category. An empty dict means the file is clean. Parse failures and
    empty files are reported as issues rather than raised.
    """
    filename = filepath.name
    file_issues = {}

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Unreadable or malformed YAML is itself a data-quality finding.
        return {'parse_error': str(e)}

    if not data:
        return {'empty_file': True}

    # 1. Check GHCID type mismatch
    ghcid_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)

    if ghcid_type and expected_type and ghcid_type != expected_type:
        # NOTE(review): the nested guard means only 'U' -> something
        # mismatches are recorded; e.g. 'A' vs expected 'M' is silently
        # ignored — confirm this narrowing is intended.
        if ghcid_type == 'U' and expected_type != 'U':
            file_issues['wrong_ghcid_type'] = {
                'current': ghcid_type,
                'expected': expected_type
            }

    # Also check for U type that should be something else
    if ghcid_type == 'U':
        file_issues['unknown_type'] = True

    # 2. Check Google Maps mismatch
    gm_mismatch = check_google_maps_mismatch(data, filename)
    if gm_mismatch:
        file_issues['google_maps_mismatch'] = gm_mismatch

    # 3. Check absolute paths
    abs_paths = check_absolute_paths(data, filename)
    if abs_paths:
        file_issues['absolute_paths'] = abs_paths

    # 4. Check web claims
    wc_issues = check_web_claims(data, filename)
    if wc_issues:
        file_issues['web_claims_issues'] = wc_issues

    # 5. Check coordinates
    coord_issues = check_coordinates(data, filename)
    if coord_issues:
        file_issues['coordinate_issues'] = coord_issues

    # 6. Check digital platforms
    dp_issues = check_digital_platforms(data, filename)
    if dp_issues:
        file_issues['digital_platform_issues'] = dp_issues

    # 7. Check identifiers
    id_issues = check_identifiers(data, filename)
    if id_issues:
        file_issues['identifier_issues'] = id_issues

    # 8. Check Wikidata
    wd_issue = check_wikidata(data, filename)
    if wd_issue:
        file_issues['wikidata_issue'] = wd_issue

    # 9. Check URL
    url_issues = check_url(data, filename)
    if url_issues:
        file_issues['url_issues'] = url_issues

    return file_issues
|
||||||
|
|
||||||
|
def main():
    """Scan all Dutch custodian files and print/save a data-quality report.

    Iterates every NL-*.yaml file under CUSTODIAN_DIR, aggregates issues
    from scan_file(), prints a summary plus a per-category breakdown of
    critical issues, and saves the full detail as YAML under
    data/reports/.

    Fix: the critical-issue detail loops bound ``filename`` but printed
    the literal placeholder "(unknown)", so the report never showed which
    files were affected; the filename is now printed.
    """
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)

    # Collect all issues
    all_issues = {}
    issue_counts = defaultdict(int)

    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)

    print(f"Found {total_files} Dutch custodian files\n")

    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)

        file_issues = scan_file(filepath)

        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues.keys():
                issue_counts[issue_type] += 1

    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)

    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)

    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)

    # Sort issues by count
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])

    for issue_type, count in sorted_issues:
        pct = (count / total_files) * 100
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")

    # Detailed breakdown for critical issues
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)

    # 1. Wrong GHCID type
    wrong_type_files = [(f, d) for f, d in all_issues.items() if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, data in wrong_type_files[:20]:
            info = data['wrong_ghcid_type']
            print(f"  {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f"  ... and {len(wrong_type_files) - 20} more")
    else:
        print("  None found")

    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items() if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, data in gm_mismatch_files[:20]:
            info = data['google_maps_mismatch']
            print(f"  {filename}")
            print(f"    Google: {info['google_name']}")
            print(f"    Org: {info['org_name']}")
            print(f"    Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f"  ... and {len(gm_mismatch_files) - 20} more")
    else:
        print("  None found")

    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items() if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, data in abs_path_files[:10]:
            print(f"  {filename}: {data['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f"  ... and {len(abs_path_files) - 10} more")
    else:
        print("  None found")

    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            print(f"  {filename}")
        if len(unknown_type_files) > 30:
            print(f"  ... and {len(unknown_type_files) - 30} more")
    else:
        print("  None found")

    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)

    # Web claims issues
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")

    # Digital platforms
    no_platforms = [f for f, d in all_issues.items()
                    if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")

    # Wikidata
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment', 'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")

    # URLs
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")

    # Save detailed report
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    report_file.parent.mkdir(exist_ok=True)

    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues
    }

    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)

    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")
|
||||||
|
|
||||||
|
# Script entry point: run the full data-quality scan.
if __name__ == '__main__':
    main()
|
||||||
199
scripts/scan_dutch_fast.py
Normal file
199
scripts/scan_dutch_fast.py
Normal file
|
|
@ -0,0 +1,199 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Fast data quality scan - optimized for speed."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Use C loader for speed
|
||||||
|
try:
|
||||||
|
from yaml import CSafeLoader as SafeLoader
|
||||||
|
except ImportError:
|
||||||
|
from yaml import SafeLoader
|
||||||
|
|
||||||
|
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
||||||
|
|
||||||
|
def extract_ghcid_type(filename):
    """Return the single-letter type code from a GHCID filename, or None.

    Filenames look like 'NL-DR-ASS-A-DA.yaml'; the fourth hyphenated
    segment is the organisation type code.
    """
    matched = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    if matched is None:
        return None
    return matched.group(1)
|
||||||
|
|
||||||
|
def scan_file_fast(filepath):
    """Fast scan of one custodian YAML file; returns a list of issue codes.

    Cheap substring checks run first on the raw text; the YAML is parsed
    only for the structural checks (type mismatch, Google Maps name
    similarity, coordinates).

    Fixes: the bare ``except:`` around the YAML parse (which also
    swallowed SystemExit/KeyboardInterrupt) is narrowed to
    ``yaml.YAMLError``, and the unused bound exception on the read
    failure is dropped.
    """
    filename = filepath.name
    issues = []

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception:
        # Unreadable file (permissions, encoding, ...) — report and stop.
        return ['parse_error']

    # Quick string-based checks first

    # Absolute paths
    if '/Volumes/KINGSTON/' in content or '/Users/kempersc/' in content:
        issues.append('absolute_paths')

    # No URL
    if '\nurl:' not in content and 'url: ' not in content[:500]:
        issues.append('no_url')

    # HTTP instead of HTTPS
    if 'url: http://' in content:
        issues.append('http_not_https')

    # No digital_platforms
    if 'digital_platforms:' not in content:
        issues.append('no_digital_platforms')
    elif 'digital_platforms: []\n' in content or 'digital_platforms:\n-' not in content:
        issues.append('empty_digital_platforms')

    # No verified_claims
    if 'verified_claims:' not in content:
        issues.append('no_verified_claims')

    # Wikidata NOT_FOUND
    if "status: NOT_FOUND" in content:
        issues.append('wikidata_not_found')
    elif 'wikidata_enrichment:' not in content:
        issues.append('no_wikidata_enrichment')

    # Unknown type in filename
    ghcid_type = extract_ghcid_type(filename)
    if ghcid_type == 'U':
        issues.append('unknown_type_U')

    # Parse YAML only for complex checks
    try:
        data = yaml.load(content, Loader=SafeLoader)
    except yaml.YAMLError:
        issues.append('yaml_parse_error')
        return issues

    if not data:
        issues.append('empty_file')
        return issues

    # Check GHCID type mismatch
    if 'original_entry' in data:
        oe = data['original_entry']
        expected = None
        if 'type' in oe and oe['type'] and isinstance(oe['type'], list):
            expected = oe['type'][0]
        elif 'type_organisatie' in oe and oe['type_organisatie']:
            type_map = {'archive': 'A', 'archief': 'A', 'library': 'L',
                        'bibliotheek': 'L', 'museum': 'M', 'gallery': 'G'}
            expected = type_map.get(oe['type_organisatie'].lower())

        if expected and ghcid_type and ghcid_type != expected:
            issues.append(f'wrong_type:{ghcid_type}→{expected}')

    # Check Google Maps mismatch
    if 'google_maps_enrichment' in data and 'original_entry' in data:
        gm_name = data['google_maps_enrichment'].get('name', '').lower()
        org_name = data['original_entry'].get('organisatie', '').lower()

        if gm_name and org_name:
            # Drop Dutch stop words before comparing name tokens.
            gm_words = set(gm_name.split()) - {'de', 'het', 'van', 'en', 'stichting'}
            org_words = set(org_name.split()) - {'de', 'het', 'van', 'en', 'stichting'}

            if gm_words and org_words:
                overlap = len(gm_words & org_words)
                similarity = overlap / max(len(gm_words), len(org_words))
                if similarity < 0.25:
                    issues.append('google_maps_mismatch')

    # Check coordinates
    if 'location' in data:
        loc = data['location']
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        if lat is None or lon is None:
            issues.append('missing_coordinates')
        elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3):
            # Rough Netherlands bounding box
            issues.append('coords_outside_NL')
    else:
        issues.append('no_location')

    return issues
|
||||||
|
|
||||||
|
def main():
    """Run the fast quality scan over all NL-*.yaml custodian files.

    Prints a bar-chart summary, details the critical issue categories,
    and writes the per-issue file lists to
    data/reports/dutch_data_quality_fast.yaml.
    """
    print(f"Fast scan started: {datetime.now().isoformat()}")

    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total = len(files)

    print(f"Scanning {total} Dutch custodian files...")

    # issue code -> count, and issue code -> list of affected filenames
    issue_counts = defaultdict(int)
    files_with_issues = defaultdict(list)

    for i, fp in enumerate(files):
        issues = scan_file_fast(fp)
        for issue in issues:
            issue_counts[issue] += 1
            files_with_issues[issue].append(fp.name)

    print(f"\nScan complete: {datetime.now().isoformat()}")
    print("\n" + "=" * 80)
    print("DATA QUALITY SUMMARY REPORT")
    print("=" * 80)
    print(f"\nTotal files: {total}")

    # Count files with any issue
    all_issue_files = set()
    for files_list in files_with_issues.values():
        all_issue_files.update(files_list)

    # NOTE(review): divides by `total` — assumes at least one NL-*.yaml
    # file exists; an empty directory would raise ZeroDivisionError.
    print(f"Files with issues: {len(all_issue_files)} ({100*len(all_issue_files)/total:.1f}%)")
    print(f"Clean files: {total - len(all_issue_files)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN")
    print("-" * 80)

    # Sort by count
    for issue, count in sorted(issue_counts.items(), key=lambda x: -x[1]):
        pct = 100 * count / total
        bar = "█" * int(pct / 2)
        print(f"{issue:35} {count:5} ({pct:5.1f}%) {bar}")

    # Critical issues detail
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES (require manual fix)")
    print("=" * 80)

    critical_issues = ['wrong_type:', 'google_maps_mismatch', 'absolute_paths', 'unknown_type_U']

    for critical in critical_issues:
        # Substring match so every 'wrong_type:X→Y' variant is picked up.
        matching = [(k, v) for k, v in files_with_issues.items() if critical in k or k == critical]
        if matching:
            for issue_key, file_list in matching:
                print(f"\n{issue_key} ({len(file_list)} files):")
                for f in file_list[:15]:
                    print(f"  - {f}")
                if len(file_list) > 15:
                    print(f"  ... and {len(file_list) - 15} more")

    # Save report
    report_path = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_fast.yaml'
    report_path.parent.mkdir(exist_ok=True)

    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total,
        'files_with_issues': len(all_issue_files),
        'issue_counts': dict(sorted(issue_counts.items(), key=lambda x: -x[1])),
        'files_by_issue': {k: v for k, v in files_with_issues.items()}
    }

    with open(report_path, 'w') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)

    print(f"\n\nFull report saved: {report_path}")
|
||||||
|
|
||||||
|
# Script entry point: run the fast scan.
if __name__ == '__main__':
    main()
|
||||||
575
scripts/transform_crawl4ai_to_digital_platform.py
Normal file
575
scripts/transform_crawl4ai_to_digital_platform.py
Normal file
|
|
@ -0,0 +1,575 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Transform crawl4ai_enrichment data into proper digital_platform YAML structure.
|
||||||
|
|
||||||
|
This script processes custodian YAML files that have crawl4ai_enrichment data
|
||||||
|
and creates/updates the digital_platform block conforming to the LinkML schema.
|
||||||
|
|
||||||
|
Schema Reference:
|
||||||
|
- DigitalPlatform: schemas/20251121/linkml/modules/classes/DigitalPlatform.yaml
|
||||||
|
- AuxiliaryDigitalPlatform: schemas/20251121/linkml/modules/classes/AuxiliaryDigitalPlatform.yaml
|
||||||
|
- DigitalPlatformTypeEnum: schemas/20251121/linkml/modules/enums/DigitalPlatformTypeEnum.yaml
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/transform_crawl4ai_to_digital_platform.py [--dry-run] [--file FILE]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import unquote, urlparse
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler(sys.stdout),
|
||||||
|
logging.FileHandler(f'logs/transform_digital_platform_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
|
||||||
|
]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Mapping from crawl4ai detected_catalog_urls type to DigitalPlatformTypeEnum
# and to the appropriate slot (collection_web_addresses or inventory_web_addresses).
# Keys are the Dutch keyword categories emitted by the crawler
# (presumably derived from URL path segments — TODO confirm against crawler output).
CATALOG_TYPE_MAPPING = {
    # Image collections → collection_web_addresses
    'beeldbank': {
        'platform_types': ['PHOTOGRAPH_COLLECTION'],
        'slot': 'collection_web_addresses',
        'description': 'Image/photograph collection'
    },
    # Genealogy → collection_web_addresses (specialized database)
    'genealogie': {
        'platform_types': ['GENEALOGY_DATABASE'],
        'slot': 'collection_web_addresses',
        'description': 'Genealogy records database'
    },
    # Archives/inventories → inventory_web_addresses
    'archieven': {
        'platform_types': ['ARCHIVES_PORTAL'],
        'slot': 'inventory_web_addresses',
        'description': 'Archival finding aids and inventories'
    },
    'inventaris': {
        'platform_types': ['ARCHIVES_PORTAL'],
        'slot': 'inventory_web_addresses',
        'description': 'Archival inventory'
    },
    # Collections → collection_web_addresses
    'collectie': {
        'platform_types': ['ONLINE_DATABASE'],
        'slot': 'collection_web_addresses',
        'description': 'General collection access'
    },
    # Library → collection_web_addresses
    'bibliotheek': {
        'platform_types': ['DIGITAL_LIBRARY'],
        'slot': 'collection_web_addresses',
        'description': 'Library catalog'
    },
    # Search interfaces → collection_web_addresses
    'zoeken': {
        'platform_types': ['ONLINE_DATABASE'],
        'slot': 'collection_web_addresses',
        'description': 'Search interface'
    },
    # Kranten (newspapers) → collection_web_addresses
    'kranten': {
        'platform_types': ['ONLINE_NEWS_ARCHIVE'],
        'slot': 'collection_web_addresses',
        'description': 'Historical newspapers'
    },
}
|
||||||
|
|
||||||
|
# Mapping for external archive platforms to AuxiliaryDigitalPlatformTypeEnum.
# Keys are matched as substrings against both the crawler's platform key and
# the URL (see transform_external_platforms), so partial keys like
# 'regionaalarchief' intentionally match many hosts.
EXTERNAL_PLATFORM_MAPPING = {
    'archieven.nl': {
        'platform_name': 'Archieven.nl',
        'auxiliary_platform_type': 'AGGREGATOR',
        'description': 'National Dutch archives aggregator'
    },
    'archiefweb.eu': {
        'platform_name': 'Archiefweb.eu',
        'auxiliary_platform_type': 'ARCHIVAL_REPOSITORY',
        'description': 'Web archiving service'
    },
    'memorix.nl': {
        'platform_name': 'Memorix',
        'auxiliary_platform_type': 'DIGITAL_ARCHIVE',
        'description': 'Heritage information management platform'
    },
    'opendata.archieven.nl': {
        'platform_name': 'Open Data Archieven.nl',
        'auxiliary_platform_type': 'OPEN_DATA_PORTAL',
        'description': 'Open data from Dutch archives'
    },
    'regionaalarchief': {
        'platform_name': 'Regionaal Archief',
        'auxiliary_platform_type': 'ARCHIVES_PORTAL',
        'description': 'Regional archive portal'
    },
    'delpher.nl': {
        'platform_name': 'Delpher',
        'auxiliary_platform_type': 'DIGITAL_LIBRARY',
        'description': 'KB digitized newspapers, books, and periodicals'
    },
    'wiewaswie.nl': {
        'platform_name': 'WieWasWie',
        'auxiliary_platform_type': 'GENEALOGY_DATABASE',
        'description': 'Dutch genealogy database'
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url(url: str) -> str:
    """Return a canonical form of *url*.

    Percent-encoding is decoded, the query string and fragment are
    dropped, and a trailing slash is removed unless the path is just the
    root. Empty/falsy input is returned unchanged.
    """
    if not url:
        return url

    parts = urlparse(unquote(url))
    normalized = f"{parts.scheme}://{parts.netloc}{parts.path}"

    # Strip a trailing slash except on the bare root path ("/").
    if normalized.endswith('/') and len(parts.path) > 1:
        normalized = normalized[:-1]

    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def extract_base_path_key(url: str) -> str:
    """Build a scheme-less deduplication key: host + path, no trailing slash."""
    pieces = urlparse(url)
    key = pieces.netloc + pieces.path
    return key.rstrip('/')
|
||||||
|
|
||||||
|
|
||||||
|
def deduplicate_catalog_urls(catalog_urls: list[dict]) -> list[dict]:
    """Collapse catalog URL entries to one representative per (base path, type).

    Entries carrying an 'xpath' provenance beat those without; ties go to
    the shorter URL. The chosen entry's URL is normalized before return.
    Returns an empty list for empty input.
    """
    if not catalog_urls:
        return []

    # Bucket entries by scheme-less base path plus detected type.
    buckets: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for item in catalog_urls:
        bucket_key = (extract_base_path_key(item.get('url', '')),
                      item.get('type', 'unknown'))
        buckets[bucket_key].append(item)

    # Ranking: xpath-backed entries first, then shortest URL.
    def rank(entry: dict) -> tuple[int, int]:
        return (0 if entry.get('xpath') else 1, len(entry.get('url', '')))

    winners = []
    for bucket in buckets.values():
        chosen = min(bucket, key=rank).copy()
        chosen['url'] = normalize_url(chosen['url'])
        winners.append(chosen)

    return winners
|
||||||
|
|
||||||
|
|
||||||
|
def generate_platform_id(ghcid: str) -> str:
    """Derive the platform_id URI for a custodian's website from its GHCID."""
    slug = ghcid.lower().replace('_', '-')
    return 'https://nde.nl/ontology/hc/platform/' + slug + '-website'
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ghcid_from_file(file_path: Path) -> str | None:
|
||||||
|
"""Extract GHCID from filename."""
|
||||||
|
stem = file_path.stem
|
||||||
|
# GHCID pattern: CC-RR-CCC-T-ABBREV (e.g., NL-DR-ASS-A-DA)
|
||||||
|
if re.match(r'^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{3}-[A-Z]-', stem):
|
||||||
|
return stem
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def determine_platform_types(catalog_urls: list[dict]) -> list[str]:
    """Map detected catalog URL types onto DigitalPlatformTypeEnum values.

    INSTITUTIONAL_WEBSITE is always included; ONLINE_DATABASE is added as
    a fallback when catalog URLs exist but none map to a specific type.
    The result is sorted for deterministic output.
    """
    detected = {
        platform_type
        for entry in catalog_urls
        for platform_type in CATALOG_TYPE_MAPPING.get(
            entry.get('type', ''), {}).get('platform_types', [])
    }

    # Generic fallback: catalogs were seen but none were recognized.
    if catalog_urls and not detected:
        detected.add('ONLINE_DATABASE')

    detected.add('INSTITUTIONAL_WEBSITE')
    return sorted(detected)
|
||||||
|
|
||||||
|
|
||||||
|
def categorize_urls_by_slot(catalog_urls: list[dict]) -> dict[str, list[str]]:
    """Split catalog URLs into collection vs inventory web-address slots.

    Each distinct URL is assigned once (first occurrence wins) to the
    slot CATALOG_TYPE_MAPPING dictates for its type; unknown types default
    to collection_web_addresses.
    """
    buckets: dict[str, list[str]] = {
        'collection_web_addresses': [],
        'inventory_web_addresses': []
    }
    assigned = set()

    for entry in catalog_urls:
        address = entry.get('url', '')
        if not address or address in assigned:
            continue

        type_config = CATALOG_TYPE_MAPPING.get(entry.get('type', ''), {})
        target_slot = type_config.get('slot', 'collection_web_addresses')

        buckets[target_slot].append(address)
        assigned.add(address)

    return buckets
|
||||||
|
|
||||||
|
|
||||||
|
def transform_external_platforms(external_platforms: list[dict]) -> list[dict]:
    """Convert external_archive_platforms entries to auxiliary_platforms.

    Each platform key is emitted at most once. Known platforms (matched
    by substring against EXTERNAL_PLATFORM_MAPPING keys in either the
    platform key or the URL) get their curated name/type; anything else
    falls back to a generic WEB_PORTAL entry.
    """
    if not external_platforms:
        return []

    result = []
    emitted_keys = set()

    for record in external_platforms:
        link = record.get('url', '')
        key = record.get('platform', '')
        if not link or key in emitted_keys:
            continue

        # Match against the known-platform table by key or URL substring.
        config = next(
            (cfg for pattern, cfg in EXTERNAL_PLATFORM_MAPPING.items()
             if pattern in key or pattern in link),
            None,
        )
        if config is None:
            # Unknown platform: synthesize a generic portal entry.
            config = {
                'platform_name': key.replace('.', ' ').title() if key else 'External Platform',
                'auxiliary_platform_type': 'WEB_PORTAL',
                'description': 'External heritage platform'
            }

        result.append({
            'platform_name': config['platform_name'],
            'platform_url': link,
            'auxiliary_platform_type': config['auxiliary_platform_type'],
            'platform_purpose': config.get('description', '')
        })
        emitted_keys.add(key)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_platform_name(data: dict, ghcid: str) -> str:
    """Pick the best human-readable platform name for a custodian website.

    Priority: custodian_name (emic_name, then name) → top-level name →
    cleaned crawl4ai page title → GHCID fallback. The result always ends
    in " Website".
    """
    custodian = data.get('custodian_name', {})
    if isinstance(custodian, dict):
        preferred = custodian.get('emic_name') or custodian.get('name')
        if preferred:
            return f"{preferred} Website"

    top_level = data.get('name')
    if top_level:
        return f"{top_level} Website"

    page_title = data.get('crawl4ai_enrichment', {}).get('title', '')
    if page_title:
        # Strip trailing "- Site Name" / "| Site Name" style suffixes.
        stripped = re.sub(r'\s*[-–|]\s*.+$', '', page_title).strip()
        if len(stripped) > 3:
            return f"{stripped} Website"

    return f"{ghcid} Website"
|
||||||
|
|
||||||
|
|
||||||
|
def transform_crawl4ai_to_digital_platform(data: dict, ghcid: str) -> dict | None:
    """
    Transform crawl4ai_enrichment into digital_platform structure.

    Args:
        data: Full custodian YAML data
        ghcid: Global Heritage Custodian Identifier

    Returns:
        digital_platform dict, or None if there is no usable
        crawl4ai_enrichment (missing section, failed HTTP fetch,
        or no source URL).
    """
    crawl4ai = data.get('crawl4ai_enrichment')
    if not crawl4ai:
        return None

    # Skip failed fetches - accept 2xx and 3xx status codes
    status_code = crawl4ai.get('status_code')
    if status_code is None or status_code >= 400:
        logger.debug(f"Skipping {ghcid}: HTTP status {status_code}")
        return None

    source_url = crawl4ai.get('source_url', '')
    if not source_url:
        return None

    # Get and deduplicate catalog URLs
    # (`or []` guards against an explicit null value in the YAML)
    catalog_urls = crawl4ai.get('detected_catalog_urls') or []
    deduped_catalogs = deduplicate_catalog_urls(catalog_urls)

    # Determine platform types
    platform_types = determine_platform_types(deduped_catalogs)

    # Categorize URLs by slot
    url_slots = categorize_urls_by_slot(deduped_catalogs)

    # Transform external platforms
    external_platforms = crawl4ai.get('external_archive_platforms') or []
    auxiliary_platforms = transform_external_platforms(external_platforms)

    # Build digital_platform structure
    digital_platform = {
        'platform_id': generate_platform_id(ghcid),
        'platform_name': get_platform_name(data, ghcid),
        'homepage_web_address': source_url,
        'refers_to_custodian': f"https://nde.nl/ontology/hc/{ghcid.lower()}"
    }

    # Add platform types when any were detected.
    # NOTE: the previous if/elif pair here had byte-identical branches
    # (both assigned platform_types), so a single truthiness check is
    # equivalent and clearer.
    if platform_types:
        digital_platform['platform_type'] = platform_types

    # Add collection URLs
    if url_slots['collection_web_addresses']:
        digital_platform['collection_web_addresses'] = url_slots['collection_web_addresses']

    # Add inventory URLs
    if url_slots['inventory_web_addresses']:
        digital_platform['inventory_web_addresses'] = url_slots['inventory_web_addresses']

    # Add auxiliary platforms
    if auxiliary_platforms:
        digital_platform['auxiliary_platforms'] = auxiliary_platforms

    # Add transformation metadata (provenance plus dedup statistics)
    digital_platform['_transformation_metadata'] = {
        'source': 'crawl4ai_enrichment',
        'transformation_date': datetime.now(timezone.utc).isoformat(),
        'catalog_urls_original': len(catalog_urls),
        'catalog_urls_deduplicated': len(deduped_catalogs),
        'external_platforms_count': len(external_platforms)
    }

    return digital_platform
||||||
|
def process_file(file_path: Path, dry_run: bool = False) -> dict:
    """
    Process a single custodian YAML file.

    Loads the YAML, checks for a crawl4ai_enrichment section, transforms it
    into a digital_platform_v2 block, and (unless dry_run) writes the file
    back in place. Files that already carry digital_platform_v2 are left
    untouched.

    Args:
        file_path: Path to the custodian YAML file.
        dry_run: When True, report what would change without writing.

    Returns:
        dict with processing statistics
    """
    result = {
        'file': str(file_path.name),
        'status': 'skipped',
        'has_crawl4ai': False,
        'has_digital_platform': False,
        'catalog_urls': 0,
        'external_platforms': 0
    }

    try:
        # Read YAML file
        with file_path.open('r', encoding='utf-8') as handle:
            doc = yaml.safe_load(handle)

        if not doc:
            result['status'] = 'empty'
            return result

        # Extract GHCID
        ghcid = extract_ghcid_from_file(file_path)
        if not ghcid:
            result['status'] = 'no_ghcid'
            return result

        # Check for crawl4ai_enrichment
        enrichment = doc.get('crawl4ai_enrichment')
        if not enrichment:
            result['status'] = 'no_crawl4ai'
            return result

        result['has_crawl4ai'] = True
        result['catalog_urls'] = len(enrichment.get('detected_catalog_urls', []))
        result['external_platforms'] = len(enrichment.get('external_archive_platforms', []))

        # Check if digital_platform_v2 already exists (avoid overwriting)
        if 'digital_platform_v2' in doc:
            result['has_digital_platform'] = True
            result['status'] = 'already_transformed'
            return result

        # Transform to digital_platform
        platform = transform_crawl4ai_to_digital_platform(doc, ghcid)
        if not platform:
            result['status'] = 'transform_failed'
            return result

        # Store under digital_platform_v2 (to distinguish from any existing digital_platform)
        doc['digital_platform_v2'] = platform

        if dry_run:
            result['status'] = 'would_transform'
            logger.info(f"[DRY-RUN] Would transform {file_path.name}")
            logger.debug(f" Platform types: {platform.get('platform_type', [])}")
            logger.debug(f" Collection URLs: {len(platform.get('collection_web_addresses', []))}")
            logger.debug(f" Inventory URLs: {len(platform.get('inventory_web_addresses', []))}")
            logger.debug(f" Auxiliary platforms: {len(platform.get('auxiliary_platforms', []))}")
        else:
            # Write back to file
            with file_path.open('w', encoding='utf-8') as handle:
                yaml.dump(doc, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)
            result['status'] = 'transformed'

        return result

    except yaml.YAMLError as e:
        logger.error(f"YAML error in {file_path.name}: {e}")
        result['status'] = 'yaml_error'
        return result
    except Exception as e:
        logger.error(f"Error processing {file_path.name}: {e}")
        result['status'] = 'error'
        return result
||||||
|
def main():
    """CLI entry point: transform crawl4ai_enrichment blocks into
    digital_platform_v2 across the custodian YAML files."""
    parser = argparse.ArgumentParser(
        description='Transform crawl4ai_enrichment to digital_platform structure'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Process a single file instead of all NL-*.yaml files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging'
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Ensure logs directory exists
    Path('logs').mkdir(exist_ok=True)

    # Resolve the set of files to process
    data_dir = Path('data/custodian')
    if args.file:
        if not args.file.exists():
            logger.error(f"File not found: {args.file}")
            sys.exit(1)
        files = [args.file]
    else:
        files = sorted(data_dir.glob('NL-*.yaml'))

    logger.info(f"Processing {len(files)} files...")
    if args.dry_run:
        logger.info("DRY-RUN MODE - no files will be modified")

    # Process files, tallying per-status counts and URL totals
    stats_summary = defaultdict(int)
    total_catalog_urls = 0
    total_external_platforms = 0

    for position, file_path in enumerate(files, start=1):
        # Periodic progress heartbeat
        if position % 100 == 0:
            logger.info(f"Progress: {position}/{len(files)} files processed")

        stats = process_file(file_path, dry_run=args.dry_run)
        stats_summary[stats['status']] += 1
        total_catalog_urls += stats.get('catalog_urls', 0)
        total_external_platforms += stats.get('external_platforms', 0)

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("TRANSFORMATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total files processed: {len(files)}")
    for status, count in sorted(stats_summary.items()):
        logger.info(f" {status}: {count}")
    logger.info(f"\nTotal catalog URLs found: {total_catalog_urls}")
    logger.info(f"Total external platforms found: {total_external_platforms}")

    if args.dry_run:
        logger.info("\n[DRY-RUN] No files were modified. Run without --dry-run to apply changes.")
|
# Script entry point: run the transformation CLI when executed directly.
if __name__ == '__main__':
    main()
Loading…
Reference in a new issue