From 0c36429257bb946e828a2259d2a1ccfbd68221f4 Mon Sep 17 00:00:00 2001 From: kempersc Date: Mon, 15 Dec 2025 01:47:46 +0100 Subject: [PATCH] feat(scripts): Add batch crawling and data quality scripts - batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation --- scripts/batch_crawl4ai_recrawl.py | 371 ++++++++++ scripts/batch_firecrawl_recrawl.py | 434 ++++++++++++ scripts/batch_httpx_scrape.py | 488 +++++++++++++ scripts/detect_name_mismatch.py | 213 ++++++ scripts/enrich_dutch_custodians_crawl4ai.py | 666 ++++++++++++++++++ scripts/fix_collision_victims.py | 281 ++++++++ scripts/fix_generic_platform_names.py | 140 ++++ scripts/fix_generic_platform_names_fast.py | 97 +++ scripts/fix_ghcid_type.py | 523 ++++++++++++++ scripts/fix_simon_kemper_contamination.py | 269 +++++++ scripts/parse_linkedin_connections.py | 89 ++- scripts/parse_linkedin_html.py | 102 ++- scripts/scan_dutch_data_quality.py | 445 ++++++++++++ scripts/scan_dutch_fast.py | 199 ++++++ .../transform_crawl4ai_to_digital_platform.py | 575 +++++++++++++++ 15 files changed, 4881 insertions(+), 11 deletions(-) create mode 100644 scripts/batch_crawl4ai_recrawl.py create mode 100644 scripts/batch_firecrawl_recrawl.py create mode 100644 scripts/batch_httpx_scrape.py create mode 100644 scripts/detect_name_mismatch.py create mode 100755 scripts/enrich_dutch_custodians_crawl4ai.py create mode 100644 scripts/fix_collision_victims.py create mode 100755 
scripts/fix_generic_platform_names.py create mode 100755 scripts/fix_generic_platform_names_fast.py create mode 100644 scripts/fix_ghcid_type.py create mode 100644 scripts/fix_simon_kemper_contamination.py create mode 100644 scripts/scan_dutch_data_quality.py create mode 100644 scripts/scan_dutch_fast.py create mode 100644 scripts/transform_crawl4ai_to_digital_platform.py diff --git a/scripts/batch_crawl4ai_recrawl.py b/scripts/batch_crawl4ai_recrawl.py new file mode 100644 index 0000000000..2e85cce3b9 --- /dev/null +++ b/scripts/batch_crawl4ai_recrawl.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +""" +Batch re-crawl failed URLs using crawl4ai (free, local) and transform to digital_platform_v2. + +This script: +1. Reads the list of failed crawl URLs +2. Uses crawl4ai to fetch content (free, no API limits) +3. Transforms results to digital_platform_v2 format +4. Updates the custodian YAML files + +Usage: + python scripts/batch_crawl4ai_recrawl.py --limit 100 --start 0 +""" + +import argparse +import asyncio +import re +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +import yaml +from crawl4ai import AsyncWebCrawler + +# Configuration +CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") +FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt") + +# Platform type detection patterns +PLATFORM_PATTERNS = { + 'DISCOVERY_PORTAL': [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/archief', r'/archive', + r'/beeldbank', r'/images', r'/foto', r'/photo', + ], + 'DIGITAL_ARCHIVE': [ + r'archieven\.nl', r'archief', r'archive', + r'/inventaris', r'/inventory', r'/toegang', + ], + 'EDUCATION': [ + r'/educatie', r'/education', r'/onderwijs', r'/leren', + r'/scholen', r'/schools', r'/lesmateriaal', + ], + 'INSTITUTIONAL_WEBSITE': [ + r'/over-ons', r'/about', r'/contact', r'/bezoek', + r'/visit', r'/openingstijden', 
r'/hours', + ], +} + + +def detect_platform_type(url: str, links: list[str] | None = None) -> str: + """Detect the platform type based on URL patterns and extracted links.""" + url_lower = url.lower() + all_urls = [url_lower] + [l.lower() for l in (links or [])] + + for platform_type, patterns in PLATFORM_PATTERNS.items(): + for pattern in patterns: + for check_url in all_urls: + if re.search(pattern, check_url): + return platform_type + + return 'INSTITUTIONAL_WEBSITE' + + +def extract_collection_urls(links: list[str], base_url: str) -> list[str]: + """Extract URLs that appear to be collection/catalog pages.""" + collection_patterns = [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/beeldbank', r'/inventaris', + r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen', + ] + + collection_urls = [] + base_domain = urlparse(base_url).netloc + + for link in links: + try: + parsed = urlparse(link) + if base_domain in parsed.netloc or parsed.netloc in base_domain: + for pattern in collection_patterns: + if re.search(pattern, link.lower()): + if link not in collection_urls: + collection_urls.append(link) + break + except Exception: + continue + + return collection_urls[:10] + + +def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict]: + """Extract external platform links (aggregators, portals, etc.).""" + external_patterns = { + 'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'}, + 'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'}, + 'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'}, + 'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'}, + 'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'}, + 'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'}, + } + + base_domain = urlparse(base_url).netloc + auxiliary = [] + seen_domains = set() + + for link in links: + try: + parsed = urlparse(link) + domain = 
parsed.netloc.replace('www.', '') + + if base_domain in domain or domain in base_domain: + continue + + for pattern, info in external_patterns.items(): + if pattern in domain and domain not in seen_domains: + seen_domains.add(domain) + auxiliary.append({ + 'platform_name': info['name'], + 'platform_url': link, + 'platform_type': info['type'], + 'integration_type': 'external_aggregator', + }) + break + except Exception: + continue + + return auxiliary[:5] + + +def is_generic_title(title: str) -> bool: + """Check if a title is too generic to use as platform name.""" + generic_patterns = [ + 'home', 'homepage', 'welkom', 'welcome', 'startpagina', + 'index', 'main', 'website', 'webpagina', 'homepagina', + ] + if not title: + return True + title_lower = title.lower().strip() + for pattern in generic_patterns: + if title_lower == pattern or title_lower.startswith(f"{pattern} -") or title_lower.startswith(f"{pattern} |"): + return True + return len(title) < 3 + + +def transform_to_platform_v2(crawl_result, source_url: str, org_name: str) -> dict[str, Any]: + """Transform crawl4ai result to digital_platform_v2 format.""" + metadata = crawl_result.metadata or {} + + # Get internal links + internal_links = [] + if crawl_result.links: + internal_links = [l.get('href', '') for l in crawl_result.links.get('internal', []) if l.get('href')] + + # Extract title, checking for generic titles + candidate_titles = [ + metadata.get('og:title'), + metadata.get('title', '').split(' - ')[0].strip(), + metadata.get('title', '').split(' | ')[0].strip(), + metadata.get('og:site_name'), + ] + + title = org_name # Default fallback + for candidate in candidate_titles: + if candidate and not is_generic_title(candidate): + title = candidate + break + + # Generate platform ID + domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_') + platform_id = f"primary_website_{domain}" + + # Detect platform type + platform_type = detect_platform_type(source_url, internal_links) + + # 
Extract collection URLs + collection_urls = extract_collection_urls(internal_links, source_url) + + # Extract auxiliary platforms + auxiliary_platforms = extract_auxiliary_platforms(internal_links, source_url) + + # Build digital_platform_v2 structure + platform_v2: dict[str, Any] = { + 'transformation_metadata': { + 'transformed_from': 'crawl4ai_recrawl', + 'transformation_date': datetime.now(timezone.utc).isoformat(), + 'transformation_version': '2.0', + 'source_status_code': crawl_result.status_code, + }, + 'primary_platform': { + 'platform_id': platform_id, + 'platform_name': f"{title} Website" if 'website' not in title.lower() else title, + 'platform_url': source_url, + 'platform_type': platform_type, + 'description': metadata.get('description') or metadata.get('og:description', ''), + 'language': metadata.get('language', 'nl'), + 'og_image': metadata.get('og:image'), + 'favicon': metadata.get('favicon'), + }, + } + + if collection_urls: + platform_v2['primary_platform']['collection_urls'] = collection_urls + + if auxiliary_platforms: + platform_v2['auxiliary_platforms'] = auxiliary_platforms + + if internal_links: + platform_v2['navigation_links'] = internal_links[:20] + + return platform_v2 + + +def update_custodian_file(filepath: Path, platform_v2: dict) -> bool: + """Update a custodian YAML file with digital_platform_v2 data.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + if data is None: + data = {} + + data['digital_platform_v2'] = platform_v2 + + if 'crawl4ai_enrichment' in data: + data['crawl4ai_enrichment']['recrawled_with'] = 'crawl4ai_v2' + data['crawl4ai_enrichment']['recrawl_date'] = datetime.now(timezone.utc).isoformat() + + with open(filepath, 'w') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + return True + + except Exception as e: + print(f" Error updating {filepath}: {e}") + return False + + +def load_failed_urls() -> list[tuple[str, str]]: + """Load the list of failed 
URLs with their file paths.""" + urls = [] + with open(FAILED_URLS_FILE, 'r') as f: + for line in f: + line = line.strip() + if '\t' in line: + filename, url = line.split('\t', 1) + urls.append((filename, url)) + return urls + + +def get_org_name(filepath: Path) -> str: + """Extract organization name from custodian file.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + if data: + if 'original_entry' in data and data['original_entry'].get('organisatie'): + return data['original_entry']['organisatie'] + if 'custodian_name' in data: + return data['custodian_name'].get('emic_name', '') or data['custodian_name'].get('preferred_name', '') + if 'name' in data: + return data['name'] + + stem = filepath.stem + parts = stem.split('-') + return parts[-1] if parts else stem + + except Exception: + return filepath.stem + + +async def scrape_single_url(crawler: AsyncWebCrawler, url: str) -> Any: + """Scrape a single URL using crawl4ai.""" + try: + result = await crawler.arun(url, verbose=False) + if result.success: + return result + print(f" Crawl failed: {result.error_message}") + return None + except Exception as e: + print(f" Exception: {e}") + return None + + +async def main_async(args): + """Async main function.""" + all_urls = load_failed_urls() + print(f"Loaded {len(all_urls)} failed URLs") + + if args.limit > 0: + urls_to_process = all_urls[args.start:args.start + args.limit] + else: + urls_to_process = all_urls[args.start:] + + print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})") + + if args.dry_run: + print("\n[DRY RUN MODE - No changes will be made]") + for filename, url in urls_to_process[:10]: + print(f" Would scrape: {filename} -> {url}") + print(f" ... 
and {len(urls_to_process) - 10} more") + return + + success_count = 0 + fail_count = 0 + skip_count = 0 + + async with AsyncWebCrawler(verbose=False) as crawler: + for i, (filename, url) in enumerate(urls_to_process): + filepath = CUSTODIAN_DIR / filename + + print(f"\n[{i+1}/{len(urls_to_process)}] {filename}") + print(f" URL: {url}") + + if not filepath.exists(): + print(f" SKIP: File not found") + skip_count += 1 + continue + + # Check if already has digital_platform_v2 + with open(filepath, 'r') as f: + content = f.read() + if 'digital_platform_v2:' in content: + print(f" SKIP: Already has digital_platform_v2") + skip_count += 1 + continue + + org_name = get_org_name(filepath) + + result = await scrape_single_url(crawler, url) + + if result: + platform_v2 = transform_to_platform_v2(result, url, org_name) + + if update_custodian_file(filepath, platform_v2): + success_count += 1 + print(f" SUCCESS: {platform_v2['primary_platform']['platform_name']}") + else: + fail_count += 1 + else: + fail_count += 1 + print(f" FAILED: Could not scrape URL") + + # Small delay to be polite + await asyncio.sleep(args.delay) + + if (i + 1) % 50 == 0: + print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, skip={skip_count}, fail={fail_count}) ===\n") + + print(f"\n=== Final Results ===") + print(f"Success: {success_count}") + print(f"Skipped: {skip_count}") + print(f"Failed: {fail_count}") + print(f"Total: {len(urls_to_process)}") + + +def main(): + parser = argparse.ArgumentParser(description='Batch re-crawl failed URLs with crawl4ai') + parser.add_argument('--start', type=int, default=0, help='Starting index') + parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)') + parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes') + parser.add_argument('--delay', type=float, default=0.5, help='Delay between requests in seconds') + args = parser.parse_args() + + 
asyncio.run(main_async(args)) + + +if __name__ == '__main__': + main() diff --git a/scripts/batch_firecrawl_recrawl.py b/scripts/batch_firecrawl_recrawl.py new file mode 100644 index 0000000000..de4da5641b --- /dev/null +++ b/scripts/batch_firecrawl_recrawl.py @@ -0,0 +1,434 @@ +#!/usr/bin/env python3 +""" +Batch re-crawl failed URLs using Firecrawl and transform to digital_platform_v2. + +This script: +1. Reads the list of failed crawl URLs +2. Uses Firecrawl batch_scrape or individual scrape to fetch content +3. Transforms results to digital_platform_v2 format +4. Updates the custodian YAML files + +Usage: + python scripts/batch_firecrawl_recrawl.py --batch-size 50 --start 0 + +Firecrawl API reference: https://docs.firecrawl.dev/api-reference/endpoint/scrape +""" + +import argparse +import json +import os +import re +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +import httpx +import yaml + +# Configuration +CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") +FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt") +FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY", "") +FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v1" + +# Platform type detection patterns +PLATFORM_PATTERNS = { + 'DISCOVERY_PORTAL': [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/archief', r'/archive', + r'/beeldbank', r'/images', r'/foto', r'/photo', + ], + 'DIGITAL_ARCHIVE': [ + r'archieven\.nl', r'archief', r'archive', + r'/inventaris', r'/inventory', r'/toegang', + ], + 'EDUCATION': [ + r'/educatie', r'/education', r'/onderwijs', r'/leren', + r'/scholen', r'/schools', r'/lesmateriaal', + ], + 'INSTITUTIONAL_WEBSITE': [ + r'/over-ons', r'/about', r'/contact', r'/bezoek', + r'/visit', r'/openingstijden', r'/hours', + ], +} + + +def detect_platform_type(url: str, links: list[str] | None = None) -> str: 
+ """Detect the platform type based on URL patterns and extracted links.""" + url_lower = url.lower() + all_urls = [url_lower] + [l.lower() for l in (links or [])] + + for platform_type, patterns in PLATFORM_PATTERNS.items(): + for pattern in patterns: + for check_url in all_urls: + if re.search(pattern, check_url): + return platform_type + + return 'INSTITUTIONAL_WEBSITE' + + +def extract_collection_urls(links: list[str], base_url: str) -> list[str]: + """Extract URLs that appear to be collection/catalog pages.""" + collection_patterns = [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/beeldbank', r'/inventaris', + r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen', + ] + + collection_urls = [] + base_domain = urlparse(base_url).netloc + + for link in links: + try: + parsed = urlparse(link) + # Only include links from same domain or subdomains + if base_domain in parsed.netloc or parsed.netloc in base_domain: + for pattern in collection_patterns: + if re.search(pattern, link.lower()): + if link not in collection_urls: + collection_urls.append(link) + break + except Exception: + continue + + return collection_urls[:10] # Limit to 10 collection URLs + + +def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict]: + """Extract external platform links (aggregators, portals, etc.).""" + external_patterns = { + 'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'}, + 'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'}, + 'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'}, + 'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'}, + 'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'}, + 'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'}, + 'archiefweb.eu': {'name': 'Archiefweb', 'type': 'DIGITAL_ARCHIVE'}, + } + + base_domain = urlparse(base_url).netloc + auxiliary = [] + seen_domains = set() + + for link in links: 
+ try: + parsed = urlparse(link) + domain = parsed.netloc.replace('www.', '') + + # Skip if same domain as base URL + if base_domain in domain or domain in base_domain: + continue + + # Check for known external platforms + for pattern, info in external_patterns.items(): + if pattern in domain and domain not in seen_domains: + seen_domains.add(domain) + auxiliary.append({ + 'platform_name': info['name'], + 'platform_url': link, + 'platform_type': info['type'], + 'integration_type': 'external_aggregator', + }) + break + except Exception: + continue + + return auxiliary[:5] # Limit to 5 auxiliary platforms + + +def is_generic_title(title: str) -> bool: + """Check if a title is too generic to use as platform name.""" + generic_patterns = [ + 'home', 'homepage', 'welkom', 'welcome', 'startpagina', + 'index', 'main', 'website', 'webpagina', 'web page', + ] + if not title: + return True + title_lower = title.lower().strip() + # Check if title is just one of the generic patterns + for pattern in generic_patterns: + if title_lower == pattern or title_lower == f"{pattern} -" or title_lower.startswith(f"{pattern} |"): + return True + return len(title) < 3 + + +def transform_to_platform_v2(scrape_result: dict, source_url: str, org_name: str) -> dict[str, Any]: + """Transform Firecrawl scrape result to digital_platform_v2 format.""" + metadata = scrape_result.get('metadata', {}) + links = scrape_result.get('links', []) + markdown = scrape_result.get('markdown', '') + + # Extract title from metadata, checking for generic titles + candidate_titles = [ + metadata.get('ogTitle'), + metadata.get('title', '').split(' - ')[0].strip(), + metadata.get('title', '').split(' | ')[0].strip(), + metadata.get('og:title'), + metadata.get('ogSiteName'), + metadata.get('og:site_name'), + ] + + # Find first non-generic title + title = org_name # Default fallback + for candidate in candidate_titles: + if candidate and not is_generic_title(candidate): + title = candidate + break + + # Generate 
platform ID + domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_') + platform_id = f"primary_website_{domain}" + + # Detect platform type + platform_type = detect_platform_type(source_url, links) + + # Extract collection URLs + collection_urls = extract_collection_urls(links, source_url) + + # Extract auxiliary platforms + auxiliary_platforms = extract_auxiliary_platforms(links, source_url) + + # Build digital_platform_v2 structure + platform_v2 = { + 'transformation_metadata': { + 'transformed_from': 'firecrawl_scrape', + 'transformation_date': datetime.now(timezone.utc).isoformat(), + 'transformation_version': '2.0', + 'source_status_code': metadata.get('statusCode', 200), + }, + 'primary_platform': { + 'platform_id': platform_id, + 'platform_name': f"{title} Website" if 'website' not in title.lower() else title, + 'platform_url': source_url, + 'platform_type': platform_type, + 'description': metadata.get('description') or metadata.get('ogDescription', ''), + 'language': metadata.get('language', 'nl'), + 'og_image': metadata.get('ogImage') or metadata.get('og:image'), + 'favicon': metadata.get('favicon'), + }, + } + + # Add collection URLs if found + if collection_urls: + platform_v2['primary_platform']['collection_urls'] = collection_urls + + # Add auxiliary platforms if found + if auxiliary_platforms: + platform_v2['auxiliary_platforms'] = auxiliary_platforms + + # Add internal navigation links (sample) + internal_links = [ + l for l in links + if urlparse(l).netloc in urlparse(source_url).netloc + ][:20] + if internal_links: + platform_v2['navigation_links'] = internal_links + + return platform_v2 + + +def scrape_single_url(url: str, client: httpx.Client, max_retries: int = 3) -> dict | None: + """Scrape a single URL using Firecrawl API with retry on rate limit.""" + for attempt in range(max_retries): + try: + response = client.post( + f"{FIRECRAWL_BASE_URL}/scrape", + json={ + 'url': url, + 'formats': ['markdown', 'links'], + 
'onlyMainContent': True, + }, + timeout=60.0, + ) + + if response.status_code == 200: + data = response.json() + if data.get('success'): + return data.get('data', {}) + + # Handle rate limiting (429) + if response.status_code == 429: + wait_time = 15 * (attempt + 1) # 15s, 30s, 45s + print(f" Rate limited, waiting {wait_time}s (attempt {attempt + 1}/{max_retries})") + time.sleep(wait_time) + continue + + print(f" Error {response.status_code}: {response.text[:200]}") + return None + + except Exception as e: + print(f" Exception: {e}") + if attempt < max_retries - 1: + time.sleep(5) + continue + return None + + print(f" Max retries exceeded") + return None + + +def update_custodian_file(filepath: Path, platform_v2: dict) -> bool: + """Update a custodian YAML file with digital_platform_v2 data.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + if data is None: + data = {} + + # Add digital_platform_v2 section + data['digital_platform_v2'] = platform_v2 + + # Update crawl4ai_enrichment status + if 'crawl4ai_enrichment' in data: + data['crawl4ai_enrichment']['recrawled_with'] = 'firecrawl' + data['crawl4ai_enrichment']['recrawl_date'] = datetime.now(timezone.utc).isoformat() + + with open(filepath, 'w') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + return True + + except Exception as e: + print(f" Error updating {filepath}: {e}") + return False + + +def load_failed_urls() -> list[tuple[str, str]]: + """Load the list of failed URLs with their file paths.""" + urls = [] + with open(FAILED_URLS_FILE, 'r') as f: + for line in f: + line = line.strip() + if '\t' in line: + filename, url = line.split('\t', 1) + urls.append((filename, url)) + return urls + + +def get_org_name(filepath: Path) -> str: + """Extract organization name from custodian file.""" + try: + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + + # Try different name fields + if data: + if 'original_entry' in data and 
data['original_entry'].get('organisatie'): + return data['original_entry']['organisatie'] + if 'custodian_name' in data: + return data['custodian_name'].get('emic_name', '') or data['custodian_name'].get('preferred_name', '') + if 'name' in data: + return data['name'] + + # Fallback: extract from filename + stem = filepath.stem + parts = stem.split('-') + return parts[-1] if parts else stem + + except Exception: + return filepath.stem + + +def main(): + parser = argparse.ArgumentParser(description='Batch re-crawl failed URLs with Firecrawl') + parser.add_argument('--batch-size', type=int, default=50, help='Number of URLs per batch') + parser.add_argument('--start', type=int, default=0, help='Starting index') + parser.add_argument('--limit', type=int, default=0, help='Maximum URLs to process (0=all)') + parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes') + parser.add_argument('--delay', type=float, default=6.0, help='Delay between requests in seconds (default 6 for rate limits)') + args = parser.parse_args() + + if not FIRECRAWL_API_KEY: + print("Error: FIRECRAWL_API_KEY environment variable not set") + sys.exit(1) + + # Load URLs + all_urls = load_failed_urls() + print(f"Loaded {len(all_urls)} failed URLs") + + # Slice based on start and limit + if args.limit > 0: + urls_to_process = all_urls[args.start:args.start + args.limit] + else: + urls_to_process = all_urls[args.start:] + + print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})") + + if args.dry_run: + print("\n[DRY RUN MODE - No changes will be made]") + for filename, url in urls_to_process[:10]: + print(f" Would scrape: {filename} -> {url}") + print(f" ... 
and {len(urls_to_process) - 10} more") + return + + # Create HTTP client + client = httpx.Client( + headers={ + 'Authorization': f'Bearer {FIRECRAWL_API_KEY}', + 'Content-Type': 'application/json', + } + ) + + success_count = 0 + fail_count = 0 + + try: + for i, (filename, url) in enumerate(urls_to_process): + filepath = CUSTODIAN_DIR / filename + + print(f"\n[{i+1}/{len(urls_to_process)}] {filename}") + print(f" URL: {url}") + + if not filepath.exists(): + print(f" SKIP: File not found") + continue + + # Check if already has digital_platform_v2 + with open(filepath, 'r') as f: + content = f.read() + if 'digital_platform_v2:' in content: + print(f" SKIP: Already has digital_platform_v2") + continue + + # Get org name for platform naming + org_name = get_org_name(filepath) + + # Scrape URL + result = scrape_single_url(url, client) + + if result: + # Transform to platform_v2 + platform_v2 = transform_to_platform_v2(result, url, org_name) + + # Update file + if update_custodian_file(filepath, platform_v2): + success_count += 1 + print(f" SUCCESS: {platform_v2['primary_platform']['platform_name']}") + else: + fail_count += 1 + else: + fail_count += 1 + print(f" FAILED: Could not scrape URL") + + # Rate limiting + time.sleep(args.delay) + + # Progress update every 50 URLs + if (i + 1) % 50 == 0: + print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, fail={fail_count}) ===\n") + + finally: + client.close() + + print(f"\n=== Final Results ===") + print(f"Success: {success_count}") + print(f"Failed: {fail_count}") + print(f"Total: {len(urls_to_process)}") + + +if __name__ == '__main__': + main() diff --git a/scripts/batch_httpx_scrape.py b/scripts/batch_httpx_scrape.py new file mode 100644 index 0000000000..1817c7784c --- /dev/null +++ b/scripts/batch_httpx_scrape.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 +""" +Batch web scraper using httpx + BeautifulSoup for digital_platform_v2 enrichment. + +This script: +1. 
Reads the list of failed crawl URLs +2. Uses httpx to fetch HTML content directly (no browser, no external API) +3. Uses BeautifulSoup to parse and extract metadata +4. Transforms results to digital_platform_v2 format +5. Updates the custodian YAML files + +Usage: + python scripts/batch_httpx_scrape.py --limit 10 + python scripts/batch_httpx_scrape.py --start 100 --limit 50 + python scripts/batch_httpx_scrape.py --dry-run + +No API keys or external services required! +""" + +from __future__ import annotations + +import argparse +import re +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from urllib.parse import urljoin, urlparse + +import httpx +import yaml +from bs4 import BeautifulSoup + +# Configuration +CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") +FAILED_URLS_FILE = Path("/Users/kempersc/apps/glam/data/failed_crawl_urls.txt") + +# User agent to mimic a real browser +USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +) + +# Platform type detection patterns +PLATFORM_PATTERNS: dict[str, list[str]] = { + 'DISCOVERY_PORTAL': [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/archief', r'/archive', + r'/beeldbank', r'/images', r'/foto', r'/photo', + ], + 'DIGITAL_ARCHIVE': [ + r'archieven\.nl', r'archief', r'archive', + r'/inventaris', r'/inventory', r'/toegang', + ], + 'EDUCATION': [ + r'/educatie', r'/education', r'/onderwijs', r'/leren', + r'/scholen', r'/schools', r'/lesmateriaal', + ], + 'INSTITUTIONAL_WEBSITE': [ + r'/over-ons', r'/about', r'/contact', r'/bezoek', + r'/visit', r'/openingstijden', r'/hours', + ], +} + + +def detect_platform_type(url: str, links: list[str] | None = None) -> str: + """Detect the platform type based on URL patterns and extracted links.""" + url_lower = url.lower() + all_urls = [url_lower] + [link.lower() for link 
in (links or [])] + + for platform_type, patterns in PLATFORM_PATTERNS.items(): + for pattern in patterns: + for check_url in all_urls: + if re.search(pattern, check_url): + return platform_type + + return 'INSTITUTIONAL_WEBSITE' + + +def extract_collection_urls(links: list[str], base_url: str) -> list[str]: + """Extract URLs that appear to be collection/catalog pages.""" + collection_patterns = [ + r'/collectie', r'/collection', r'/catalogus', r'/catalog', + r'/zoeken', r'/search', r'/beeldbank', r'/inventaris', + r'/archief(?!en\.)', r'/archiefstukken', r'/toegangen', + ] + + collection_urls: list[str] = [] + base_domain = urlparse(base_url).netloc + + for link in links: + try: + parsed = urlparse(link) + if base_domain in parsed.netloc or parsed.netloc in base_domain: + for pattern in collection_patterns: + if re.search(pattern, link.lower()): + if link not in collection_urls: + collection_urls.append(link) + break + except Exception: + continue + + return collection_urls[:10] + + +def extract_auxiliary_platforms(links: list[str], base_url: str) -> list[dict[str, str]]: + """Extract external platform links (aggregators, portals, etc.).""" + external_patterns: dict[str, dict[str, str]] = { + 'archieven.nl': {'name': 'Archieven.nl', 'type': 'AGGREGATOR'}, + 'europeana.eu': {'name': 'Europeana', 'type': 'AGGREGATOR'}, + 'collectienederland.nl': {'name': 'Collectie Nederland', 'type': 'AGGREGATOR'}, + 'erfgoedthesaurus.nl': {'name': 'Erfgoedthesaurus', 'type': 'THESAURUS'}, + 'delpher.nl': {'name': 'Delpher', 'type': 'DIGITAL_ARCHIVE'}, + 'geheugen.nl': {'name': 'Geheugen van Nederland', 'type': 'AGGREGATOR'}, + 'archiefweb.eu': {'name': 'Archiefweb', 'type': 'DIGITAL_ARCHIVE'}, + } + + base_domain = urlparse(base_url).netloc + auxiliary: list[dict[str, str]] = [] + seen_domains: set[str] = set() + + for link in links: + try: + parsed = urlparse(link) + domain = parsed.netloc.replace('www.', '') + + if base_domain in domain or domain in base_domain: + continue + + 
for pattern, info in external_patterns.items(): + if pattern in domain and domain not in seen_domains: + seen_domains.add(domain) + auxiliary.append({ + 'platform_name': info['name'], + 'platform_url': link, + 'platform_type': info['type'], + 'integration_type': 'external_aggregator', + }) + break + except Exception: + continue + + return auxiliary[:5] + + +def is_generic_title(title: str | None) -> bool: + """Check if a title is too generic to use as platform name.""" + generic_patterns = [ + 'home', 'homepage', 'welkom', 'welcome', 'startpagina', + 'index', 'main', 'website', 'webpagina', 'web page', + ] + if not title: + return True + title_lower = title.lower().strip() + for pattern in generic_patterns: + if title_lower == pattern or title_lower == f"{pattern} -" or title_lower.startswith(f"{pattern} |"): + return True + return len(title) < 3 + + +def scrape_with_httpx(url: str, client: httpx.Client, timeout: float = 30.0) -> dict[str, Any] | None: + """Scrape a URL using httpx and return parsed metadata.""" + try: + response = client.get(url, timeout=timeout, follow_redirects=True) + + if response.status_code != 200: + return {'error': f'HTTP {response.status_code}', 'status_code': response.status_code} + + # Parse HTML + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract metadata + metadata: dict[str, Any] = { + 'status_code': response.status_code, + 'final_url': str(response.url), + } + + # Title + title_tag = soup.find('title') + metadata['title'] = title_tag.get_text(strip=True) if title_tag else None + + # Meta tags + for meta in soup.find_all('meta'): + name = str(meta.get('name', '')).lower() + prop = str(meta.get('property', '')).lower() + content = str(meta.get('content', '')) + + if name == 'description' or prop == 'og:description': + if 'description' not in metadata or prop == 'og:description': + metadata['description'] = content + elif prop == 'og:title': + metadata['og_title'] = content + elif prop == 'og:image': + 
metadata['og_image'] = urljoin(url, content) if content else None + elif prop == 'og:site_name': + metadata['og_site_name'] = content + elif name == 'language' or str(meta.get('http-equiv', '')).lower() == 'content-language': + metadata['language'] = content.split(',')[0].split('-')[0] + + # Detect language from html tag + html_tag = soup.find('html') + if html_tag: + lang_attr = html_tag.get('lang') + if lang_attr: + lang_str = str(lang_attr) if not isinstance(lang_attr, list) else str(lang_attr[0]) + metadata['language'] = lang_str.split('-')[0] + + # Favicon + for link in soup.find_all('link'): + rel = link.get('rel') + if rel is None: + rel = [] + if isinstance(rel, list): + rel_str = ' '.join(str(r) for r in rel) + else: + rel_str = str(rel) + if 'icon' in rel_str.lower(): + href = link.get('href') + if href: + metadata['favicon'] = urljoin(url, str(href)) + break + + # Extract links + links: list[str] = [] + for a in soup.find_all('a', href=True): + href = str(a['href']) + if href.startswith('http') or href.startswith('/'): + full_url = urljoin(url, href) + if full_url not in links: + links.append(full_url) + + metadata['links'] = links[:100] # Limit to 100 links + + return metadata + + except httpx.TimeoutException: + return {'error': 'Timeout', 'status_code': None} + except httpx.ConnectError as e: + return {'error': f'Connection error: {e}', 'status_code': None} + except httpx.HTTPError as e: + return {'error': f'HTTP error: {e}', 'status_code': None} + except Exception as e: + return {'error': f'Exception: {e}', 'status_code': None} + + +def transform_to_platform_v2(scrape_result: dict[str, Any], source_url: str, org_name: str) -> dict[str, Any]: + """Transform scrape result to digital_platform_v2 format.""" + links: list[str] = scrape_result.get('links', []) + + # Extract title, preferring og:title, then site_name, then page title + raw_title = scrape_result.get('title', '') or '' + candidate_titles: list[str | None] = [ + scrape_result.get('og_title'), 
+ scrape_result.get('og_site_name'), + raw_title.split(' - ')[0].strip() if raw_title else None, + raw_title.split(' | ')[0].strip() if raw_title else None, + ] + + title = org_name # Default fallback + for candidate in candidate_titles: + if candidate and not is_generic_title(candidate): + title = candidate + break + + # Generate platform ID + domain = urlparse(source_url).netloc.replace('www.', '').replace('.', '_') + platform_id = f"primary_website_{domain}" + + # Detect platform type + platform_type = detect_platform_type(source_url, links) + + # Extract collection URLs + collection_urls = extract_collection_urls(links, source_url) + + # Extract auxiliary platforms + auxiliary_platforms = extract_auxiliary_platforms(links, source_url) + + # Build digital_platform_v2 structure + platform_v2: dict[str, Any] = { + 'transformation_metadata': { + 'transformed_from': 'httpx_beautifulsoup', + 'transformation_date': datetime.now(timezone.utc).isoformat(), + 'transformation_version': '2.1', + 'source_status_code': scrape_result.get('status_code', 200), + }, + 'primary_platform': { + 'platform_id': platform_id, + 'platform_name': f"{title} Website" if 'website' not in title.lower() else title, + 'platform_url': scrape_result.get('final_url', source_url), + 'platform_type': platform_type, + 'description': scrape_result.get('description', ''), + 'language': scrape_result.get('language', 'nl'), + 'og_image': scrape_result.get('og_image'), + 'favicon': scrape_result.get('favicon'), + }, + } + + # Add collection URLs if found + if collection_urls: + platform_v2['primary_platform']['collection_urls'] = collection_urls + + # Add auxiliary platforms if found + if auxiliary_platforms: + platform_v2['auxiliary_platforms'] = auxiliary_platforms + + # Add internal navigation links (sample) + base_domain = urlparse(source_url).netloc + internal_links = [link for link in links if base_domain in urlparse(link).netloc][:20] + if internal_links: + platform_v2['navigation_links'] = 
def get_org_name(filepath: Path) -> str:
    """Return the organisation name recorded in a custodian YAML file.

    Lookup order: ``original_entry.organisatie``, then
    ``custodian_name.emic_name`` / ``custodian_name.preferred_name``, then the
    top-level ``name``. Empty or missing values are skipped, so the function
    never returns an empty string taken from the file. When no name is found
    the last hyphen-separated part of the file stem is used.

    Args:
        filepath: Path to the custodian YAML file.

    Returns:
        The organisation name. If the file cannot be read or parsed, the full
        file stem is returned.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError):
        # ValueError covers UnicodeDecodeError for files that are not UTF-8.
        return filepath.stem

    if isinstance(data, dict):
        original_entry = data.get('original_entry')
        if isinstance(original_entry, dict) and original_entry.get('organisatie'):
            return str(original_entry['organisatie'])

        custodian_name = data.get('custodian_name')
        if isinstance(custodian_name, dict):
            name = custodian_name.get('emic_name') or custodian_name.get('preferred_name')
            if name:
                return str(name)

        if data.get('name'):
            return str(data['name'])

    # Fallback: last hyphen-separated part of the filename.
    return filepath.stem.split('-')[-1]
parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default 1)') + parser.add_argument('--timeout', type=float, default=30.0, help='Request timeout in seconds (default 30)') + parser.add_argument('--skip-existing', action='store_true', default=True, help='Skip files that already have digital_platform_v2') + args = parser.parse_args() + + # Check for BeautifulSoup + try: + from bs4 import BeautifulSoup as _ # noqa: F401 + except ImportError: + print("Error: BeautifulSoup not installed. Run: pip install beautifulsoup4") + sys.exit(1) + + # Load URLs + all_urls = load_failed_urls() + print(f"Loaded {len(all_urls)} failed URLs from {FAILED_URLS_FILE}") + + # Slice based on start and limit + if args.limit > 0: + urls_to_process = all_urls[args.start:args.start + args.limit] + else: + urls_to_process = all_urls[args.start:] + + print(f"Processing {len(urls_to_process)} URLs (start={args.start}, limit={args.limit or 'all'})") + + if args.dry_run: + print("\n[DRY RUN MODE - No changes will be made]") + for filename, url in urls_to_process[:10]: + print(f" Would scrape: {filename} -> {url}") + if len(urls_to_process) > 10: + print(f" ... 
and {len(urls_to_process) - 10} more") + return + + # Create HTTP client with headers + client = httpx.Client( + headers={ + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'nl,en-US;q=0.9,en;q=0.8', + }, + follow_redirects=True, + timeout=args.timeout, + ) + + success_count = 0 + skip_count = 0 + fail_count = 0 + + try: + for i, (filename, url) in enumerate(urls_to_process): + filepath = CUSTODIAN_DIR / filename + + print(f"\n[{i+1}/{len(urls_to_process)}] {filename}") + print(f" URL: {url}") + + if not filepath.exists(): + print(f" SKIP: File not found") + skip_count += 1 + continue + + # Check if already has digital_platform_v2 + if args.skip_existing: + with open(filepath, 'r') as f: + content = f.read() + if 'digital_platform_v2:' in content: + print(f" SKIP: Already has digital_platform_v2") + skip_count += 1 + continue + + # Get org name for platform naming + org_name = get_org_name(filepath) + + # Scrape URL + result = scrape_with_httpx(url, client, timeout=args.timeout) + + if result and 'error' not in result: + # Transform to platform_v2 + platform_v2 = transform_to_platform_v2(result, url, org_name) + + # Update file + if update_custodian_file(filepath, platform_v2): + success_count += 1 + platform_name = platform_v2['primary_platform']['platform_name'] + print(f" SUCCESS: {platform_name}") + else: + fail_count += 1 + else: + fail_count += 1 + error_msg = result.get('error', 'Unknown error') if result else 'No result' + print(f" FAILED: {error_msg}") + + # Rate limiting + if args.delay > 0: + time.sleep(args.delay) + + # Progress update every 50 URLs + if (i + 1) % 50 == 0: + print(f"\n=== Progress: {i+1}/{len(urls_to_process)} (success={success_count}, skip={skip_count}, fail={fail_count}) ===\n") + + finally: + client.close() + + print(f"\n=== Final Results ===") + print(f"Success: {success_count}") + print(f"Skipped: {skip_count}") + print(f"Failed: {fail_count}") + 
def normalize_name(name: str) -> str:
    """Normalize a name or slug for comparison.

    Decodes URL escapes, strips diacritics, lower-cases, drops a trailing run
    of digits/hyphens/underscores, turns remaining hyphens and underscores
    into spaces, and collapses whitespace.

    Args:
        name: Raw name or slug; may be empty.

    Returns:
        The normalized name, or "" for empty input.
    """
    if not name:
        return ""
    decoded = unicodedata.normalize('NFD', unquote(name))
    # After NFD decomposition the accents are separate combining marks ("Mn").
    without_marks = ''.join(c for c in decoded if unicodedata.category(c) != 'Mn')
    lowered = without_marks.lower()
    trimmed = re.sub(r'[-_\d]+$', '', lowered)
    spaced = re.sub(r'[-_]+', ' ', trimmed)
    return ' '.join(spaced.split())


def extract_name_from_slug(slug: str) -> str:
    """Extract a human-readable name from a LinkedIn slug or entity filename.

    Removes the ``.json`` extension (with or without a preceding
    ``_YYYYMMDDTHHMMSSZ`` timestamp), then a trailing hex/numeric ID, and
    normalizes what is left with :func:`normalize_name`.

    Args:
        slug: Slug or filename such as ``jan-de-vries-1a2b3c4d_20251214T115050Z.json``.

    Returns:
        The normalized name part of the slug.
    """
    slug = unquote(slug)
    # The timestamp is optional so that plain "name.json" also loses its extension.
    slug = re.sub(r'(?:_\d{8}T\d{6}Z)?\.json$', '', slug)
    # Remove trailing numbers/IDs.
    slug = re.sub(r'[-_][\da-f]{6,}$', '', slug)
    slug = re.sub(r'[-_]\d+$', '', slug)
    return normalize_name(slug)


def names_match(slug_name: str, profile_name: str) -> bool:
    """Check whether a slug-derived name and a profile name are reasonably similar.

    The names match when, after normalization, they are identical, or at least
    half of the slug's words (for slugs of two or more distinct words) occur in
    the profile name, or both start with the same first word.

    Args:
        slug_name: Name derived from the URL slug or filename.
        profile_name: Name stored in the profile data.

    Returns:
        True on a match; False otherwise or when either input is empty.
    """
    if not slug_name or not profile_name:
        return False

    slug_normalized = normalize_name(slug_name)
    profile_normalized = normalize_name(profile_name)

    if slug_normalized == profile_normalized:
        return True

    slug_parts = slug_normalized.split()
    profile_parts = profile_normalized.split()
    slug_words = set(slug_parts)

    # Word overlap is only meaningful when the slug has several distinct words.
    if len(slug_words) >= 2:
        overlap = slug_words & set(profile_parts)
        if 2 * len(overlap) >= len(slug_words):
            return True

    return bool(slug_parts and profile_parts and slug_parts[0] == profile_parts[0])
def main():
    """Analyze the entity directory and report name mismatches.

    Prints a summary, the names that occur more than five times, and the most
    frequent mismatched names with example files. Then writes
    ``name_mismatch_report.csv`` and ``name_mismatch_report.json`` to the
    parent of the entity directory.
    """
    # Local import: only the CSV export below needs it.
    import csv

    entity_dir = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

    print("=" * 80)
    print("LINKEDIN ENTITY NAME MISMATCH ANALYSIS")
    print("=" * 80)
    print()

    results = analyze_entity_files(entity_dir)
    mismatches = results['mismatches']

    print(f"Total entity files analyzed: {results['total_files']}")
    print(f"Fallback (basic) files: {results['fallback_files']}")
    print(f"Total mismatches detected: {len(mismatches)}")
    print()

    # Group mismatches by profile name once; both report sections use it.
    mismatch_by_name = defaultdict(list)
    for m in mismatches:
        mismatch_by_name[m['profile_name']].append(m)

    # Names that appear suspiciously often (potential filler names).
    print("=" * 80)
    print("NAMES APPEARING MORE THAN 5 TIMES (Potential Filler Names)")
    print("=" * 80)
    frequent_names = [
        (name, count)
        for name, count in results['name_counter'].most_common(50)
        if count > 5
    ]

    for name, count in frequent_names:
        # .get() avoids creating empty entries in the defaultdict.
        mismatch_count = len(mismatch_by_name.get(name, ()))
        print(f" '{name}': {count} occurrences ({mismatch_count} are mismatches)")

    print()
    print("=" * 80)
    print("ALL MISMATCHED FILES (slug name != profile name)")
    print("=" * 80)

    # Most frequently mismatched names first.
    sorted_names = sorted(mismatch_by_name.items(), key=lambda x: -len(x[1]))

    for profile_name, items in sorted_names[:30]:  # Top 30 most frequent mismatch names
        print(f"\n--- '{profile_name}' assigned to {len(items)} different slugs ---")
        for item in items[:10]:  # Show first 10 examples
            print(f" Slug: {item['slug_name']}")
            print(f" File: {item['filename']}")
            print(f" Method: {item['extraction_method']}")
            print()

    # Detailed CSV for further analysis. csv.writer escapes embedded quotes,
    # which hand-built f-string rows would leave malformed.
    csv_fields = ('filename', 'slug_name', 'profile_name', 'extraction_method', 'linkedin_url')
    csv_path = entity_dir.parent / "name_mismatch_report.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        f.write("filename,slug_name,profile_name,extraction_method,linkedin_url\n")
        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
        writer.writerows([m[field] for field in csv_fields] for m in mismatches)

    print(f"\nDetailed report saved to: {csv_path}")

    # JSON for programmatic use.
    json_path = entity_dir.parent / "name_mismatch_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_files': results['total_files'],
            'fallback_files': results['fallback_files'],
            'total_mismatches': len(mismatches),
            'mismatches_by_name': {name: len(items) for name, items in mismatch_by_name.items()},
            'frequent_names': [(name, count) for name, count in results['name_counter'].most_common(100)],
            'mismatches': mismatches
        }, f, indent=2, ensure_ascii=False)

    print(f"JSON report saved to: {json_path}")
Detects APIs, catalogs, and metadata standards + +Usage: + python scripts/enrich_dutch_custodians_crawl4ai.py [options] + +Options: + --dry-run Show what would be enriched without modifying files + --limit N Process only first N files (for testing) + --start-index N Start from index N (for resuming) + --resume Resume from last checkpoint + --force Re-enrich even if already has crawl4ai_enrichment + --file PATH Process a single specific file +""" + +import argparse +import asyncio +import json +import logging +import os +import re +import sys +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional +from urllib.parse import urlparse, urlunparse + +import yaml +from dotenv import load_dotenv +from lxml import etree + +# Crawl4AI imports +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Load environment variables from .env file +load_dotenv() + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Configuration +CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian" +CHECKPOINT_FILE = CUSTODIAN_DIR / ".crawl4ai_enrichment_checkpoint.json" + +# Rate limiting - be nice to websites even though we're local +REQUEST_DELAY = 2.0 # seconds between requests + +# Digital platform detection patterns +API_ENDPOINT_PATTERNS = [ + r'/oai[-_]?pmh', + r'/api/', + r'/rest/', + r'/sparql', + r'/graphql', + r'/iiif/', + r'/sru', + r'/z39\.50', + r'/opensearch', +] + +CATALOG_PATTERNS = [ + r'/catalogu[es]?(?:/|\?|$)', + r'/collecti[eo]n?[s]?(?:/|\?|$)', + r'/archie[fv](?:/|\?|$)', + r'/beeldbank(?:/|\?|$)', + r'/zoeken(?:/|\?|$)', + r'/search(?:/|\?|$)', + r'/discover(?:/|\?|$)', + r'/browse(?:/|\?|$)', +] + +# Dutch-specific catalog type detection +CATALOG_TYPE_PATTERNS = { + 'beeldbank': { + 'patterns': [r'/beeldbank', r'/beeld', r'/images', r'/foto'], + 
'label': 'Image Collection', + 'description_nl': 'Beeldbank met gedigitaliseerde foto\'s, kaarten en afbeeldingen', + }, + 'genealogie': { + 'patterns': [r'/genealogie', r'/stamboom', r'/persons', r'/akten'], + 'label': 'Genealogy Records', + 'description_nl': 'Genealogische bronnen en persoonsgegevens', + }, + 'archieven': { + 'patterns': [r'/archie[fv]', r'/inventaris', r'/toegangen', r'/finding'], + 'label': 'Archive Finding Aids', + 'description_nl': 'Archiefinventarissen en toegangen', + }, + 'collectie': { + 'patterns': [r'/collectie', r'/collection', r'/object'], + 'label': 'Collection Portal', + 'description_nl': 'Collectieportaal met objecten en kunstwerken', + }, + 'kranten': { + 'patterns': [r'/kranten', r'/newspaper', r'/periodiek'], + 'label': 'Newspaper Archive', + 'description_nl': 'Gedigitaliseerde kranten en periodieken', + }, + 'kaarten': { + 'patterns': [r'/kaart', r'/map', r'/cartogra'], + 'label': 'Map Collection', + 'description_nl': 'Historische kaarten en cartografisch materiaal', + }, + 'bibliotheek': { + 'patterns': [r'/catalogu', r'/biblio', r'/library', r'/boek'], + 'label': 'Library Catalog', + 'description_nl': 'Bibliotheekcatalogus', + }, + 'zoeken': { + 'patterns': [r'/zoeken', r'/search', r'/discover', r'/browse'], + 'label': 'Search Interface', + 'description_nl': 'Algemene zoekinterface', + }, +} + +CMS_INDICATORS = { + 'atlantis': ['atlantis', 'picturae'], + 'mais_flexis': ['mais-flexis', 'mais flexis', 'de ree'], + 'adlib': ['adlib', 'axiell'], + 'collective_access': ['collectiveaccess', 'collective access'], + 'archivematica': ['archivematica'], + 'archivesspace': ['archivesspace'], + 'atom': ['accesstomemory', 'atom'], + 'omeka': ['omeka'], + 'contentdm': ['contentdm'], + 'dspace': ['dspace'], + 'islandora': ['islandora'], + 'memorix': ['memorix'], +} + +# Metadata standards detection patterns with regex word boundaries +METADATA_STANDARDS_PATTERNS = [ + (r'\bdublin\s+core\b', 'Dublin Core', True), + (r'\bdc:', 'Dublin Core', 
def get_xpath(element, tree) -> str:
    """Build an absolute XPath for ``element`` by walking up through its parents.

    Each step is the node's tag. A 1-based ``[n]`` position is added only when
    the parent does not hold exactly one child with that tag.

    Args:
        element: Node exposing ``tag`` and ``getparent()``; its parents must be
            iterable over their children.
        tree: Unused by the computation; kept for the existing call signature.

    Returns:
        A path such as ``/html/body/div[2]/a``.
    """
    steps = []
    node = element
    while node is not None:
        parent = node.getparent()
        step = node.tag
        if parent is not None:
            same_tag = [child for child in parent if child.tag == node.tag]
            if len(same_tag) != 1:
                step = f'{node.tag}[{same_tag.index(node) + 1}]'
        steps.append(step)
        node = parent
    # Steps were collected leaf-to-root; flip them for the final path.
    steps.reverse()
    return '/' + '/'.join(steps)
+ if parsed.query: + params = dict(p.split('=', 1) if '=' in p else (p, '') + for p in parsed.query.split('&')) + filtered = {k: v for k, v in params.items() + if not any(k.startswith(n) for n in noise_params + ['utm_', 'fbclid', 'gclid'])} + new_query = '&'.join(f'{k}={v}' if v else k for k, v in sorted(filtered.items())) + return urlunparse(parsed._replace(query=new_query)) + + return url + + +def detect_catalog_type(url: str) -> dict | None: + """Detect catalog type from URL pattern.""" + url_lower = url.lower() + for type_key, type_info in CATALOG_TYPE_PATTERNS.items(): + for pattern in type_info['patterns']: + if re.search(pattern, url_lower): + return { + 'type': type_key, + 'label': type_info['label'], + 'description_nl': type_info['description_nl'], + } + return None + + +def detect_metadata_standards(content: str) -> list[str]: + """Detect metadata standards mentioned in content using regex word boundaries.""" + if not content: + return [] + + content_lower = content.lower() + standards_found = set() + + for pattern, standard_name, use_regex in METADATA_STANDARDS_PATTERNS: + if use_regex: + if re.search(pattern, content_lower, re.IGNORECASE): + standards_found.add(standard_name) + else: + if pattern.lower() in content_lower: + standards_found.add(standard_name) + + return sorted(list(standards_found)) + + +def detect_cms(content: str) -> str | None: + """Detect CMS/collection management system from content.""" + if not content: + return None + + content_lower = content.lower() + for cms_name, indicators in CMS_INDICATORS.items(): + for indicator in indicators: + if indicator in content_lower: + return cms_name + return None + + +def extract_website_url(entry: dict) -> str | None: + """Extract website URL from custodian entry.""" + # Check various possible locations for website + if 'website' in entry: + return entry['website'] + + # Check in enrichment data + for enrichment_key in ['zcbs_enrichment', 'google_maps_enrichment', 'wikidata_enrichment']: + if 
enrichment_key in entry: + enrichment = entry[enrichment_key] + if isinstance(enrichment, dict): + if 'website' in enrichment: + return enrichment['website'] + if 'url' in enrichment: + return enrichment['url'] + + # Check identifiers + if 'identifiers' in entry: + for identifier in entry.get('identifiers', []): + if isinstance(identifier, dict): + if identifier.get('identifier_scheme') == 'Website': + return identifier.get('identifier_value') + + return None + + +async def crawl_website(crawler: AsyncWebCrawler, url: str) -> dict: + """ + Crawl a website and extract structured data with XPath provenance. + + Returns a dict with: + - success: bool + - title: str + - description: str + - html: str (raw HTML for further processing) + - markdown: str + - links: list of dicts with href, text, xpath + - metadata: dict of og/meta tags + - error: str (if failed) + """ + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + verbose=False, + # Wait for page to fully load + wait_until="networkidle", + page_timeout=30000, + ) + + try: + result = await crawler.arun(url=url, config=config) + + if not result.success: + return { + 'success': False, + 'error': f'Crawl failed with status {result.status_code}', + 'status_code': result.status_code, + } + + # Parse HTML with lxml to extract XPaths + links_with_xpath = [] + if result.html: + try: + tree = etree.HTML(result.html) + link_elements = tree.xpath('//a[@href]') + + for link_el in link_elements: + href = link_el.get('href', '') + text = ''.join(link_el.itertext()).strip() + xpath = get_xpath(link_el, tree) + + # Skip empty links and javascript + if href and not href.startswith(('javascript:', '#', 'mailto:', 'tel:')): + links_with_xpath.append({ + 'href': href, + 'text': text[:200] if text else '', # Truncate long text + 'xpath': xpath, + }) + except Exception as e: + logger.warning(f"Error parsing HTML for XPath extraction: {e}") + + # Also include crawl4ai's extracted links for completeness + internal_links = 
def analyze_crawl_results(crawl_data: dict, base_url: str) -> dict:
    """Analyze crawl results to extract APIs, catalogs, and metadata standards.

    Args:
        crawl_data: Result dict; the keys read here are ``success``, ``error``,
            ``status_code``, ``title``, ``description``, ``markdown``,
            ``links_with_xpath``, ``internal_links``, ``external_links`` and
            ``metadata``.
        base_url: URL that was crawled; relative hrefs are resolved against it.

    Returns:
        Enrichment dict ready to add to YAML. For a failed crawl it holds only
        the retrieval fields plus ``error``. Detected URLs are listed in sorted
        order so repeated runs on the same page give the same output.
    """
    # Local import: this block needs urljoin in addition to the urlparse and
    # urlunparse names already used by the surrounding code.
    from urllib.parse import urljoin

    enrichment = {
        'retrieval_timestamp': datetime.now(timezone.utc).isoformat(),
        'retrieval_agent': 'crawl4ai',
        'source_url': base_url,
        'status_code': crawl_data.get('status_code'),
    }

    if not crawl_data.get('success'):
        enrichment['error'] = crawl_data.get('error', 'Unknown error')
        return enrichment

    links_with_xpath = crawl_data.get('links_with_xpath') or []

    # Basic metadata
    enrichment['title'] = crawl_data.get('title', '')
    enrichment['description'] = crawl_data.get('description', '')
    enrichment['links_count'] = len(links_with_xpath)

    def _resolve(href: str) -> str:
        """Resolve an href against base_url, covering '/x', 'x' and '//host/x'."""
        try:
            return urljoin(base_url, href)
        except ValueError:
            # Malformed href (e.g. broken bracketed host): keep it untouched.
            return href

    # Collect all URLs for analysis, remembering the first source link for each
    # resolved URL (trailing slash ignored) for XPath provenance.
    all_urls: set[str] = set()
    link_by_url: dict[str, dict] = {}
    for link in links_with_xpath:
        href = link.get('href', '')
        if not href:
            continue
        resolved = _resolve(href)
        all_urls.add(resolved)
        link_by_url.setdefault(resolved.rstrip('/'), link)

    # Add the crawler's own internal/external link lists.
    for links_key in ('internal_links', 'external_links'):
        all_urls.update(link for link in (crawl_data.get(links_key) or []) if link)

    # A set of strings has no stable iteration order between runs; sort once.
    ordered_urls = sorted(all_urls)

    # Detect API endpoints (first matching pattern wins).
    detected_apis = []
    for url in ordered_urls:
        url_lower = url.lower()
        matched = next((p for p in API_ENDPOINT_PATTERNS if re.search(p, url_lower)), None)
        if matched is not None:
            detected_apis.append({
                'url': normalize_url(url),
                'pattern_matched': matched,
            })

    if detected_apis:
        enrichment['detected_api_endpoints'] = detected_apis

    # Detect catalog URLs with type classification.
    detected_catalogs = []
    for url in ordered_urls:
        url_lower = url.lower()
        if not any(re.search(p, url_lower) for p in CATALOG_PATTERNS):
            continue

        catalog_entry = {
            'url': normalize_url(url),
        }
        catalog_type = detect_catalog_type(url)
        if catalog_type:
            catalog_entry['type'] = catalog_type['type']
            catalog_entry['label'] = catalog_type['label']

        # Attach the XPath of the link that resolves to exactly this URL. An
        # "ends with href" test would credit href="/" for any URL ending in "/".
        source_link = link_by_url.get(url.rstrip('/'))
        if source_link is not None:
            catalog_entry['xpath'] = source_link.get('xpath')
            catalog_entry['link_text'] = source_link.get('text', '')

        detected_catalogs.append(catalog_entry)

    if detected_catalogs:
        enrichment['detected_catalog_urls'] = detected_catalogs

    # Detect external archive platforms (first matching platform wins).
    external_platforms = []
    for url in ordered_urls:
        url_lower = url.lower()
        platform = next((p for p in DUTCH_ARCHIVE_PLATFORMS if p in url_lower), None)
        if platform is not None:
            external_platforms.append({
                'url': normalize_url(url),
                'platform': platform,
            })

    if external_platforms:
        enrichment['external_archive_platforms'] = external_platforms

    # Detect metadata standards from content.
    # "or ''" handles explicit None values, which dict.get defaults do not.
    markdown = crawl_data.get('markdown') or ''
    title = crawl_data.get('title') or ''
    description = crawl_data.get('description') or ''
    content = f"{markdown} {title} {description}"
    standards = detect_metadata_standards(content)
    if standards:
        enrichment['detected_standards'] = standards

    # Detect CMS
    cms = detect_cms(content)
    if cms:
        enrichment['detected_cms'] = cms

    # Extract OG/meta tags of interest
    metadata = crawl_data.get('metadata') or {}
    og_data = {}
    for key in ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name']:
        if key in metadata:
            og_data[key.replace('og:', '')] = metadata[key]
    if og_data:
        enrichment['open_graph'] = og_data

    return enrichment
"""Process a single custodian YAML file.""" + try: + with open(filepath, 'r', encoding='utf-8') as f: + entry = yaml.safe_load(f) + + if not entry: + logger.warning(f"Empty file: {filepath}") + return False + + # Check if already enriched + if 'crawl4ai_enrichment' in entry and not force: + logger.info(f"Skipping {filepath.name}: already has crawl4ai_enrichment") + return True + + # Extract website URL + website_url = extract_website_url(entry) + if not website_url: + logger.info(f"Skipping {filepath.name}: no website URL found") + return False + + # Ensure URL has protocol + if not website_url.startswith(('http://', 'https://')): + website_url = 'https://' + website_url + + logger.info(f"Processing {filepath.name}: {website_url}") + + if dry_run: + logger.info(f" -> DRY RUN: would crawl {website_url}") + return True + + # Crawl the website + crawl_data = await crawl_website(crawler, website_url) + + # Analyze results + enrichment = analyze_crawl_results(crawl_data, website_url) + + # Add enrichment to entry + entry['crawl4ai_enrichment'] = enrichment + + # Write back to file + with open(filepath, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + # Log summary + apis_count = len(enrichment.get('detected_api_endpoints', [])) + catalogs_count = len(enrichment.get('detected_catalog_urls', [])) + platforms_count = len(enrichment.get('external_archive_platforms', [])) + logger.info(f" -> success: {apis_count} APIs, {catalogs_count} catalogs, {platforms_count} external platforms found") + + return True + + except Exception as e: + logger.error(f"Error processing {filepath}: {e}") + return False + + +async def main(): + parser = argparse.ArgumentParser(description='Enrich Dutch custodians with Crawl4AI') + parser.add_argument('--dry-run', action='store_true', help='Show what would be enriched') + parser.add_argument('--limit', type=int, help='Process only first N files') + 
parser.add_argument('--start-index', type=int, default=0, help='Start from index N') + parser.add_argument('--resume', action='store_true', help='Resume from last checkpoint') + parser.add_argument('--force', action='store_true', help='Re-enrich even if already enriched') + parser.add_argument('--file', type=str, help='Process a single specific file') + args = parser.parse_args() + + # Create logs directory + logs_dir = Path(__file__).parent.parent / "logs" + logs_dir.mkdir(exist_ok=True) + + # Add file handler for logging + log_file = logs_dir / f"crawl4ai_enrichment_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + logger.addHandler(file_handler) + + # Single file mode + if args.file: + filepath = Path(args.file) + if not filepath.exists(): + logger.error(f"File not found: {filepath}") + sys.exit(1) + + async with AsyncWebCrawler() as crawler: + success = await process_single_file(crawler, filepath, args.dry_run, args.force) + sys.exit(0 if success else 1) + + # Batch mode + files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml")) + logger.info(f"Found {len(files)} Dutch custodian files") + + # Handle resume + start_index = args.start_index + if args.resume: + checkpoint = load_checkpoint() + if 'last_processed_index' in checkpoint: + start_index = checkpoint['last_processed_index'] + 1 + logger.info(f"Resuming from index {start_index}") + + # Apply limit + end_index = len(files) + if args.limit: + end_index = min(start_index + args.limit, len(files)) + + logger.info(f"Processing files {start_index} to {end_index - 1}") + + # Process files + success_count = 0 + error_count = 0 + + async with AsyncWebCrawler() as crawler: + for i, filepath in enumerate(files[start_index:end_index], start=start_index): + logger.info(f"[{i + 1}/{len(files)}] Processing {filepath.name}") + + success = await process_single_file(crawler, filepath, 
args.dry_run, args.force) + + if success: + success_count += 1 + else: + error_count += 1 + + # Save checkpoint + if not args.dry_run: + save_checkpoint({ + 'last_processed_index': i, + 'last_processed_file': str(filepath), + 'last_processed_time': datetime.now(timezone.utc).isoformat(), + 'success_count': success_count, + 'error_count': error_count, + }) + + # Rate limiting + if i < end_index - 1: + await asyncio.sleep(REQUEST_DELAY) + + # Summary + logger.info(f"\n{'='*50}") + logger.info(f"Enrichment complete!") + logger.info(f" Success: {success_count}") + logger.info(f" Errors: {error_count}") + logger.info(f" Log file: {log_file}") + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/scripts/fix_collision_victims.py b/scripts/fix_collision_victims.py new file mode 100644 index 0000000000..c05bdff82c --- /dev/null +++ b/scripts/fix_collision_victims.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Fix GHCID collision victim files. + +These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml) +indicating they were collision victims whose internal GHCID was incorrectly set +to their collision partner's GHCID instead of getting their own unique GHCID. + +This script: +1. Reads the institution's real name from original_entry.organisatie +2. Generates a proper name suffix from that name +3. Creates a new unique GHCID with the proper suffix +4. Regenerates all GHCID-derived identifiers (UUID, numeric) +5. Updates the file with correct identifiers +6. 
def normalize_diacritics(text: str) -> str:
    """Return *text* with combining diacritical marks removed.

    The text is decomposed to Unicode NFD form and every character in the
    nonspacing-mark category (``Mn``) is dropped. Characters that have no
    canonical decomposition are left untouched.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', text):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent produced by the decomposition: discard it.
            continue
        kept.append(ch)
    return ''.join(kept)
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Generate a UUID v8 (SHA-256 based) from a GHCID string.

    The first 16 bytes of the SHA-256 digest of the UTF-8 encoded GHCID are
    used as the UUID payload, with the version nibble set to 8 and the
    variant bits set to the RFC 4122 layout. The previous implementation
    returned a ``uuid5`` (SHA-1, version 5) value despite this contract.

    Args:
        ghcid: The GHCID string to derive the identifier from.

    Returns:
        The canonical string form of the deterministic version-8 UUID.
    """
    payload = bytearray(hashlib.sha256(ghcid.encode('utf-8')).digest()[:16])
    # High nibble of byte 6 carries the UUID version (8 = custom).
    payload[6] = (payload[6] & 0x0F) | 0x80
    # Top two bits of byte 8 carry the variant (binary 10 = RFC 4122).
    payload[8] = (payload[8] & 0x3F) | 0x80
    return str(uuid.UUID(bytes=bytes(payload)))
+ + Args: + file_path: Path to the collision victim YAML file + dry_run: If True, only print what would be done + + Returns: + New file path after renaming, or None if skipped/failed + """ + print(f"\n{'='*80}") + print(f"Processing: {file_path.name}") + print(f"{'='*80}") + + # Read file + try: + with open(file_path) as f: + data = yaml.safe_load(f) + except Exception as e: + print(f" ERROR: Could not read file: {e}") + return None + + if data is None: + print(f" SKIP: File is empty or invalid") + return None + + # Get institution name + org_name = data.get('original_entry', {}).get('organisatie') + if not org_name: + print(f" ERROR: No organisatie found in original_entry") + return None + + print(f" Institution: {org_name}") + + # Get current GHCID info + ghcid_data = data.get('ghcid', {}) + old_ghcid = ghcid_data.get('ghcid_current', '') + print(f" Old GHCID: {old_ghcid}") + + # Extract base GHCID from filename (remove trailing dash) + base_ghcid = file_path.stem.rstrip('-') + print(f" Base GHCID: {base_ghcid}") + + # Generate new name suffix from institution name + name_suffix = generate_name_suffix(org_name) + print(f" Name suffix: {name_suffix}") + + # Create new GHCID + new_ghcid = f"{base_ghcid}-{name_suffix}" + print(f" New GHCID: {new_ghcid}") + + # Check if this would be the same as old (only filename is wrong) + if new_ghcid == old_ghcid: + expected_filename = f"{new_ghcid}.yaml" + if file_path.name != expected_filename: + print(f" GHCID correct, but filename wrong - needs rename only") + if dry_run: + print(f" DRY RUN: Would rename to {expected_filename}") + return None + + new_file_path = file_path.parent / expected_filename + if new_file_path.exists(): + print(f" ERROR: Target file already exists: {new_file_path.name}") + return None + + shutil.move(str(file_path), str(new_file_path)) + print(f" Renamed: {file_path.name} → {new_file_path.name}") + return new_file_path + else: + print(f" SKIP: GHCID and filename both correct") + return None + + # 
Generate new identifiers + new_uuid = generate_ghcid_uuid(new_ghcid) + new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid) + new_numeric = generate_ghcid_numeric(new_ghcid) + + print(f" New UUID: {new_uuid}") + print(f" New numeric: {new_numeric}") + + if dry_run: + print(f" DRY RUN: Would update file and rename to {new_ghcid}.yaml") + return None + + # Update GHCID section + timestamp = datetime.now(timezone.utc).isoformat() + + # Preserve old GHCID in history + ghcid_history = ghcid_data.get('ghcid_history', []) + + # Add history entry for the fix + ghcid_history.append({ + 'ghcid': old_ghcid, + 'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''), + 'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0), + 'valid_from': ghcid_data.get('generated_at', ''), + 'valid_to': timestamp, + 'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'" + }) + + data['ghcid'] = { + 'ghcid_current': new_ghcid, + 'ghcid_uuid': new_uuid, + 'ghcid_uuid_sha256': new_uuid_sha256, + 'ghcid_numeric': new_numeric, + 'generated_at': timestamp, + 'ghcid_history': ghcid_history + } + + # Update identifiers list + identifiers = data.get('identifiers', []) + updated_identifiers = [] + for ident in identifiers: + scheme = ident.get('identifier_scheme', '') + if scheme == 'GHCID': + ident['identifier_value'] = new_ghcid + ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}" + elif scheme == 'GHCID_UUID': + ident['identifier_value'] = new_uuid + elif scheme == 'GHCID_NUMERIC': + ident['identifier_value'] = str(new_numeric) + updated_identifiers.append(ident) + data['identifiers'] = updated_identifiers + + # Write updated data back to file + with open(file_path, 'w') as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + print(f" Updated file content") + + # Rename file to match new GHCID + new_file_path = file_path.parent / f"{new_ghcid}.yaml" + + if new_file_path.exists(): + 
def main():
    """Command-line entry point: fix all (or one) collision victim files.

    Without ``--file``, every ``NL-*-.yaml`` file (trailing-dash pattern) in
    ``data/custodian`` relative to the current working directory is
    processed. Each file is passed to ``fix_collision_victim``; a truthy
    result counts as fixed, and a ``None`` result counts as skipped unless
    the file is empty or can no longer be inspected, which counts as an
    error. A summary is printed at the end.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    if args.file:
        files = [Path(args.file)]
    else:
        # Find all collision victim files (trailing dash pattern)
        files = sorted(custodian_dir.glob('NL-*-.yaml'))

    print(f"Found {len(files)} collision victim file(s)")

    fixed = 0
    skipped = 0
    errors = 0

    for f in files:
        result = fix_collision_victim(f, dry_run=args.dry_run)
        if result:
            fixed += 1
            continue

        # None result: the file was skipped or failed. Inspect it to tell an
        # empty file apart from a regular skip. A missing path (e.g. a
        # mistyped --file) must not crash the run after the read error has
        # already been reported by fix_collision_victim.
        try:
            size = f.stat().st_size
        except OSError:
            errors += 1
            continue

        if size == 0:
            print(f"\n EMPTY FILE: {f.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    print(f"\n{'='*80}")
    print(f"SUMMARY")
    print(f"{'='*80}")
    print(f" Fixed: {fixed}")
    print(f" Skipped: {skipped}")
    print(f" Errors/Empty: {errors}")
def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix a generic platform name and invalid platform types in one file.

    Args:
        filepath: Custodian YAML file to inspect.
        dry_run: If True, compute the changes but do not write the file.

    Returns:
        A stats dict with the keys ``name_fixed``, ``types_fixed``,
        ``old_name``, ``new_name`` and ``removed_types``. All values keep
        their defaults when the file has no ``digital_platform_v2`` mapping
        or nothing needed fixing.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'removed_types': []
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        return stats

    dpv2 = data.get('digital_platform_v2')
    if not isinstance(dpv2, dict):
        # Missing or empty/malformed section: nothing to fix.
        return stats

    modified = False
    name_source = None

    # Fix 1: Generic platform names
    current_name = dpv2.get('platform_name', '')
    if current_name in GENERIC_NAMES:
        # (section, field, recorded source label) in priority order. An empty
        # or missing value falls through to the next candidate instead of
        # blocking the fallback.
        candidates = (
            ('original_entry', 'organisatie', 'organisatie_field'),
            ('museum_register_enrichment', 'museum_name', 'museum_register_museum_name'),
            ('wikidata_enrichment', 'wikidata_label_nl', 'wikidata_label_nl'),
        )
        org_name = None
        for section, field, label in candidates:
            section_data = data.get(section)
            if isinstance(section_data, dict) and section_data.get(field):
                org_name = section_data[field]
                name_source = label
                break

        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name
            modified = True

    # Fix 2: Invalid platform types
    if 'platform_type' in dpv2 and isinstance(dpv2['platform_type'], list):
        original_types = dpv2['platform_type'].copy()
        filtered_types = [t for t in original_types if t not in INVALID_TYPES]

        if len(filtered_types) < len(original_types):
            stats['removed_types'] = [t for t in original_types if t in INVALID_TYPES]
            stats['types_fixed'] = True
            # Never leave the list empty: fall back to the generic website type.
            dpv2['platform_type'] = filtered_types if filtered_types else ['INSTITUTIONAL_WEBSITE']
            modified = True

    # Add fix metadata
    if modified:
        if '_transformation_metadata' not in dpv2:
            dpv2['_transformation_metadata'] = {}
        dpv2['_transformation_metadata']['quality_fix_date'] = datetime.now(timezone.utc).isoformat()
        if stats['name_fixed']:
            # Record where the name actually came from, not a fixed label.
            dpv2['_transformation_metadata']['name_source'] = name_source

        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats
''}Summary:") + print(f" Files with name fixed: {files_fixed_names}") + print(f" Files with types fixed: {files_fixed_types}") + print(f" Total files modified: {total_checked}") + + if args.dry_run: + print() + print("Run without --dry-run to apply changes.") + + +if __name__ == '__main__': + main() diff --git a/scripts/fix_generic_platform_names_fast.py b/scripts/fix_generic_platform_names_fast.py new file mode 100755 index 0000000000..fadaae3f72 --- /dev/null +++ b/scripts/fix_generic_platform_names_fast.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Fast fix for generic platform names - processes only files from stdin or file list. +""" + +import yaml +import sys +from pathlib import Path +from datetime import datetime, timezone + +GENERIC_NAMES = {'Home Website', 'Homepage Website', 'Welkom Website'} +INVALID_TYPES = {'ONLINEMARKETING', 'ONLINEBRANDING', 'ONLINEWEBSITE', 'ONLINE'} + +def fix_file(filepath: Path, dry_run: bool = False) -> dict: + """Fix a single file.""" + stats = {'name_fixed': False, 'types_fixed': False, 'old_name': None, 'new_name': None, 'removed_types': []} + + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + data = yaml.safe_load(content) + + if not data or 'digital_platform_v2' not in data: + return stats + + dpv2 = data['digital_platform_v2'] + modified = False + + # Fix generic names + current_name = dpv2.get('platform_name', '') + if current_name in GENERIC_NAMES: + org_name = None + if 'original_entry' in data and data['original_entry'].get('organisatie'): + org_name = data['original_entry']['organisatie'] + elif 'museum_register_enrichment' in data and data['museum_register_enrichment'].get('museum_name'): + org_name = data['museum_register_enrichment']['museum_name'] + elif 'wikidata_enrichment' in data and data['wikidata_enrichment'].get('wikidata_label_nl'): + org_name = data['wikidata_enrichment']['wikidata_label_nl'] + + if org_name: + new_name = f"{org_name} Website" + stats['old_name'] = current_name 
def main():
    """Fix the files named in a list file or on stdin and print a summary.

    Usage: ``script.py [--dry-run] [FILE_LIST]``. ``FILE_LIST`` is a text
    file with one path per line; when it is omitted, paths are read from
    stdin. The flag and the list file may appear in either order (the
    previous implementation only inspected ``argv[1]``, so
    ``--dry-run list.txt`` ignored the list and waited on stdin).
    Paths that do not exist are reported on stderr and skipped.
    """
    cli_args = sys.argv[1:]
    dry_run = '--dry-run' in cli_args
    # First argument that is not an option flag is the list file.
    positional = [arg for arg in cli_args if not arg.startswith('--')]
    file_list = positional[0] if positional else None

    if file_list:
        with open(file_list, encoding='utf-8') as f:
            files = [Path(line.strip()) for line in f if line.strip()]
    else:
        files = [Path(line.strip()) for line in sys.stdin if line.strip()]

    fixed_names = 0
    fixed_types = 0

    for filepath in files:
        if not filepath.exists():
            print(f"Skipping missing file: {filepath}", file=sys.stderr)
            continue
        stats = fix_file(filepath, dry_run=dry_run)

        if stats['name_fixed'] or stats['types_fixed']:
            if stats['name_fixed']:
                fixed_names += 1
                print(f"✓ {filepath.name}: '{stats['old_name']}' → '{stats['new_name']}'")
            if stats['types_fixed']:
                fixed_types += 1
                print(f" Removed: {stats['removed_types']}")

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Fixed: {fixed_names} names, {fixed_types} type lists")
b/scripts/fix_ghcid_type.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 +""" +Fix GHCID type codes in Dutch custodian files. + +This script corrects GHCID type codes (position 4) for files where the +type was incorrectly assigned. Common corrections: +- U→M: Unknown should be Museum +- U→I: Unknown should be Intangible Heritage +- U→T: Unknown should be Taste/Smell Heritage +- X→I: Mixed should be Intangible Heritage (single type) + +Usage: + # Dry run (preview changes) + python scripts/fix_ghcid_type.py --dry-run + + # Apply fixes + python scripts/fix_ghcid_type.py + + # Process specific correction type only + python scripts/fix_ghcid_type.py --correction U-to-I --dry-run + python scripts/fix_ghcid_type.py --correction U-to-M + + # Process a single file + python scripts/fix_ghcid_type.py --file data/custodian/NL-DR-FRE-U-FCFE.yaml --new-type I + +Author: GLAM Data Quality Team +Date: 2025-12-14 +""" + +import argparse +import hashlib +import shutil +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +import yaml + +# GHCID namespace for UUID v5 generation (same as DNS namespace per project spec) +GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8") + +# Type code corrections: filename pattern -> new type code +# These are determined by analyzing original_entry.type_organisatie in each file +# +# Current U-type breakdown (173 files): +# - 143 files: type_organisatie: museum → should be M +# - 14 files: type_organisatie: intangible_heritage_custodian → should be I +# - 7 files: type_organisatie: unknown → keep as U (correct) +# +# Current X-type files (2 files): +# - Both are intangible_heritage_custodian → should be I (single type, not mixed) +# +TYPE_CORRECTIONS = { + # U→I: Intangible heritage custodians incorrectly marked as Unknown (14 files) + "U-to-I": { + "files": [ + "NL-DR-FRE-U-FCFE.yaml", + "NL-GE-TIE-U-BO.yaml", + "NL-LI-VAL-U-C.yaml", + "NL-NH-AMS-U-C.yaml", + 
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Derive a deterministic version-8 UUID from a GHCID via SHA-256.

    The leading 128 bits of the SHA-256 digest of the UTF-8 encoded GHCID
    form the UUID; the version field is forced to 8 and the variant field
    to the RFC 4122 layout.
    """
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    as_int = int.from_bytes(digest[:16], byteorder='big')

    # Version: bits 76-79 (high nibble of byte 6) := 0b1000.
    as_int = (as_int & ~(0xF << 76)) | (0x8 << 76)
    # Variant: bits 62-63 (top two bits of byte 8) := 0b10.
    as_int = (as_int & ~(0x3 << 62)) | (0x2 << 62)

    return str(uuid.UUID(int=as_int))
def fix_ghcid_type(ghcid: str, old_type: str, new_type: str) -> str:
    """
    Return *ghcid* with its type segment swapped from old_type to new_type.

    A GHCID is a dash-separated string of the form
    CC-RR-CCC-T-ABBREV[-suffix]; the fourth segment (index 3) is the type
    code. Everything else, including any suffix, is preserved.

    Examples:
        NL-DR-FRE-U-FCFE → NL-DR-FRE-I-FCFE
        NL-OV-KAL-X-BW → NL-OV-KAL-I-BW

    Raises:
        ValueError: If the GHCID has fewer than five segments, or its type
            segment is not old_type.
    """
    segments = ghcid.split('-')
    if len(segments) < 5:
        raise ValueError(f"Invalid GHCID format: {ghcid}")

    found_type = segments[3]
    if found_type != old_type:
        raise ValueError(f"Expected type '{old_type}' but found '{found_type}' in GHCID: {ghcid}")

    # Rebuild around the replaced type segment.
    return '-'.join(segments[:3] + [new_type] + segments[4:])
+ """ + try: + with open(file_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + except Exception as e: + print(f" Error reading {file_path}: {e}") + return None + + if not data or 'ghcid' not in data: + print(f" Warning: No ghcid section in {file_path}") + return None + + ghcid_section = data.get('ghcid', {}) + old_ghcid = ghcid_section.get('ghcid_current', '') + + if not old_ghcid: + print(f" Warning: No ghcid_current in {file_path}") + return None + + # Check if the type matches what we expect to fix + parts = old_ghcid.split('-') + if len(parts) < 5: + print(f" Warning: Invalid GHCID format in {file_path}: {old_ghcid}") + return None + + current_type = parts[3] + if current_type != old_type: + print(f" Skipping {file_path}: type is '{current_type}', expected '{old_type}'") + return None + + # Fix the GHCID + try: + new_ghcid = fix_ghcid_type(old_ghcid, old_type, new_type) + except ValueError as e: + print(f" Error: {e}") + return None + + if new_ghcid == old_ghcid: + return None + + # Generate new identifiers + new_uuid_v5 = generate_uuid_v5(new_ghcid) + new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid) + new_numeric = generate_numeric_id(new_ghcid) + timestamp_now = datetime.now(timezone.utc).isoformat() + + change_info = { + 'file': str(file_path), + 'old_ghcid': old_ghcid, + 'new_ghcid': new_ghcid, + 'old_type': old_type, + 'new_type': new_type, + 'old_uuid': ghcid_section.get('ghcid_uuid', ''), + 'new_uuid': new_uuid_v5, + 'old_numeric': ghcid_section.get('ghcid_numeric', 0), + 'new_numeric': new_numeric, + } + + if dry_run: + return change_info + + # Update ghcid section + ghcid_section['ghcid_current'] = new_ghcid + ghcid_section['ghcid_uuid'] = new_uuid_v5 + ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8 + ghcid_section['ghcid_numeric'] = new_numeric + # Keep ghcid_original as-is (for historical reference) + + # Add history entry for the fix + ghcid_history = ghcid_section.get('ghcid_history', []) + + # Add new entry at the beginning + 
new_history_entry = { + 'ghcid': new_ghcid, + 'ghcid_numeric': new_numeric, + 'valid_from': timestamp_now, + 'reason': reason, + } + + # Mark previous entry as superseded + if ghcid_history: + if 'valid_to' not in ghcid_history[0] or ghcid_history[0]['valid_to'] is None: + ghcid_history[0]['valid_to'] = timestamp_now + ghcid_history[0]['superseded_by'] = new_ghcid + + ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history + data['ghcid'] = ghcid_section + + # Update identifiers section + identifiers = data.get('identifiers', []) + for ident in identifiers: + scheme = ident.get('identifier_scheme') + if scheme == 'GHCID': + ident['identifier_value'] = new_ghcid + elif scheme == 'GHCID_UUID': + ident['identifier_value'] = new_uuid_v5 + ident['identifier_url'] = f"urn:uuid:{new_uuid_v5}" + elif scheme == 'GHCID_UUID_SHA256': + ident['identifier_value'] = new_uuid_v8 + ident['identifier_url'] = f"urn:uuid:{new_uuid_v8}" + elif scheme == 'GHCID_NUMERIC': + ident['identifier_value'] = str(new_numeric) + data['identifiers'] = identifiers + + # Also update original_entry.type if present (to keep consistency) + if 'original_entry' in data and 'type' in data['original_entry']: + # Update type list to use new type + current_types = data['original_entry']['type'] + if isinstance(current_types, list): + # Replace old type with new type in the list + data['original_entry']['type'] = [ + new_type if t == old_type else t for t in current_types + ] + + # Write updated file + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False) + + # Rename file to match new GHCID + old_filename = file_path.name + new_filename = f"{new_ghcid}.yaml" + + if old_filename != new_filename: + new_file_path = file_path.parent / new_filename + if new_file_path.exists(): + print(f" Warning: Target file already exists: {new_file_path}") + # Don't rename if target exists + else: + shutil.move(str(file_path), 
def auto_detect_museum_files(custodian_dir: Path) -> list[Path]:
    """
    Auto-detect files where the type should be M (Museum).

    A file qualifies when:
    - its filename carries type code U in the fourth dash-separated segment
    - original_entry.type_organisatie equals "museum" (case-insensitive)

    Files that cannot be read or parsed are reported and skipped rather
    than silently ignored. The result is sorted so runs are reproducible.

    Args:
        custodian_dir: Directory containing the custodian YAML files.

    Returns:
        Sorted list of paths to files that should be re-typed as Museum.
    """
    museum_files = []

    # Candidate Dutch files; the glob alone can also match "-U-" later in
    # the name, so the type segment is checked explicitly below.
    for file_path in sorted(custodian_dir.glob("NL-*-U-*.yaml")):
        name_parts = file_path.stem.split('-')
        if len(name_parts) < 4 or name_parts[3] != 'U':
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, yaml.YAMLError) as e:
            print(f" Warning: Could not read {file_path}: {e}")
            continue

        if not isinstance(data, dict):
            continue

        # Either level may be missing or explicitly null in the YAML.
        orig_entry = data.get('original_entry') or {}
        if not isinstance(orig_entry, dict):
            continue
        type_org = orig_entry.get('type_organisatie')

        if isinstance(type_org, str) and type_org.lower() == 'museum':
            museum_files.append(file_path)

    return museum_files
parser.add_argument( + '--auto-detect-museums', + action='store_true', + help='Auto-detect museum files based on type_organisatie field' + ) + parser.add_argument( + '--custodian-dir', + type=str, + default='data/custodian', + help='Path to custodian directory (default: data/custodian)' + ) + + args = parser.parse_args() + + # Find project root (where data/ directory is) + script_dir = Path(__file__).parent + project_root = script_dir.parent + custodian_dir = project_root / args.custodian_dir + + if not custodian_dir.exists(): + print(f"Error: Custodian directory not found: {custodian_dir}") + return 1 + + print(f"GHCID Type Correction Script") + print(f"{'=' * 50}") + print(f"Mode: {'DRY RUN' if args.dry_run else 'APPLY CHANGES'}") + print(f"Custodian directory: {custodian_dir}") + print() + + all_changes = [] + + # Single file mode + if args.file: + if not args.new_type: + print("Error: --new-type is required when using --file") + return 1 + + file_path = Path(args.file) + if not file_path.is_absolute(): + file_path = project_root / file_path + + if not file_path.exists(): + print(f"Error: File not found: {file_path}") + return 1 + + # Detect old type from filename + parts = file_path.stem.split('-') + if len(parts) >= 4: + old_type = parts[3] + else: + print(f"Error: Cannot determine type from filename: {file_path}") + return 1 + + reason = f"Type corrected: {old_type} → {args.new_type} (manual correction)" + + print(f"Processing single file: {file_path}") + change = process_file(file_path, old_type, args.new_type, reason, args.dry_run) + if change: + all_changes.append(change) + + # Auto-detect museum files + elif args.auto_detect_museums: + print("Auto-detecting museum files...") + museum_files = auto_detect_museum_files(custodian_dir) + print(f"Found {len(museum_files)} museum files with type U") + + # Update the U-to-M correction with detected files + TYPE_CORRECTIONS['U-to-M']['files'] = [f.name for f in museum_files] + + # Process them + correction = 
TYPE_CORRECTIONS['U-to-M'] + for file_path in museum_files: + change = process_file( + file_path, + correction['old_type'], + correction['new_type'], + correction['reason'], + args.dry_run + ) + if change: + all_changes.append(change) + + # Batch mode + else: + corrections_to_apply = [] + + if args.correction == 'all': + corrections_to_apply = list(TYPE_CORRECTIONS.keys()) + else: + corrections_to_apply = [args.correction] + + for correction_key in corrections_to_apply: + correction = TYPE_CORRECTIONS[correction_key] + + if not correction['files']: + print(f"\nSkipping {correction_key}: no files specified") + continue + + print(f"\nProcessing {correction_key}:") + print(f" {correction['old_type']} → {correction['new_type']}") + print(f" Files: {len(correction['files'])}") + + files = find_files_for_correction(custodian_dir, correction_key) + + for file_path in files: + change = process_file( + file_path, + correction['old_type'], + correction['new_type'], + correction['reason'], + args.dry_run + ) + if change: + all_changes.append(change) + + # Summary + print(f"\n{'=' * 50}") + print(f"SUMMARY") + print(f"{'=' * 50}") + + if not all_changes: + print("No changes needed or no matching files found.") + return 0 + + print(f"Total changes: {len(all_changes)}") + print() + + # Group by type change + by_type_change = {} + for change in all_changes: + key = f"{change['old_type']}→{change['new_type']}" + if key not in by_type_change: + by_type_change[key] = [] + by_type_change[key].append(change) + + for key, changes in sorted(by_type_change.items()): + print(f"\n{key}: {len(changes)} files") + for change in changes: + print(f" {change['old_ghcid']} → {change['new_ghcid']}") + if 'new_file' in change: + print(f" Renamed to: {Path(change['new_file']).name}") + + if args.dry_run: + print(f"\n{'=' * 50}") + print("DRY RUN - No files were modified.") + print("Run without --dry-run to apply changes.") + else: + print(f"\n{'=' * 50}") + print(f"Successfully updated 
"""
Fix Simon Kemper contamination in entity profiles.

For entries where:
1. Name is "Simon Kemper"
2. But the LinkedIn slug clearly indicates a different person

We derive the correct name from the slug and update the profile.

IMPORTANT: Per Rule 21 (Data Fabrication Prohibition) - if we cannot reliably
derive the name from the slug, we mark it as "Unknown" rather than guessing.
Compound slugs without hyphens (like "jponjee") cannot be reliably parsed.
"""

import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote

CONTAMINATED_NAME = 'Simon Kemper'
UNKNOWN_NAME = 'Unknown'
DEFAULT_ENTITY_DIR = Path("/Users/kempersc/apps/glam/data/custodian/person/entity")

# Trailing profile ID: separator + >= 6 hex characters. The lookahead demands
# at least one digit so a surname spelled only with a-f (e.g. "-decade") is
# not mistaken for an ID.
_HEX_ID_RE = re.compile(r'[-_](?=[\da-f]*\d)[\da-f]{6,}$')
# Trailing profile ID: separator + >= 5 decimal digits.
_NUMERIC_ID_RE = re.compile(r'[-_]\d{5,}$')
# Slug is the path segment after /in/; a trailing slash, query string or
# fragment is tolerated and never becomes part of the slug.
_SLUG_RE = re.compile(r'/in/([^/?#]+)/?(?:[?#].*)?$')
# Dutch particles that stay lowercase when they are not the first word.
_DUTCH_PARTICLES = frozenset({'van', 'de', 'den', 'der', 'het', 't', "'t"})


def _strip_profile_id(decoded_slug: str) -> str:
    """Remove a trailing hex ID and then a trailing numeric ID from a decoded slug."""
    return _NUMERIC_ID_RE.sub('', _HEX_ID_RE.sub('', decoded_slug))


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, else an empty dict (guards against null JSON values)."""
    return value if isinstance(value, dict) else {}


def _extract_slug(linkedin_url):
    """Return the profile slug from a LinkedIn URL, or None when there is none."""
    if not isinstance(linkedin_url, str):
        return None
    match = _SLUG_RE.search(linkedin_url)
    return match.group(1) if match else None


def is_compound_slug(slug: str) -> bool:
    """Check if slug is a compound name without separators.

    The slug is URL-decoded first; it counts as compound when it contains no
    hyphen at all. A hyphen followed only by a profile ID still counts as a
    separator, matching the documented examples below.

    Returns True for slugs like:
    - 'jponjee' (no hyphens, all lowercase)
    - 'sharellyemanuelson'
    - 'addieroelofsen'
    - 'adheliap'

    Returns False for slugs like:
    - 'willem-blok' (has hyphens)
    - 'jan-van-den-borre' (has hyphens)
    - 'miriam-h' (has hyphens, even if short)
    - 'olivi%C3%AB-7153658' (has hyphens after URL decoding)
    """
    # Even short ones like "jponjee" could be "J. Ponjee" or "J Ponjee",
    # so anything without a separator is treated as unparseable.
    return '-' not in unquote(slug)


def slug_to_name(slug: str) -> tuple[str, bool]:
    """Convert a LinkedIn slug to a human-readable name.

    Returns:
        tuple: (name, is_reliable) where:
        - name: The derived name or "Unknown"
        - is_reliable: True if we're confident in the derivation

    Examples:
        'willem-blok-b6a46648' -> ('Willem Blok', True)
        'dave-van-den-nieuwenhof-4446b3146' -> ('Dave van den Nieuwenhof', True)
        'olivi%C3%AB-7153658' -> ('Olivië', True)
        'jponjee' -> ('Unknown', False)  # Compound slug, cannot parse reliably
        'sharellyemanuelson' -> ('Unknown', False)  # Compound slug
    """
    decoded = unquote(slug)

    # Compound slugs carry no word boundaries, so no name can be derived.
    if '-' not in decoded:
        return (UNKNOWN_NAME, False)

    parts = [part for part in _strip_profile_id(decoded).split('-') if part]
    if not parts:
        return (UNKNOWN_NAME, False)

    name_parts = []
    for index, part in enumerate(parts):
        lowered = part.lower()
        if index > 0 and lowered in _DUTCH_PARTICLES:
            name_parts.append(lowered)
        else:
            # str.capitalize upper-cases the first letter and lower-cases the rest.
            name_parts.append(part.capitalize())

    name = ' '.join(name_parts)

    # Additional validation - name should have at least 2 characters
    if len(name) < 2:
        return (UNKNOWN_NAME, False)

    return (name, True)


def fix_contaminated_files(entity_dir: Path, dry_run: bool = True):
    """Find and fix Simon Kemper contaminated files.

    Only processes files where a name field is ACTUALLY "Simon Kemper" while the
    LinkedIn slug belongs to someone else. Only the contaminated field(s) are
    rewritten; a name field that already holds another value is left untouched.

    Args:
        entity_dir: Directory containing the entity ``*.json`` profiles.
        dry_run: When True (default) nothing is written to disk.

    Returns:
        tuple: (contaminated_list, fixed_list, unreliable_list)
        - contaminated_list: report entries whose name could be derived reliably
        - fixed_list: names of the files rewritten (empty on a dry run)
        - unreliable_list: report entries that are set to "Unknown"
    """
    contaminated = []
    fixed = []
    unreliable = []  # Files where we couldn't reliably derive the name

    for filepath in sorted(entity_dir.glob("*.json")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and undecodable bytes.
            continue

        if not isinstance(data, dict):
            continue

        profile = _as_dict(data.get('profile_data'))
        source = _as_dict(data.get('source_staff_info'))
        profile_name = profile.get('name', '')
        source_name = source.get('name', '')

        # ONLY process files where a name is ACTUALLY "Simon Kemper"
        tainted_sections = [
            section for section in (profile, source)
            if section.get('name') == CONTAMINATED_NAME
        ]
        if not tainted_sections:
            continue

        metadata = _as_dict(data.get('extraction_metadata'))
        slug = _extract_slug(metadata.get('linkedin_url'))
        if slug is None:
            continue

        # Check if this is truly contamination (slug doesn't match simon kemper)
        slug_lower = slug.lower().replace('%', '')
        if 'simonkemper' in slug_lower or 'simon-kemper' in slug_lower:
            # This is the real Simon Kemper, skip
            continue

        # Derive correct name from slug
        correct_name, is_reliable = slug_to_name(slug)

        affiliations = data.get('affiliations')
        first_affiliation = _as_dict(
            affiliations[0] if isinstance(affiliations, list) and affiliations else None
        )

        entry = {
            'file': filepath.name,
            'slug': slug,
            'profile_name': profile_name,
            'source_name': source_name,
            'contaminated_field': 'profile_data.name' if profile_name == CONTAMINATED_NAME else 'source_staff_info.name',
            'correct_name': correct_name,
            'is_reliable': is_reliable,
            # `or ''` keeps the report printable when the JSON holds null.
            'headline': profile.get('headline') or '',
            'custodian': first_affiliation.get('custodian_name') or '',
        }

        if is_reliable:
            contaminated.append(entry)
        else:
            unreliable.append(entry)

        if dry_run:
            continue

        # Fix only the fields that actually carry the contamination.
        for section in tainted_sections:
            section['name'] = correct_name

        # `metadata` is the dict stored in `data` (the slug was read from it),
        # so the fix metadata below lands in the written file.
        timestamp = datetime.now(timezone.utc).isoformat()
        if is_reliable:
            fix_note = f"Name corrected from 'Simon Kemper' (contamination) to '{correct_name}' (derived from slug) on {timestamp}"
        else:
            fix_note = f"Name set to 'Unknown' (was 'Simon Kemper' contamination). Original slug: {slug}. Compound slug cannot be reliably parsed. Fixed on {timestamp}"
            # Also preserve slug in a dedicated field for future reference
            metadata['original_slug'] = slug

        existing_notes = metadata.get('notes', '')
        if existing_notes:
            metadata['notes'] = f"{existing_notes} | {fix_note}"
        else:
            metadata['notes'] = fix_note

        # Write back
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        fixed.append(filepath.name)

    return contaminated, fixed, unreliable


def main():
    """Command-line entry point: report contaminated profiles and, with --fix, repair them."""
    import argparse
    parser = argparse.ArgumentParser(description='Fix Simon Kemper contamination')
    parser.add_argument('--fix', action='store_true', help='Actually fix files (default: dry run)')
    parser.add_argument(
        '--entity-dir',
        type=Path,
        default=DEFAULT_ENTITY_DIR,
        help=f'Directory with entity JSON profiles (default: {DEFAULT_ENTITY_DIR})'
    )
    args = parser.parse_args()

    entity_dir = args.entity_dir

    dry_run = not args.fix
    mode = "DRY RUN" if dry_run else "FIXING"

    print("=" * 80)
    print(f"SIMON KEMPER CONTAMINATION FIX - {mode}")
    print("=" * 80)

    contaminated, fixed, unreliable = fix_contaminated_files(entity_dir, dry_run=dry_run)

    print(f"\n{'='*40}")
    print(f"RELIABLY PARSEABLE ({len(contaminated)} files)")
    print(f"{'='*40}")
    print("These slugs have hyphens and can be reliably converted to names:\n")

    for c in contaminated:
        print(f" File: {c['file']}")
        print(f" Slug: {c['slug']}")
        print(f" Contaminated: {c['contaminated_field']} = 'Simon Kemper'")
        print(f" Correct name: '{c['correct_name']}'")
        headline = c['headline']
        print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
        print(f" Custodian: {c['custodian']}")
        print()

    if unreliable:
        print(f"\n{'='*40}")
        print(f"COMPOUND SLUGS - SET TO 'Unknown' ({len(unreliable)} files)")
        print(f"{'='*40}")
        print("These slugs have no hyphens and cannot be reliably parsed.")
        print("Per Rule 21: Names will be set to 'Unknown' (no hallucination).\n")

        for u in unreliable:
            print(f" File: {u['file']}")
            print(f" Slug: {u['slug']}")
            print(f" Contaminated: {u['contaminated_field']} = 'Simon Kemper'")
            print(f" Will be set to: 'Unknown' (slug preserved in metadata)")
            headline = u['headline']
            print(f" Headline: {headline[:60]}..." if len(headline) > 60 else f" Headline: {headline}")
            print(f" Custodian: {u['custodian']}")
            print()

    print(f"\n{'='*40}")
    print("SUMMARY")
    print(f"{'='*40}")
    print(f" Reliably fixable: {len(contaminated)}")
    print(f" Set to 'Unknown': {len(unreliable)}")
    print(f" Total: {len(contaminated) + len(unreliable)}")

    if not dry_run:
        print(f"\n✅ Fixed {len(fixed)} files")
    else:
        print(f"\n⚠️ DRY RUN - No files modified. Run with --fix to apply changes.")


if __name__ == "__main__":
    main()
keywords - organizations that ARE heritage institutions +# Used to validate that 'D' (Digital) roles are actually at heritage orgs +HERITAGE_ORGANIZATION_KEYWORDS = [ + # Archives + 'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief', + 'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg', + # Museums + 'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis', + 'tropenmuseum', 'allard pierson', 'kröller', 'boijmans', + # Libraries + 'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ', + # Film/AV heritage + 'eye film', 'filmmuseum', 'eye ', 'sound and vision', + # Heritage platforms + 'erfgoed', 'heritage', 'cultural', 'cultureel', + # Research institutes (heritage-focused) + 'knaw', 'humanities cluster', 'meertens', 'huygens', +] + # Lines that indicate LinkedIn UI noise (to skip entirely) NOISE_EXACT = { '0 notifications', 'Search', 'Home', 'My Network', 'Jobs', 'Messaging', @@ -276,16 +332,35 @@ def is_location_line(line: str) -> bool: def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]: """ Detect if a headline is heritage-relevant and what type. + + Two-stage classification: + 1. Check if organization is explicitly non-heritage (blocklist) + 2. Check if role/organization matches heritage patterns + + For 'D' (Digital) type, require BOTH a tech role AND a heritage organization. 
""" headline_lower = headline.lower() - # Check for non-heritage indicators + # Stage 1: Check for non-heritage organizations (blocklist) + for org in NON_HERITAGE_ORGANIZATIONS: + if org.lower() in headline_lower: + return (False, None) + + # Stage 2: Check for non-heritage role indicators for keyword in NON_HERITAGE_KEYWORDS: if keyword.lower() in headline_lower: return (False, None) + # Stage 3: Check if this is a heritage organization + is_heritage_org = False + for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS: + if org_keyword.lower() in headline_lower: + is_heritage_org = True + break + # Check heritage keywords by type (order matters - more specific first) - type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'] + # 'D' (Digital) is checked last and requires heritage org validation + type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from here for heritage_type in type_order: keywords = HERITAGE_KEYWORDS.get(heritage_type, []) @@ -293,7 +368,15 @@ def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]: if keyword.lower() in headline_lower: return (True, heritage_type) - # Generic heritage terms + # Special handling for 'D' (Digital) - ONLY if at a heritage organization + # This prevents generic IT workers from being classified as heritage-relevant + if is_heritage_org: + digital_keywords = HERITAGE_KEYWORDS.get('D', []) + for keyword in digital_keywords: + if keyword.lower() in headline_lower: + return (True, 'D') + + # Generic heritage terms (without specific type) generic_heritage = [ 'heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema', 'media', 'arts', 'kunst', 'creative', diff --git a/scripts/parse_linkedin_html.py b/scripts/parse_linkedin_html.py index cc53577ef8..b11a70987f 100755 --- a/scripts/parse_linkedin_html.py +++ b/scripts/parse_linkedin_html.py @@ -66,6 +66,62 @@ NON_HERITAGE_KEYWORDS = [ 'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse' ] +# 
Organizations that are explicitly NOT heritage institutions +# These should never be classified as heritage-relevant +NON_HERITAGE_ORGANIZATIONS = [ + # Banks & Financial + 'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos', + # Security companies + 'i-sec', 'g4s', 'securitas', 'trigion', 'chubb', + # Police/Government (non-cultural) + 'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie', + # Political parties + 'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt', + 'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ', + # Tech companies (non-heritage) + 'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix', + 'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird', + 'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat', + # Telecom + 'kpn', 'vodafone', 't-mobile', 'ziggo', + # Postal / Logistics + 'postnl', 'postkantoren', 'dhl', 'ups', 'fedex', + # Healthcare + 'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg', + # Retail + 'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action', + # Consulting / Professional services + 'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg', + 'accenture', 'capgemini', 'ordina', 'atos', 'cgi ', + # Recruitment / HR + 'randstad', 'tempo-team', 'manpower', 'hays', 'brunel', + # Energy / Utilities + 'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon', + # Transport + 'ns ', 'prorail', 'schiphol', 'klm', 'transavia', + # Other + 'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf', +] + +# Heritage organization keywords - organizations that ARE heritage institutions +# Used to validate that 'D' (Digital) roles are actually at heritage orgs +HERITAGE_ORGANIZATION_KEYWORDS = [ + # Archives + 'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief', + 'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg', + # Museums + 'museum', 'musea', 'rijksmuseum', 'van gogh', 
'stedelijk', 'mauritshuis', + 'tropenmuseum', 'allard pierson', 'kröller', 'boijmans', + # Libraries + 'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ', + # Film/AV heritage + 'eye film', 'filmmuseum', 'eye ', 'sound and vision', + # Heritage platforms + 'erfgoed', 'heritage', 'cultural', 'cultureel', + # Research institutes (heritage-focused) + 'knaw', 'humanities cluster', 'meertens', 'huygens', +] + # LinkedIn status phrases that pollute name fields (extracted from img alt text) # These should be removed from names and stored as metadata LINKEDIN_STATUS_PHRASES = [ @@ -168,8 +224,8 @@ class LinkedInProfileCardParser(HTMLParser): def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attrs_dict = dict(attrs) - attr_id = attrs_dict.get('id', '') - attr_class = attrs_dict.get('class', '') + attr_id = attrs_dict.get('id') or '' + attr_class = attrs_dict.get('class') or '' # Detect profile card start - can be on tag (regular) OR tag (anonymous) if 'org-people-profile-card__profile-image' in attr_id: @@ -367,28 +423,58 @@ class LinkedInProfileCardParser(HTMLParser): def detect_heritage_type(headline: str) -> tuple[bool, Optional[str]]: - """Detect if a headline is heritage-relevant and what type.""" + """ + Detect if a headline is heritage-relevant and what type. + + Two-stage classification: + 1. Check if organization is explicitly non-heritage (blocklist) + 2. Check if role/organization matches heritage patterns + + For 'D' (Digital) type, require BOTH a tech role AND a heritage organization. + This prevents generic IT workers at banks/police from being classified as heritage. 
+ """ if not headline: return (False, None) headline_lower = headline.lower() - # Check non-heritage first + # Stage 1: Check for non-heritage organizations (blocklist) + for org in NON_HERITAGE_ORGANIZATIONS: + if org.lower() in headline_lower: + return (False, None) + + # Stage 2: Check for non-heritage role indicators for keyword in NON_HERITAGE_KEYWORDS: if keyword.lower() in headline_lower: return (False, None) - # Check heritage keywords by type - type_order = ['A', 'M', 'L', 'O', 'R', 'E', 'D', 'G', 'S', 'C'] + # Stage 3: Check if this is a heritage organization + is_heritage_org = False + for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS: + if org_keyword.lower() in headline_lower: + is_heritage_org = True + break + + # Check heritage keywords by type (order matters - more specific first) + # 'D' (Digital) is checked last and requires heritage org validation + type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from main loop + for heritage_type in type_order: keywords = HERITAGE_KEYWORDS.get(heritage_type, []) for keyword in keywords: if keyword.lower() in headline_lower: return (True, heritage_type) - # Generic heritage terms + # Special handling for 'D' (Digital) - ONLY if at a heritage organization + if is_heritage_org: + digital_keywords = HERITAGE_KEYWORDS.get('D', []) + for keyword in digital_keywords: + if keyword.lower() in headline_lower: + return (True, 'D') + + # Generic heritage terms (without specific type) generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema', - 'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation'] + 'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection'] for keyword in generic: if keyword in headline_lower: return (True, None) diff --git a/scripts/scan_dutch_data_quality.py b/scripts/scan_dutch_data_quality.py new file mode 100644 index 0000000000..64e8b0a7e4 --- /dev/null +++ b/scripts/scan_dutch_data_quality.py @@ -0,0 +1,445 @@ 
# GHCID filename prefix: NL-<2-letter region>-<3-letter place>-<type letter>-
_GHCID_TYPE_RE = re.compile(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-')

# Maps original_entry.type_organisatie (lower-cased) to a GHCID type letter.
_TYPE_ORGANISATIE_TO_CODE = {
    'archive': 'A', 'archief': 'A',
    'library': 'L', 'bibliotheek': 'L',
    'museum': 'M',
    'gallery': 'G', 'galerie': 'G',
}


def extract_ghcid_type(filename):
    """Extract type code from GHCID filename (e.g., NL-ZH-ZOE-A-SAZS -> A).

    Returns None when the filename does not start with the expected prefix.
    """
    match = _GHCID_TYPE_RE.match(filename)
    return match.group(1) if match else None


def get_expected_type(data):
    """Determine expected type from original_entry.

    Resolution order:
    1. The first element of ``original_entry.type`` when it is a non-empty list.
    2. ``original_entry.type_organisatie`` mapped through the known
       organisation-type names (case-insensitive).

    Returns None when neither source yields a type. A missing, null or
    non-mapping ``original_entry`` and a non-string ``type_organisatie`` are
    treated as "no information" instead of raising, so one malformed file
    cannot abort the scan.
    """
    original_entry = data.get('original_entry') if isinstance(data, dict) else None
    if not isinstance(original_entry, dict):
        return None

    types = original_entry.get('type')
    if isinstance(types, list) and types:
        return types[0]

    type_org = original_entry.get('type_organisatie')
    if isinstance(type_org, str) and type_org:
        return _TYPE_ORGANISATIE_TO_CODE.get(type_org.lower())

    return None
# Path prefixes that indicate a machine-specific absolute path.
_ABSOLUTE_PATH_MARKERS = (
    '/Volumes/KINGSTON/',
    '/Users/kempersc/',
    '/mnt/',
    'C:\\',
    'D:\\',
)


def check_absolute_paths(data, filename):
    """Check for absolute paths that should be relative.

    The whole record is serialised to YAML text and searched for each known
    absolute-path prefix.

    Returns:
        A list of the prefixes found (trailing slash/backslash removed), in the
        order they are checked, or None when none occur.
    """
    yaml_str = yaml.dump(data, default_flow_style=False)
    abs_paths = [
        marker.rstrip('/\\')
        for marker in _ABSOLUTE_PATH_MARKERS
        if marker in yaml_str
    ]
    return abs_paths if abs_paths else None


def check_web_claims(data, filename):
    """Check web claims quality.

    Returns a list of issue codes, or None when nothing is wrong:
    - 'no_web_claims': the section is missing (or present but null/not a mapping)
    - 'empty_claims': ``web_claims.claims`` is missing or empty
    - 'no_verified_claims': ``web_claims.verified_claims`` is missing
    - 'claims_missing_xpath:<n>': n verified claims (dict entries) lack 'xpath'
    """
    web_claims = data.get('web_claims') if isinstance(data, dict) else None
    if not isinstance(web_claims, dict):
        # A null section carries no claims at all; report it like a missing one
        # instead of crashing on attribute access.
        return ['no_web_claims']

    issues_found = []

    # Check if claims exist
    if not web_claims.get('claims'):
        issues_found.append('empty_claims')

    # Check for verified_claims
    if 'verified_claims' not in web_claims:
        issues_found.append('no_verified_claims')
    else:
        verified = web_claims['verified_claims']
        if isinstance(verified, dict):
            # `or []` guards against an explicit null claims list.
            verified_claims = verified.get('claims') or []
            # Check for XPath provenance
            claims_without_xpath = sum(
                1 for claim in verified_claims
                if isinstance(claim, dict) and 'xpath' not in claim
            )
            if claims_without_xpath > 0:
                issues_found.append(f'claims_missing_xpath:{claims_without_xpath}')

    return issues_found if issues_found else None
def check_digital_platforms(data, filename):
    """Check for missing digital platforms.

    Returns:
        ['no_digital_platforms'] when the section is missing or null,
        ['empty_digital_platforms'] when it is present but empty,
        None when at least one platform is recorded.
    """
    platforms = data.get('digital_platforms') if isinstance(data, dict) else None
    if platforms is None:
        return ['no_digital_platforms']

    # Present but empty is reported separately from a missing section.
    if not platforms:
        return ['empty_digital_platforms']

    return None


def check_identifiers(data, filename):
    """Check identifier completeness.

    Returns a list of issue codes, or None when both schemes are present:
    - 'no_identifiers': the section is missing or is not a list
    - 'no_isil' / 'no_ghcid': no entry with that ``identifier_scheme``
    """
    identifiers = data.get('identifiers') if isinstance(data, dict) else None
    if not isinstance(identifiers, list):
        # Covers a missing key as well as a null placeholder.
        return ['no_identifiers']

    schemes = {
        ident.get('identifier_scheme')
        for ident in identifiers
        if isinstance(ident, dict)
    }

    issues_found = []
    if 'ISIL' not in schemes:
        issues_found.append('no_isil')
    if 'GHCID' not in schemes:
        issues_found.append('no_ghcid')

    return issues_found if issues_found else None


def check_wikidata(data, filename):
    """Check Wikidata enrichment status.

    Returns:
        'no_wikidata_enrichment' when the section is missing or not a mapping,
        'wikidata_not_found' when status is NOT_FOUND,
        None when status is SUCCESS or ENRICHED,
        'wikidata_status:<status>' for any other (or missing) status.
    """
    enrichment = data.get('wikidata_enrichment') if isinstance(data, dict) else None
    if not isinstance(enrichment, dict):
        return 'no_wikidata_enrichment'

    status = enrichment.get('status', '')

    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    if status in ('SUCCESS', 'ENRICHED'):
        return None
    return f'wikidata_status:{status}'
def scan_file(filepath):
    """Scan one custodian YAML file and collect every data-quality issue found.

    Args:
        filepath: Path object of the YAML file; its ``name`` is passed to the
            individual checkers.

    Returns:
        dict mapping an issue key to the checker's result.
        ``{'parse_error': message}`` is returned when the file cannot be read
        or parsed, or when its top-level value is not a mapping;
        ``{'empty_file': True}`` when the document is empty. An empty dict
        means no issue was detected.
    """
    filename = filepath.name

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return {'parse_error': str(e)}

    if not data:
        return {'empty_file': True}
    if not isinstance(data, dict):
        # The checkers are handed the parsed document and presumably expect a
        # mapping; report the file instead of letting one of them raise and
        # abort the whole scan.
        return {'parse_error': f'top-level YAML value is {type(data).__name__}, expected a mapping'}

    file_issues = {}

    # 1. GHCID type mismatch: only reported when the filename carries the
    #    unknown code 'U' while the data yields a concrete type.
    ghcid_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)
    if ghcid_type == 'U' and expected_type and expected_type != 'U':
        file_issues['wrong_ghcid_type'] = {
            'current': ghcid_type,
            'expected': expected_type
        }

    # Every 'U' file is listed, whether or not a better type is known.
    if ghcid_type == 'U':
        file_issues['unknown_type'] = True

    # 2-9. Remaining checkers, in report order. Each one returns a falsy
    #      value when it has nothing to report.
    checks = (
        ('google_maps_mismatch', check_google_maps_mismatch),
        ('absolute_paths', check_absolute_paths),
        ('web_claims_issues', check_web_claims),
        ('coordinate_issues', check_coordinates),
        ('digital_platform_issues', check_digital_platforms),
        ('identifier_issues', check_identifiers),
        ('wikidata_issue', check_wikidata),
        ('url_issues', check_url),
    )
    for issue_key, check in checks:
        result = check(data, filename)
        if result:
            file_issues[issue_key] = result

    return file_issues


def main():
    """Scan all ``NL-*.yaml`` custodian files, print a report and save it as YAML."""
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)

    # Collect all issues
    all_issues = {}
    issue_counts = defaultdict(int)

    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)

    print(f"Found {total_files} Dutch custodian files\n")

    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)

        file_issues = scan_file(filepath)

        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues:
                issue_counts[issue_type] += 1

    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)

    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)

    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)

    # Most frequent issue first. The loop body only runs when at least one
    # file was scanned, so total_files is never zero in the division.
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])

    for issue_type, count in sorted_issues:
        pct = (count / total_files) * 100
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")

    # Detailed breakdown for critical issues
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)

    # 1. Wrong GHCID type
    wrong_type_files = [(f, d) for f, d in all_issues.items() if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, data in wrong_type_files[:20]:
            info = data['wrong_ghcid_type']
            print(f" {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f" ... and {len(wrong_type_files) - 20} more")
    else:
        print(" None found")

    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items() if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, data in gm_mismatch_files[:20]:
            info = data['google_maps_mismatch']
            print(f" {filename}")
            print(f" Google: {info['google_name']}")
            print(f" Org: {info['org_name']}")
            print(f" Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f" ... and {len(gm_mismatch_files) - 20} more")
    else:
        print(" None found")

    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items() if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, data in abs_path_files[:10]:
            print(f" {filename}: {data['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f" ... and {len(abs_path_files) - 10} more")
    else:
        print(" None found")

    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            print(f" {filename}")
        if len(unknown_type_files) > 30:
            print(f" ... and {len(unknown_type_files) - 30} more")
    else:
        print(" None found")

    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)

    # Web claims issues
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")

    # Digital platforms
    no_platforms = [f for f, d in all_issues.items()
                    if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")

    # Wikidata
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment', 'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")

    # URLs
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")

    # Save detailed report
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    # parents=True so a missing data directory does not lose a finished scan.
    report_file.parent.mkdir(parents=True, exist_ok=True)

    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues
    }

    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)

    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")


if __name__ == '__main__':
    main()
yaml import CSafeLoader as SafeLoader +except ImportError: + from yaml import SafeLoader + +CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") + +def extract_ghcid_type(filename): + match = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename) + return match.group(1) if match else None + +def scan_file_fast(filepath): + """Fast scan using string operations where possible.""" + filename = filepath.name + issues = [] + + try: + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + except Exception as e: + return ['parse_error'] + + # Quick string-based checks first + + # Absolute paths + if '/Volumes/KINGSTON/' in content or '/Users/kempersc/' in content: + issues.append('absolute_paths') + + # No URL + if '\nurl:' not in content and 'url: ' not in content[:500]: + issues.append('no_url') + + # HTTP instead of HTTPS + if 'url: http://' in content: + issues.append('http_not_https') + + # No digital_platforms + if 'digital_platforms:' not in content: + issues.append('no_digital_platforms') + elif 'digital_platforms: []\n' in content or 'digital_platforms:\n-' not in content: + issues.append('empty_digital_platforms') + + # No verified_claims + if 'verified_claims:' not in content: + issues.append('no_verified_claims') + + # Wikidata NOT_FOUND + if "status: NOT_FOUND" in content: + issues.append('wikidata_not_found') + elif 'wikidata_enrichment:' not in content: + issues.append('no_wikidata_enrichment') + + # Unknown type in filename + ghcid_type = extract_ghcid_type(filename) + if ghcid_type == 'U': + issues.append('unknown_type_U') + + # Parse YAML only for complex checks + try: + data = yaml.load(content, Loader=SafeLoader) + except: + issues.append('yaml_parse_error') + return issues + + if not data: + issues.append('empty_file') + return issues + + # Check GHCID type mismatch + if 'original_entry' in data: + oe = data['original_entry'] + expected = None + if 'type' in oe and oe['type'] and isinstance(oe['type'], list): + expected = 
oe['type'][0] + elif 'type_organisatie' in oe and oe['type_organisatie']: + type_map = {'archive': 'A', 'archief': 'A', 'library': 'L', + 'bibliotheek': 'L', 'museum': 'M', 'gallery': 'G'} + expected = type_map.get(oe['type_organisatie'].lower()) + + if expected and ghcid_type and ghcid_type != expected: + issues.append(f'wrong_type:{ghcid_type}→{expected}') + + # Check Google Maps mismatch + if 'google_maps_enrichment' in data and 'original_entry' in data: + gm_name = data['google_maps_enrichment'].get('name', '').lower() + org_name = data['original_entry'].get('organisatie', '').lower() + + if gm_name and org_name: + gm_words = set(gm_name.split()) - {'de', 'het', 'van', 'en', 'stichting'} + org_words = set(org_name.split()) - {'de', 'het', 'van', 'en', 'stichting'} + + if gm_words and org_words: + overlap = len(gm_words & org_words) + similarity = overlap / max(len(gm_words), len(org_words)) + if similarity < 0.25: + issues.append('google_maps_mismatch') + + # Check coordinates + if 'location' in data: + loc = data['location'] + lat = loc.get('latitude') + lon = loc.get('longitude') + if lat is None or lon is None: + issues.append('missing_coordinates') + elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3): + issues.append('coords_outside_NL') + else: + issues.append('no_location') + + return issues + +def main(): + print(f"Fast scan started: {datetime.now().isoformat()}") + + files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml")) + total = len(files) + + print(f"Scanning {total} Dutch custodian files...") + + issue_counts = defaultdict(int) + files_with_issues = defaultdict(list) + + for i, fp in enumerate(files): + issues = scan_file_fast(fp) + for issue in issues: + issue_counts[issue] += 1 + files_with_issues[issue].append(fp.name) + + print(f"\nScan complete: {datetime.now().isoformat()}") + print("\n" + "=" * 80) + print("DATA QUALITY SUMMARY REPORT") + print("=" * 80) + print(f"\nTotal files: {total}") + + # Count files with any issue + all_issue_files = set() + 
for files_list in files_with_issues.values(): + all_issue_files.update(files_list) + + print(f"Files with issues: {len(all_issue_files)} ({100*len(all_issue_files)/total:.1f}%)") + print(f"Clean files: {total - len(all_issue_files)}") + + print("\n" + "-" * 80) + print("ISSUE BREAKDOWN") + print("-" * 80) + + # Sort by count + for issue, count in sorted(issue_counts.items(), key=lambda x: -x[1]): + pct = 100 * count / total + bar = "█" * int(pct / 2) + print(f"{issue:35} {count:5} ({pct:5.1f}%) {bar}") + + # Critical issues detail + print("\n" + "=" * 80) + print("CRITICAL ISSUES (require manual fix)") + print("=" * 80) + + critical_issues = ['wrong_type:', 'google_maps_mismatch', 'absolute_paths', 'unknown_type_U'] + + for critical in critical_issues: + matching = [(k, v) for k, v in files_with_issues.items() if critical in k or k == critical] + if matching: + for issue_key, file_list in matching: + print(f"\n{issue_key} ({len(file_list)} files):") + for f in file_list[:15]: + print(f" - {f}") + if len(file_list) > 15: + print(f" ... 
and {len(file_list) - 15} more") + + # Save report + report_path = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_fast.yaml' + report_path.parent.mkdir(exist_ok=True) + + report = { + 'scan_timestamp': datetime.now().isoformat(), + 'total_files': total, + 'files_with_issues': len(all_issue_files), + 'issue_counts': dict(sorted(issue_counts.items(), key=lambda x: -x[1])), + 'files_by_issue': {k: v for k, v in files_with_issues.items()} + } + + with open(report_path, 'w') as f: + yaml.dump(report, f, default_flow_style=False, allow_unicode=True) + + print(f"\n\nFull report saved: {report_path}") + +if __name__ == '__main__': + main() diff --git a/scripts/transform_crawl4ai_to_digital_platform.py b/scripts/transform_crawl4ai_to_digital_platform.py new file mode 100644 index 0000000000..7d6e77902f --- /dev/null +++ b/scripts/transform_crawl4ai_to_digital_platform.py @@ -0,0 +1,575 @@ +#!/usr/bin/env python3 +""" +Transform crawl4ai_enrichment data into proper digital_platform YAML structure. + +This script processes custodian YAML files that have crawl4ai_enrichment data +and creates/updates the digital_platform block conforming to the LinkML schema. 
import argparse
import logging
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import unquote, urlparse

import yaml

# Configure logging.
# logging.FileHandler opens its file as soon as it is constructed, i.e. at
# import time, so the target directory must exist before basicConfig() runs.
# Creating it only inside main() is too late on a fresh checkout.
Path('logs').mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(
            f'logs/transform_digital_platform_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            encoding='utf-8',
        )
    ]
)
logger = logging.getLogger(__name__)
'platform_types': ['ONLINE_DATABASE'], + 'slot': 'collection_web_addresses', + 'description': 'General collection access' + }, + # Library → collection_web_addresses + 'bibliotheek': { + 'platform_types': ['DIGITAL_LIBRARY'], + 'slot': 'collection_web_addresses', + 'description': 'Library catalog' + }, + # Search interfaces → collection_web_addresses + 'zoeken': { + 'platform_types': ['ONLINE_DATABASE'], + 'slot': 'collection_web_addresses', + 'description': 'Search interface' + }, + # Kranten (newspapers) → collection_web_addresses + 'kranten': { + 'platform_types': ['ONLINE_NEWS_ARCHIVE'], + 'slot': 'collection_web_addresses', + 'description': 'Historical newspapers' + }, +} + +# Mapping for external archive platforms to AuxiliaryDigitalPlatformTypeEnum +EXTERNAL_PLATFORM_MAPPING = { + 'archieven.nl': { + 'platform_name': 'Archieven.nl', + 'auxiliary_platform_type': 'AGGREGATOR', + 'description': 'National Dutch archives aggregator' + }, + 'archiefweb.eu': { + 'platform_name': 'Archiefweb.eu', + 'auxiliary_platform_type': 'ARCHIVAL_REPOSITORY', + 'description': 'Web archiving service' + }, + 'memorix.nl': { + 'platform_name': 'Memorix', + 'auxiliary_platform_type': 'DIGITAL_ARCHIVE', + 'description': 'Heritage information management platform' + }, + 'opendata.archieven.nl': { + 'platform_name': 'Open Data Archieven.nl', + 'auxiliary_platform_type': 'OPEN_DATA_PORTAL', + 'description': 'Open data from Dutch archives' + }, + 'regionaalarchief': { + 'platform_name': 'Regionaal Archief', + 'auxiliary_platform_type': 'ARCHIVES_PORTAL', + 'description': 'Regional archive portal' + }, + 'delpher.nl': { + 'platform_name': 'Delpher', + 'auxiliary_platform_type': 'DIGITAL_LIBRARY', + 'description': 'KB digitized newspapers, books, and periodicals' + }, + 'wiewaswie.nl': { + 'platform_name': 'WieWasWie', + 'auxiliary_platform_type': 'GENEALOGY_DATABASE', + 'description': 'Dutch genealogy database' + }, +} + + +def normalize_url(url: str) -> str: + """Normalize URL by 
decoding and extracting base path.""" + if not url: + return url + + # URL decode + decoded = unquote(url) + + # Parse URL + parsed = urlparse(decoded) + + # Reconstruct without query parameters for deduplication key + base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + + # Remove trailing slash for consistency (except root) + if base_url.endswith('/') and len(parsed.path) > 1: + base_url = base_url[:-1] + + return base_url + + +def extract_base_path_key(url: str) -> str: + """Extract base path for deduplication (without query params).""" + parsed = urlparse(url) + return f"{parsed.netloc}{parsed.path}".rstrip('/') + + +def deduplicate_catalog_urls(catalog_urls: list[dict]) -> list[dict]: + """ + Deduplicate catalog URLs, preferring entries with XPath provenance. + + Strategy: + 1. Group URLs by base path (without query params) + 2. For each group, prefer entries with xpath provenance + 3. Return one representative URL per type per base path + """ + if not catalog_urls: + return [] + + # Group by (base_path, type) + grouped: dict[tuple[str, str], list[dict]] = defaultdict(list) + + for entry in catalog_urls: + url = entry.get('url', '') + url_type = entry.get('type', 'unknown') + base_key = extract_base_path_key(url) + grouped[(base_key, url_type)].append(entry) + + # Select best entry from each group + deduplicated = [] + for (base_key, url_type), entries in grouped.items(): + # Sort: entries with xpath first, then by URL length (shorter preferred) + sorted_entries = sorted( + entries, + key=lambda e: (0 if e.get('xpath') else 1, len(e.get('url', ''))) + ) + best = sorted_entries[0] + + # Normalize the URL + best_copy = best.copy() + best_copy['url'] = normalize_url(best['url']) + deduplicated.append(best_copy) + + return deduplicated + + +def generate_platform_id(ghcid: str) -> str: + """Generate platform_id URI from GHCID.""" + ghcid_lower = ghcid.lower().replace('_', '-') + return f"https://nde.nl/ontology/hc/platform/{ghcid_lower}-website" + + +def 
extract_ghcid_from_file(file_path: Path) -> str | None: + """Extract GHCID from filename.""" + stem = file_path.stem + # GHCID pattern: CC-RR-CCC-T-ABBREV (e.g., NL-DR-ASS-A-DA) + if re.match(r'^[A-Z]{2}-[A-Z]{2,3}-[A-Z]{3}-[A-Z]-', stem): + return stem + return None + + +def determine_platform_types(catalog_urls: list[dict]) -> list[str]: + """ + Determine platform types from detected catalog URLs. + + Returns list of DigitalPlatformTypeEnum values. + """ + types_set = set() + + for entry in catalog_urls: + url_type = entry.get('type', '') + mapping = CATALOG_TYPE_MAPPING.get(url_type, {}) + for pt in mapping.get('platform_types', []): + types_set.add(pt) + + # If we have catalog URLs but no specific types, add generic ONLINE_DATABASE + if catalog_urls and not types_set: + types_set.add('ONLINE_DATABASE') + + # Always include INSTITUTIONAL_WEBSITE as base type + types_set.add('INSTITUTIONAL_WEBSITE') + + return sorted(list(types_set)) + + +def categorize_urls_by_slot(catalog_urls: list[dict]) -> dict[str, list[str]]: + """ + Categorize URLs by target slot (collection_web_addresses vs inventory_web_addresses). + """ + slots = { + 'collection_web_addresses': [], + 'inventory_web_addresses': [] + } + + seen_urls = set() + + for entry in catalog_urls: + url = entry.get('url', '') + if not url or url in seen_urls: + continue + + url_type = entry.get('type', '') + mapping = CATALOG_TYPE_MAPPING.get(url_type, {}) + slot = mapping.get('slot', 'collection_web_addresses') + + slots[slot].append(url) + seen_urls.add(url) + + return slots + + +def transform_external_platforms(external_platforms: list[dict]) -> list[dict]: + """ + Transform external_archive_platforms to auxiliary_platforms structure. 
+ """ + if not external_platforms: + return [] + + auxiliary = [] + seen_platforms = set() + + for entry in external_platforms: + url = entry.get('url', '') + platform_key = entry.get('platform', '') + + if not url or platform_key in seen_platforms: + continue + + # Find mapping + mapping = None + for key, config in EXTERNAL_PLATFORM_MAPPING.items(): + if key in platform_key or key in url: + mapping = config + break + + if not mapping: + # Generic external platform + mapping = { + 'platform_name': platform_key.replace('.', ' ').title() if platform_key else 'External Platform', + 'auxiliary_platform_type': 'WEB_PORTAL', + 'description': 'External heritage platform' + } + + aux_platform = { + 'platform_name': mapping['platform_name'], + 'platform_url': url, + 'auxiliary_platform_type': mapping['auxiliary_platform_type'], + 'platform_purpose': mapping.get('description', '') + } + + auxiliary.append(aux_platform) + seen_platforms.add(platform_key) + + return auxiliary + + +def get_platform_name(data: dict, ghcid: str) -> str: + """ + Determine the best platform name from available data. + + Priority: + 1. custodian_name.emic_name or custodian_name.name + 2. crawl4ai_enrichment.title (cleaned) + 3. 
GHCID-based fallback + """ + # Try custodian_name first + custodian_name = data.get('custodian_name', {}) + if isinstance(custodian_name, dict): + name = custodian_name.get('emic_name') or custodian_name.get('name') + if name: + return f"{name} Website" + + # Try top-level name + if data.get('name'): + return f"{data['name']} Website" + + # Try crawl4ai title + crawl4ai = data.get('crawl4ai_enrichment', {}) + title = crawl4ai.get('title', '') + if title: + # Clean up title (remove common suffixes) + cleaned = re.sub(r'\s*[-–|]\s*.+$', '', title).strip() + if cleaned and len(cleaned) > 3: + return f"{cleaned} Website" + + # Fallback to GHCID + return f"{ghcid} Website" + + +def transform_crawl4ai_to_digital_platform(data: dict, ghcid: str) -> dict | None: + """ + Transform crawl4ai_enrichment into digital_platform structure. + + Args: + data: Full custodian YAML data + ghcid: Global Heritage Custodian Identifier + + Returns: + digital_platform dict or None if no crawl4ai_enrichment + """ + crawl4ai = data.get('crawl4ai_enrichment') + if not crawl4ai: + return None + + # Skip failed fetches - accept 2xx and 3xx status codes + status_code = crawl4ai.get('status_code') + if status_code is None or status_code >= 400: + logger.debug(f"Skipping {ghcid}: HTTP status {status_code}") + return None + + source_url = crawl4ai.get('source_url', '') + if not source_url: + return None + + # Get and deduplicate catalog URLs + catalog_urls = crawl4ai.get('detected_catalog_urls', []) + deduped_catalogs = deduplicate_catalog_urls(catalog_urls) + + # Determine platform types + platform_types = determine_platform_types(deduped_catalogs) + + # Categorize URLs by slot + url_slots = categorize_urls_by_slot(deduped_catalogs) + + # Transform external platforms + external_platforms = crawl4ai.get('external_archive_platforms', []) + auxiliary_platforms = transform_external_platforms(external_platforms) + + # Build digital_platform structure + digital_platform = { + 'platform_id': 
generate_platform_id(ghcid), + 'platform_name': get_platform_name(data, ghcid), + 'homepage_web_address': source_url, + 'refers_to_custodian': f"https://nde.nl/ontology/hc/{ghcid.lower()}" + } + + # Add platform types if we have more than just INSTITUTIONAL_WEBSITE + if platform_types and len(platform_types) > 1: + digital_platform['platform_type'] = platform_types + elif platform_types: + digital_platform['platform_type'] = platform_types + + # Add collection URLs + if url_slots['collection_web_addresses']: + digital_platform['collection_web_addresses'] = url_slots['collection_web_addresses'] + + # Add inventory URLs + if url_slots['inventory_web_addresses']: + digital_platform['inventory_web_addresses'] = url_slots['inventory_web_addresses'] + + # Add auxiliary platforms + if auxiliary_platforms: + digital_platform['auxiliary_platforms'] = auxiliary_platforms + + # Add transformation metadata + digital_platform['_transformation_metadata'] = { + 'source': 'crawl4ai_enrichment', + 'transformation_date': datetime.now(timezone.utc).isoformat(), + 'catalog_urls_original': len(catalog_urls), + 'catalog_urls_deduplicated': len(deduped_catalogs), + 'external_platforms_count': len(external_platforms) + } + + return digital_platform + + +def process_file(file_path: Path, dry_run: bool = False) -> dict: + """ + Process a single custodian YAML file. 
def process_file(file_path: Path, dry_run: bool = False) -> dict:
    """
    Process a single custodian YAML file.

    Adds a ``digital_platform_v2`` block derived from ``crawl4ai_enrichment``
    and, unless *dry_run* is set, writes the file back.

    Args:
        file_path: Custodian YAML file to process.
        dry_run: When True, log what would change and leave the file alone.

    Returns:
        dict with processing statistics; ``status`` is one of 'empty',
        'no_ghcid', 'no_crawl4ai', 'already_transformed', 'transform_failed',
        'transformed', 'would_transform', 'yaml_error' or 'error'.
    """
    stats = {
        'file': str(file_path.name),
        'status': 'skipped',
        'has_crawl4ai': False,
        'has_digital_platform': False,
        'catalog_urls': 0,
        'external_platforms': 0
    }

    try:
        # Read YAML file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            stats['status'] = 'empty'
            return stats

        # Extract GHCID
        ghcid = extract_ghcid_from_file(file_path)
        if not ghcid:
            stats['status'] = 'no_ghcid'
            return stats

        # Check for crawl4ai_enrichment
        crawl4ai = data.get('crawl4ai_enrichment')
        if not crawl4ai:
            stats['status'] = 'no_crawl4ai'
            return stats

        stats['has_crawl4ai'] = True
        # "or []" so a key stored as null counts as zero instead of raising.
        stats['catalog_urls'] = len(crawl4ai.get('detected_catalog_urls') or [])
        stats['external_platforms'] = len(crawl4ai.get('external_archive_platforms') or [])

        # Check if digital_platform_v2 already exists (avoid overwriting)
        if 'digital_platform_v2' in data:
            stats['has_digital_platform'] = True
            stats['status'] = 'already_transformed'
            return stats

        # Transform to digital_platform
        digital_platform = transform_crawl4ai_to_digital_platform(data, ghcid)

        if not digital_platform:
            stats['status'] = 'transform_failed'
            return stats

        # Add to data as digital_platform_v2 (to distinguish from any existing digital_platform)
        data['digital_platform_v2'] = digital_platform

        if dry_run:
            stats['status'] = 'would_transform'
            logger.info(f"[DRY-RUN] Would transform {file_path.name}")
            logger.debug(f" Platform types: {digital_platform.get('platform_type', [])}")
            logger.debug(f" Collection URLs: {len(digital_platform.get('collection_web_addresses', []))}")
            logger.debug(f" Inventory URLs: {len(digital_platform.get('inventory_web_addresses', []))}")
            logger.debug(f" Auxiliary platforms: {len(digital_platform.get('auxiliary_platforms', []))}")
        else:
            # Serialize before opening the file: opening with 'w' truncates
            # it, so a failure inside yaml.dump would otherwise leave an
            # empty or partial custodian file behind.
            serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)
            stats['status'] = 'transformed'

        return stats

    except yaml.YAMLError as e:
        logger.error(f"YAML error in {file_path.name}: {e}")
        stats['status'] = 'yaml_error'
        return stats
    except Exception as e:
        # Best effort: one bad file must not stop the batch.
        logger.error(f"Error processing {file_path.name}: {e}")
        stats['status'] = 'error'
        return stats


def main():
    """Parse command-line options, process the selected files and log a summary."""
    parser = argparse.ArgumentParser(
        description='Transform crawl4ai_enrichment to digital_platform structure'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Process a single file instead of all NL-*.yaml files'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging'
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Ensure logs directory exists
    Path('logs').mkdir(exist_ok=True)

    # Get files to process (path is relative to the current working directory)
    data_dir = Path('data/custodian')

    if args.file:
        if not args.file.exists():
            logger.error(f"File not found: {args.file}")
            sys.exit(1)
        files = [args.file]
    else:
        files = sorted(data_dir.glob('NL-*.yaml'))

    logger.info(f"Processing {len(files)} files...")
    if args.dry_run:
        logger.info("DRY-RUN MODE - no files will be modified")

    # Process files
    stats_summary = defaultdict(int)
    total_catalog_urls = 0
    total_external_platforms = 0

    for i, file_path in enumerate(files):
        if (i + 1) % 100 == 0:
            logger.info(f"Progress: {i + 1}/{len(files)} files processed")

        stats = process_file(file_path, dry_run=args.dry_run)
        stats_summary[stats['status']] += 1
        total_catalog_urls += stats.get('catalog_urls', 0)
        total_external_platforms += stats.get('external_platforms', 0)

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("TRANSFORMATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total files processed: {len(files)}")

    for status, count in sorted(stats_summary.items()):
        logger.info(f" {status}: {count}")

    logger.info(f"\nTotal catalog URLs found: {total_catalog_urls}")
    logger.info(f"Total external platforms found: {total_external_platforms}")

    if args.dry_run:
        logger.info("\n[DRY-RUN] No files were modified. Run without --dry-run to apply changes.")


if __name__ == '__main__':
    main()