#!/usr/bin/env python3
"""
Fast batch enrichment script for digital_platforms metadata.

Reads from a pre-generated list file for speed.

Usage:
    # First generate the list:
    find data/nde/enriched/entries -name "*.yaml" -exec grep -L "digital_platforms:" {} \; > /tmp/entries_to_enrich.txt

    # Then run:
    python scripts/enrich_digital_platforms_fast.py --input /tmp/entries_to_enrich.txt --batch 100
"""

import argparse
import sys
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional


def load_entry(filepath: Path) -> dict:
    """Load a YAML entry file.

    Returns an empty dict for blank/empty files: yaml.safe_load yields
    None there, which would crash every downstream ``.get()`` call.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def save_entry(filepath: Path, data: dict):
    """Save a YAML entry file (block style, unicode preserved, key order kept)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def _section(entry: dict, name: str) -> dict:
    """Return a sub-mapping of *entry*, tolerating keys explicitly set to null.

    ``entry.get(name, {})`` returns None when the YAML contains ``name: null``;
    this helper always yields a dict so callers can chain ``.get()`` safely.
    """
    return entry.get(name) or {}


def _matches_type(original: dict, keywords: tuple, code: str) -> bool:
    """Check whether the original entry matches an institution category.

    True when any of *keywords* occurs in the (lowercased) ``type_organisatie``
    string, or when the single-letter *code* (e.g. 'M' for museum) appears in
    the ``type`` list. Tolerates null/missing fields.
    """
    institution_type = str(original.get('type_organisatie') or '').lower()
    types = original.get('type') or []
    return any(kw in institution_type for kw in keywords) or code in types


def get_website_url(entry: dict) -> Optional[str]:
    """Extract website URL from entry.

    Sources are checked from most explicit to least explicit; the first
    non-empty value wins. Returns None when no source has a URL.
    """
    # Priority: most explicit sources first.
    sources = [
        ('original_entry', 'webadres_organisatie'),
        ('contact', 'website'),
        ('digital_presence', 'website'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    ]
    for section_name, key in sources:
        url = _section(entry, section_name).get(key)
        if url:
            return url
    return None


def detect_platform_type(entry: dict) -> str:
    """Detect platform type from entry data.

    Maps the institution category to a platform-type constant; defaults
    to "WEBSITE" when no category matches.
    """
    original = _section(entry, 'original_entry')
    if _matches_type(original, ('archive', 'archief'), 'A'):
        return "DISCOVERY_PORTAL"
    if _matches_type(original, ('museum',), 'M'):
        return "WEBSITE"
    if _matches_type(original, ('library', 'bibliotheek'), 'L'):
        return "ONLINE_CATALOG"
    if _matches_type(original, ('research',), 'R'):
        return "WEB_PORTAL"
    return "WEBSITE"


def detect_technology_stack(entry: dict) -> str:
    """Detect technology stack from entry.

    Uses the recorded collection-management system (``systeem``) when
    present, otherwise a generic placeholder.
    """
    system = _section(entry, 'original_entry').get('systeem', '')
    return system or "Standard web technology"


def detect_data_standards(entry: dict) -> list:
    """Detect data standards from web claims.

    Scans claim extraction methods for Schema.org / Open Graph markers.
    Returns a sorted list so repeated runs produce identical YAML output
    (a raw ``list(set)`` ordering varies under hash randomization).
    """
    web_claims = _section(entry, 'web_claims').get('claims') or []
    detected_standards = set()
    for claim in web_claims:
        extraction_method = str(claim.get('extraction_method', '')).lower()
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected_standards.add("Schema.org")
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected_standards.add("Open Graph")
    if not detected_standards:
        detected_standards.add("HTML5")
    return sorted(detected_standards)


def extract_user_services(entry: dict) -> str:
    """Extract user services from institution type.

    Returns a comma-separated service list; falls back to a generic pair
    when no category matches.
    """
    original = _section(entry, 'original_entry')
    services = []
    if _matches_type(original, ('museum',), 'M'):
        services.extend(["Exhibition information", "Visit planning"])
    if _matches_type(original, ('archive', 'archief'), 'A'):
        services.append("Collection search")
    if _matches_type(original, ('library', 'bibliotheek'), 'L'):
        services.append("Catalog search")
    if not services:
        services = ["General information", "Contact"]
    return ", ".join(services)


def extract_sustainability_model(entry: dict) -> str:
    """Extract sustainability model from entry data.

    Checks the museum register flag first, then the Wikidata legal form
    (P1454); falls back to "Institutional funding".
    """
    original = _section(entry, 'original_entry')
    if original.get('museum_register') == 'ja':
        return "Registered museum (government supported)"
    claims = _section(entry, 'wikidata_enrichment').get('wikidata_claims') or {}
    legal_form = claims.get('P1454_legal_form') or {}
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            # `or` chaining tolerates explicit null labels, which the
            # two-argument .get() form would pass through and crash on.
            label = (value.get('label_nl') or value.get('label_en') or '').lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"
    return "Institutional funding"


def extract_digital_collections(entry: dict) -> str:
    """Extract description of digital collections.

    Prefers Wikidata descriptions (EN, then NL), then the Google Maps
    editorial summary, then a generic placeholder.
    """
    wikidata = _section(entry, 'wikidata_enrichment')
    if wikidata.get('wikidata_description_en'):
        return wikidata['wikidata_description_en']
    if wikidata.get('wikidata_description_nl'):
        return wikidata['wikidata_description_nl']
    google = _section(entry, 'google_maps_enrichment')
    if google.get('editorial_summary'):
        return google['editorial_summary']
    return "Organizational website with heritage information"


def get_platform_name(entry: dict) -> str:
    """Get the platform name from entry data.

    Uses the organization name, then Wikidata labels (NL, then EN),
    suffixed with " Website"; generic fallback otherwise.
    """
    original = _section(entry, 'original_entry')
    if original.get('organisatie'):
        return f"{original['organisatie']} Website"
    wikidata = _section(entry, 'wikidata_enrichment')
    if wikidata.get('wikidata_label_nl'):
        return f"{wikidata['wikidata_label_nl']} Website"
    if wikidata.get('wikidata_label_en'):
        return f"{wikidata['wikidata_label_en']} Website"
    return "Official Website"


def create_digital_platform(entry: dict) -> Optional[dict]:
    """Create digital_platforms section for an entry.

    Returns None when no website URL can be found (nothing to describe).
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None
    platform_type = detect_platform_type(entry)
    platform = {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': platform_type,
        'platform_category': ["Organizational website"],
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }
    # Extra category per platform type. NOTE(review): detect_platform_type
    # never returns VIRTUAL_MUSEUM today; that branch is kept for
    # forward compatibility — confirm intent.
    extra_category = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if extra_category:
        platform['platform_category'].append(extra_category)
    return platform


def process_entry(filepath: Path) -> tuple[bool, str]:
    """Process a single entry.

    Returns (success, message). Any exception is caught and reported in
    the message so one bad file cannot abort a whole batch.
    """
    try:
        entry = load_entry(filepath)
        # Skip if already has digital_platforms
        if 'digital_platforms' in entry and entry['digital_platforms']:
            return False, "already enriched"
        platform = create_digital_platform(entry)
        if platform:
            entry['digital_platforms'] = [platform]
            save_entry(filepath, entry)
            return True, f"added {platform['platform_type']}"
        else:
            return False, "no website URL"
    except Exception as e:
        # Broad on purpose: batch processing must survive malformed files.
        return False, f"error: {e}"


def main():
    """CLI entry point: read the file list, process one batch, report totals."""
    parser = argparse.ArgumentParser(description="Fast batch enrich entries with digital_platforms")
    parser.add_argument('--input', '-i', type=str, required=True,
                        help="Input file with list of entries")
    parser.add_argument('--batch', '-b', type=int, default=100,
                        help="Batch size to process")
    parser.add_argument('--skip', '-s', type=int, default=0,
                        help="Number of entries to skip")
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't actually write files")
    args = parser.parse_args()

    # Read list of files to process
    with open(args.input, 'r') as f:
        files = [Path(line.strip()) for line in f if line.strip()]
    print(f"Total entries in list: {len(files)}")

    # Apply skip and batch window
    files = files[args.skip:args.skip + args.batch]
    print(f"Processing {len(files)} entries (skip={args.skip}, batch={args.batch})")

    enriched = 0
    skipped = 0
    errors = 0
    for i, filepath in enumerate(files):
        if args.dry_run:
            print(f"[DRY] {filepath.name}")
            continue
        success, msg = process_entry(filepath)
        if success:
            enriched += 1
            if enriched % 10 == 0:
                print(f"Progress: {enriched} enriched, {i+1}/{len(files)}")
        elif "error" in msg:
            errors += 1
            print(f"ERROR {filepath.name}: {msg}")
        else:
            skipped += 1

    print(f"\nDone: {enriched} enriched, {skipped} skipped, {errors} errors")
    print(f"Next batch: --skip {args.skip + args.batch}")


if __name__ == "__main__":
    main()