#!/usr/bin/env python3
"""
Batch enrichment script for digital_platforms metadata.

This script:
1. Finds entries with websites but no digital_platforms section
2. Extracts digital platform metadata from existing web claims
3. Infers platform type and metadata from available data
4. Adds the digital_platforms section to each entry

Usage:
    python scripts/enrich_digital_platforms.py [--limit N] [--start-index N] [--dry-run]
"""

import argparse
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import yaml

# Directory holding one YAML file per institution.
ENTRIES_DIR = Path("data/nde/enriched/entries")

# Platform type detection patterns: lowercase keyword fragments searched for
# in web-claim values to classify the platform.
PLATFORM_TYPE_PATTERNS = {
    "DIGITAL_ARCHIVE": ["digitaal archief", "digital archive", "online archief", "archiefbank"],
    "DISCOVERY_PORTAL": ["collectie", "collection", "zoeken", "search", "database"],
    "WEBSITE": ["website", "homepage", "info", "contact", "over ons", "about"],
    "WEB_PORTAL": ["portal", "portaal", "platform"],
    "ONLINE_CATALOG": ["catalogus", "catalog", "bibliotheek", "library"],
    "VIRTUAL_MUSEUM": ["virtueel", "virtual", "3d", "rondleiding", "tour"],
    "EDUCATIONAL_PLATFORM": ["educatie", "education", "lesmateriaal", "leren"],
}

# CMS detection patterns: substrings that betray a particular CMS in HTML
# file names or claim values.  NOTE: "Custom CMS" has no patterns, so it is
# never auto-detected; it is kept for completeness of the vocabulary.
CMS_PATTERNS = {
    "WordPress": ["wp-content", "wp-includes", "wordpress"],
    "Drupal": ["drupal", "sites/default", "modules/system"],
    "Joomla": ["joomla", "components/com_"],
    "Custom CMS": [],
}

# Data standard patterns.  NOTE(review): currently unused —
# detect_data_standards relies on extraction_method/claim_type heuristics
# instead; kept as reference vocabulary.
DATA_STANDARD_PATTERNS = {
    "Schema.org": ["schema.org", "itemtype", "itemscope"],
    "Dublin Core": ["dc:", "dcterms:", "dublin core"],
    "Open Graph": ["og:", "og:title", "og:description"],
    "IIIF": ["iiif", "manifest.json", "image-api"],
    "Linked Data": ["application/ld+json", "@context", "rdf"],
}


def load_entry(filepath: Path) -> dict:
    """Load a YAML entry file and return its parsed content."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_entry(filepath: Path, data: dict) -> None:
    """Save a YAML entry file, preserving key insertion order."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def has_website(entry: dict) -> bool:
    """Check if entry has a website URL in any of the known sources.

    Delegates to get_website_url so the two functions can never disagree
    about which fields count as a website.
    """
    return get_website_url(entry) is not None


def has_digital_platforms(entry: dict) -> bool:
    """Check if entry already has a non-empty digital_platforms section."""
    return bool(entry.get('digital_platforms'))


def get_website_url(entry: dict) -> Optional[str]:
    """Extract the website URL from an entry, or None if absent.

    Sources are checked in priority order:
    original_entry > wikidata > google_maps.
    """
    original = entry.get('original_entry', {})
    if original.get('webadres_organisatie'):
        return original['webadres_organisatie']

    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_official_website'):
        return wikidata['wikidata_official_website']

    google = entry.get('google_maps_enrichment', {})
    if google.get('website'):
        return google['website']

    return None


def detect_platform_type(entry: dict) -> str:
    """Detect platform type from institution type, falling back to web claims."""
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '')
    types = original.get('type', [])
    itype = institution_type.lower()  # hoisted: original lowered repeatedly

    # Map institution type to platform type.  The single-letter codes
    # (A/M/L/R) come from the register's 'type' field.
    if 'archive' in itype or 'archief' in itype or 'A' in types:
        return "DISCOVERY_PORTAL"
    if 'museum' in itype or 'M' in types:
        return "WEBSITE"
    if 'library' in itype or 'bibliotheek' in itype or 'L' in types:
        return "ONLINE_CATALOG"
    if 'research' in itype or 'R' in types:
        return "WEB_PORTAL"

    # Fall back to keyword matching against scraped web claims.
    web_claims = entry.get('web_claims', {}).get('claims', [])
    for claim in web_claims:
        claim_value = str(claim.get('claim_value', '')).lower()
        for platform_type, patterns in PLATFORM_TYPE_PATTERNS.items():
            if any(p in claim_value for p in patterns):
                return platform_type

    return "WEBSITE"  # Default


def detect_technology_stack(entry: dict) -> str:
    """Detect the technology stack (CMS, collection system) from web claims."""
    web_claims = entry.get('web_claims', {}).get('claims', [])
    detected_cms: list[str] = []
    for claim in web_claims:
        html_file = str(claim.get('html_file', '')).lower()
        claim_value = str(claim.get('claim_value', '')).lower()
        for cms, patterns in CMS_PATTERNS.items():
            if any(p in html_file or p in claim_value for p in patterns):
                if cms not in detected_cms:
                    detected_cms.append(cms)

    # Check for Atlantis or other known systems recorded in the register's
    # 'systeem' field.
    system = entry.get('original_entry', {}).get('systeem', '')
    if system and system not in detected_cms:
        detected_cms.append(system)

    if detected_cms:
        return ", ".join(detected_cms)
    return "Standard web technology"


def detect_data_standards(entry: dict) -> list:
    """Detect data standards from web claims.

    Returns a sorted list so repeated runs write identical YAML (a plain
    list(set) would be nondeterministically ordered).
    """
    web_claims = entry.get('web_claims', {}).get('claims', [])
    detected_standards: set = set()
    for claim in web_claims:
        extraction_method = str(claim.get('extraction_method', '')).lower()
        claim_type = str(claim.get('claim_type', '')).lower()

        # schema.org markup is typically extracted via JSON-LD parsing.
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected_standards.add("Schema.org")
        # Open Graph claims are prefixed og_ by the extractor.
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected_standards.add("Open Graph")
        # Collection/catalog pages imply descriptive metadata (Dublin Core).
        if 'collection' in claim_type or 'catalog' in claim_type:
            detected_standards.add("Dublin Core")

    if not detected_standards:
        detected_standards.add("HTML5")
    return sorted(detected_standards)


def extract_user_services(entry: dict) -> str:
    """Extract a comma-separated list of user services from web claims and
    institution type."""
    # (keywords searched in claim_type, service label) — checked in order,
    # replicating the original if-chain order so output order is stable.
    rules = [
        (("search",), "Search"),
        (("gallery", "image"), "Image gallery"),
        (("video",), "Video content"),
        (("social",), "Social media integration"),
        (("login", "signup"), "User accounts"),
        (("email", "phone"), "Contact information"),
    ]
    services: list[str] = []

    web_claims = entry.get('web_claims', {}).get('claims', [])
    for claim in web_claims:
        claim_type = str(claim.get('claim_type', '')).lower()
        for keywords, service in rules:
            if any(k in claim_type for k in keywords) and service not in services:
                services.append(service)

    # Add basic services based on institution type.
    original = entry.get('original_entry', {})
    itype = original.get('type_organisatie', '').lower()
    types = original.get('type', [])

    if 'museum' in itype or 'M' in types:
        if "Exhibition information" not in services:
            services.append("Exhibition information")
        if "Visit planning" not in services:
            services.append("Visit planning")

    if 'archive' in itype or 'archief' in itype or 'A' in types:
        if "Collection search" not in services:
            services.append("Collection search")

    if not services:
        services = ["General information", "Contact"]

    return ", ".join(services)


def extract_sustainability_model(entry: dict) -> str:
    """Infer how the institution is funded from Wikidata and register data."""
    wikidata = entry.get('wikidata_enrichment', {})
    claims = wikidata.get('wikidata_claims', {})

    # Wikidata P1454 (legal form) distinguishes foundations ('stichting')
    # from member associations ('vereniging').
    legal_form = claims.get('P1454_legal_form', {})
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            label = value.get('label_nl', value.get('label_en', ''))
            if 'stichting' in label.lower():
                return "Non-profit foundation"
            if 'vereniging' in label.lower():
                return "Membership association"

    # Museums in the national museum register receive government support.
    museum_register = entry.get('original_entry', {}).get('museum_register', '')
    if museum_register == 'ja':
        return "Registered museum (government supported)"

    return "Institutional funding"


def extract_digital_collections(entry: dict) -> str:
    """Build a short description of the entry's digital collections."""
    descriptions = []

    # Check for collection claims in web data.
    web_claims = entry.get('web_claims', {}).get('claims', [])
    for claim in web_claims:
        if claim.get('claim_type') == 'collection_page':
            descriptions.append("Online collection access")
        if claim.get('claim_type') == 'description_short':
            # Use the first short description as a basis, capped at 200 chars.
            if not descriptions:
                descriptions.append(claim.get('claim_value', '')[:200])

    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_description_en'):
        descriptions.append(wikidata['wikidata_description_en'])

    google = entry.get('google_maps_enrichment', {})
    if google.get('editorial_summary'):
        descriptions.append(google['editorial_summary'])

    if descriptions:
        # Deduplicate while preserving order — joining a set() here would
        # make the persisted YAML nondeterministic between runs.
        unique = list(dict.fromkeys(descriptions[:2]))
        return "; ".join(unique)
    return "Organizational website with heritage information"


def get_platform_name(entry: dict) -> str:
    """Derive a display name for the platform from the organization name."""
    original = entry.get('original_entry', {})
    if original.get('organisatie'):
        return f"{original['organisatie']} Website"

    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_label_nl'):
        return f"{wikidata['wikidata_label_nl']} Website"
    if wikidata.get('wikidata_label_en'):
        return f"{wikidata['wikidata_label_en']} Website"

    return "Official Website"


def create_digital_platform(entry: dict) -> Optional[dict]:
    """Create the digital_platforms record for an entry.

    Returns None when the entry has no website URL.
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None

    platform = {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': detect_platform_type(entry),
        'platform_category': ["Organizational website"],
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }

    # Add an additional category based on the detected platform type.
    extra_category = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform['platform_type'])
    if extra_category:
        platform['platform_category'].append(extra_category)

    return platform


def enrich_entry(entry: dict) -> dict:
    """Add a digital_platforms section to *entry* in place; returns the entry."""
    platform = create_digital_platform(entry)
    if platform:
        entry['digital_platforms'] = [platform]
    return entry


def find_entries_to_enrich() -> list[Path]:
    """Find all entry files with a website but no digital_platforms section."""
    entries_to_enrich = []
    for filepath in sorted(ENTRIES_DIR.glob("*.yaml")):
        # Underscore-prefixed files are templates/metadata, not entries.
        if filepath.name.startswith('_'):
            continue
        try:
            entry = load_entry(filepath)
            # Empty YAML files parse to None; skip anything that is not a
            # mapping instead of crashing into the except branch.
            if not isinstance(entry, dict):
                continue
            if has_website(entry) and not has_digital_platforms(entry):
                entries_to_enrich.append(filepath)
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
    return entries_to_enrich


def main():
    """CLI entry point: find, enrich, and (optionally) save entries."""
    parser = argparse.ArgumentParser(description="Enrich entries with digital_platforms metadata")
    parser.add_argument('--limit', type=int, default=None,
                        help="Maximum number of entries to process")
    parser.add_argument('--start-index', type=int, default=0,
                        help="Start index for batch processing")
    parser.add_argument('--dry-run', action='store_true',
                        help="Print changes without writing")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="Verbose output")
    args = parser.parse_args()

    print("Finding entries to enrich...")
    entries = find_entries_to_enrich()
    print(f"Found {len(entries)} entries with websites but no digital_platforms")

    # Apply start index and limit.  Compare against None so `--limit 0`
    # means "process nothing" rather than silently meaning "no limit".
    entries = entries[args.start_index:]
    if args.limit is not None:
        entries = entries[:args.limit]

    print(f"Processing {len(entries)} entries...")
    enriched_count = 0
    for filepath in entries:
        try:
            entry = load_entry(filepath)
            entry = enrich_entry(entry)
            if 'digital_platforms' in entry:
                enriched_count += 1
                if args.verbose:
                    platform = entry['digital_platforms'][0]
                    print(f"\n{filepath.name}:")
                    print(f"  Platform: {platform['platform_name']}")
                    print(f"  URL: {platform['platform_url']}")
                    print(f"  Type: {platform['platform_type']}")
                    print(f"  Tech: {platform['technology_stack']}")
                if not args.dry_run:
                    save_entry(filepath, entry)
        except Exception as e:
            print(f"Error processing {filepath}: {e}")

    print(f"\n{'Would enrich' if args.dry_run else 'Enriched'} {enriched_count} entries")


if __name__ == "__main__":
    main()