#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with website data using Exa MCP.

This script extracts detailed information from library websites to populate
LinkML schema fields including:
- Description and mission
- Collections (types, scope, extent)
- Digital platforms (APIs, IIIF, linked data)
- Services and accessibility
- Organizational structure
- Contact information
- Opening hours (from website if available)
- Metadata standards
- Staff/leadership (if publicly listed)

Usage:
    python scripts/enrich_kb_libraries_exa.py [--dry-run] [--limit N]
"""

import os
import sys
import json
import yaml
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
import subprocess
import re

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")

# Rate limiting for Exa API
REQUEST_DELAY = 1.0  # seconds between requests

# Regex fragments shared by the extraction helpers.
_LETTER = r'[^\W\d_]'      # any Unicode letter (word char minus digits and "_")
_HSPACE = r'[^\S\r\n]'     # horizontal whitespace: never crosses a line break

# A phone candidate with fewer digits than this is treated as noise.
_MIN_PHONE_DIGITS = 8

# Service keywords this short are acronym-like and must match as whole words
# (otherwise 'ibl' fires on every occurrence of "bibliotheek").
_SHORT_KEYWORD_MAX_LEN = 4


@dataclass
class WebsiteEnrichment:
    """Container for website-extracted data following LinkML schema."""
    # Basic info
    description: Optional[str] = None
    mission_statement: Optional[str] = None
    history_summary: Optional[str] = None
    founding_year: Optional[str] = None

    # Collections (CustodianCollection fields)
    collections: List[Dict[str, Any]] = field(default_factory=list)
    collection_types: List[str] = field(default_factory=list)
    collection_scope: Optional[str] = None
    collection_extent: Optional[str] = None
    temporal_coverage: Optional[str] = None
    digitization_status: Optional[str] = None

    # Digital Platform fields
    homepage_url: Optional[str] = None
    catalog_url: Optional[str] = None
    api_endpoints: List[str] = field(default_factory=list)
    sparql_endpoint: Optional[str] = None
    oai_pmh_endpoint: Optional[str] = None
    iiif_support: Optional[bool] = None
    linked_data: Optional[bool] = None
    metadata_standards: List[str] = field(default_factory=list)

    # Services
    services: List[str] = field(default_factory=list)
    accessibility_info: Optional[str] = None
    membership_info: Optional[str] = None

    # Contact & Location
    contact_email: Optional[str] = None
    contact_phone: Optional[str] = None
    address: Optional[str] = None
    opening_hours_text: Optional[str] = None

    # Organization
    parent_organization: Optional[str] = None
    organizational_units: List[str] = field(default_factory=list)
    staff_count: Optional[int] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    # Technical
    repository_software: Optional[str] = None
    cms_system: Optional[str] = None
    programming_languages: List[str] = field(default_factory=list)

    # Provenance
    extraction_timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    source_urls: List[str] = field(default_factory=list)
    extraction_method: str = "exa_web_search"
    confidence_score: float = 0.0


def _contains_term(text_lower: str, term: str, whole_word: bool = True) -> bool:
    """Return True when *term* occurs in the already lower-cased *text_lower*.

    With ``whole_word`` set, the term may not be glued to a neighbouring
    letter, so 'ead' matches "EAD" and "EAD3" but not "read" or "leadership".
    Terms ending in punctuation (e.g. 'dc:') are only anchored on the left,
    because a letter is expected right after them ("dc:title").
    """
    if not whole_word:
        return term in text_lower
    pattern = rf'(?<!{_LETTER}){re.escape(term)}'
    if term[-1].isalnum():
        pattern += rf'(?!{_LETTER})'
    return re.search(pattern, text_lower) is not None


def extract_collections_from_text(text: str) -> Dict[str, Any]:
    """Extract collection information from website text.

    Returns a dict with the keys ``collection_types`` (list of detected type
    labels), ``collection_extent`` (the matched phrase, lower-cased, or None),
    ``digitization_status`` ('COMPLETE', 'PARTIAL', 'NOT_DIGITIZED' or None),
    plus ``collection_scope`` and ``temporal_coverage`` which are currently
    never populated and stay None.
    """
    collections_info = {
        'collection_types': [],
        'collection_scope': None,
        'collection_extent': None,
        'temporal_coverage': None,
        'digitization_status': None,
    }

    text_lower = (text or '').lower()

    # Detect collection types. Plain substring matching is intentional here:
    # stems such as 'foto' or 'digital' should also hit inflected forms.
    type_patterns = {
        'books': ['books', 'boeken', 'publications', 'publicaties'],
        'newspapers': ['newspapers', 'kranten', 'nieuws'],
        'magazines': ['magazines', 'tijdschriften', 'periodicals'],
        'manuscripts': ['manuscripts', 'handschriften', 'medieval'],
        'digital_born': ['e-books', 'ebooks', 'digital', 'digitaal', 'websites'],
        'photographs': ['photos', 'photographs', 'foto', 'images'],
        'maps': ['maps', 'kaarten', 'cartography'],
        'music': ['music', 'muziek', 'audio', 'sound'],
        'archives': ['archives', 'archieven', 'records'],
    }

    collections_info['collection_types'] = [
        coll_type
        for coll_type, keywords in type_patterns.items()
        if any(kw in text_lower for kw in keywords)
    ]

    # Extract extent (numbers with units); the first pattern that hits wins.
    extent_patterns = [
        r'(\d[\d,\.]*)\s*(million|miljoen)?\s*(books|items|volumes|objects|pieces)',
        r'(\d[\d,\.]*)\s*(kilometer|km|metres|meter)s?\s*(of|aan)\s*(shelves|shelf|materials)',
        r'over\s+(\d[\d,\.]*)\s*(items|volumes|books)',
        r'meer dan\s+(\d[\d,\.]*)\s*(items|stukken|boeken)',
    ]

    for pattern in extent_patterns:
        match = re.search(pattern, text_lower)
        if match:
            collections_info['collection_extent'] = match.group(0)
            break

    # Detect digitization status; checked from most to least specific.
    status_phrases = [
        ('COMPLETE', ['fully digitized', 'volledig gedigitaliseerd', 'complete digital']),
        ('PARTIAL', ['digitization', 'digitalisering', 'being digitized', 'digital collection']),
        ('NOT_DIGITIZED', ['no digital', 'niet digitaal', 'physical only']),
    ]
    for status, phrases in status_phrases:
        if any(phrase in text_lower for phrase in phrases):
            collections_info['digitization_status'] = status
            break

    return collections_info


def extract_digital_platform_info(text: str, url: Optional[str]) -> Dict[str, Any]:
    """Extract digital platform information from website text.

    Args:
        text: Raw website text to scan.
        url: Homepage URL to record as ``homepage_url`` (empty/None -> None).

    Returns:
        Dict with ``homepage_url``, ``catalog_url``, ``api_endpoints`` (at most
        three distinct URLs), ``sparql_endpoint``, ``oai_pmh_endpoint``,
        ``iiif_support`` / ``linked_data`` (True when detected, else None) and
        ``metadata_standards`` (list of standard names).
    """
    platform_info = {
        'homepage_url': url if url else None,
        'catalog_url': None,
        'api_endpoints': [],
        'sparql_endpoint': None,
        'oai_pmh_endpoint': None,
        'iiif_support': None,
        'linked_data': None,
        'metadata_standards': [],
    }

    text = text or ''
    text_lower = text.lower()

    # Detect IIIF support
    if 'iiif' in text_lower:
        platform_info['iiif_support'] = True

    # Detect linked data
    if any(term in text_lower for term in ['linked data', 'rdf', 'sparql', 'semantic web', 'json-ld']):
        platform_info['linked_data'] = True

    # Detect metadata standards. Whole-word matching is required: as bare
    # substrings 'ead', 'mods' and 'lido' hit ordinary words such as "read".
    standard_patterns = {
        'Dublin Core': ['dublin core', 'dc:', 'dcterms'],
        'MARC21': ['marc21', 'marc 21', 'marc format'],
        'EAD': ['ead', 'encoded archival description'],
        'LIDO': ['lido'],
        'MODS': ['mods', 'metadata object description'],
        'PREMIS': ['premis', 'preservation metadata'],
        'Schema.org': ['schema.org', 'schema:'],
    }

    platform_info['metadata_standards'] = [
        standard
        for standard, terms in standard_patterns.items()
        if any(_contains_term(text_lower, term) for term in terms)
    ]

    # Extract URLs
    url_patterns = {
        'catalog_url': [r'(https?://[^\s<>"]+(?:catalog|catalogue|search|zoeken|collectie)[^\s<>"]*)'],
        'api_endpoints': [r'(https?://[^\s<>"]+(?:api|webservice)[^\s<>"]*)'],
        'sparql_endpoint': [r'(https?://[^\s<>"]+sparql[^\s<>"]*)'],
        'oai_pmh_endpoint': [r'(https?://[^\s<>"]+(?:oai|oai-pmh)[^\s<>"]*)'],
    }

    for field_name, patterns in url_patterns.items():
        for pattern in patterns:
            # The URL regex runs up to the next whitespace, so it swallows
            # sentence punctuation that directly follows a link.
            matches = [
                found.rstrip(".,;:!?)'")
                for found in re.findall(pattern, text, re.IGNORECASE)
            ]
            if not matches:
                continue
            if field_name == 'api_endpoints':
                # Max 3, de-duplicated while preserving first-seen order.
                platform_info[field_name].extend(list(dict.fromkeys(matches))[:3])
            else:
                platform_info[field_name] = matches[0]
            break

    return platform_info


def extract_organization_info(text: str) -> Dict[str, Any]:
    """Extract organizational information from website text.

    Returns a dict with ``parent_organization`` (str or None),
    ``organizational_units`` (up to five names per pattern),
    ``staff_count`` (int or None) and ``leadership`` (always an empty list;
    no leadership extraction is implemented).
    """
    org_info = {
        'parent_organization': None,
        'organizational_units': [],
        'leadership': [],
        'staff_count': None,
    }

    text = text or ''
    text_lower = text.lower()

    # Detect parent organizations. Only the trigger words are matched
    # case-insensitively; the name itself must start with a capital letter,
    # otherwise phrases like "under construction" would be reported.
    parent_patterns = [
        r'\b(?i:part of|onderdeel van|under|onder)\s+(?:(?i:the|de|het)\s+)?([A-Z][^,\.\n]+)',
        # Capture the full ministry name, not just what it is the ministry of.
        r'\b((?i:ministry|ministerie)\s+(?i:of|van)\s+[A-Z][^,\.\n]+)',
    ]

    for pattern in parent_patterns:
        match = re.search(pattern, text)
        if match:
            org_info['parent_organization'] = match.group(1).strip()
            break

    # Detect organizational units/departments
    unit_patterns = [
        r'(?:department|afdeling|team|unit)\s+(?:of|voor)?\s*([A-Za-z\s]+)',
    ]

    for pattern in unit_patterns:
        # The capture group can consist of whitespace only; drop those hits.
        names = [found.strip() for found in re.findall(pattern, text, re.IGNORECASE)]
        org_info['organizational_units'].extend([name for name in names if name][:5])

    # Extract staff count
    staff_patterns = [
        r'(\d+)\s*(?:staff|employees|medewerkers|fte)',
        r'team of\s*(\d+)',
    ]

    for pattern in staff_patterns:
        match = re.search(pattern, text_lower)
        if match:
            # \d+ only matches decimal digits, so int() cannot fail here.
            org_info['staff_count'] = int(match.group(1))
            break

    return org_info


def extract_services_info(text: str) -> Dict[str, Any]:
    """Extract services and accessibility information.

    Returns a dict with ``services`` (list of service labels),
    ``accessibility_info`` and ``membership_info`` (fixed marker strings when
    a related keyword is present, else None).
    """
    services_info = {
        'services': [],
        'accessibility_info': None,
        'membership_info': None,
    }

    text_lower = (text or '').lower()

    # Detect services
    service_keywords = {
        'Reading room': ['reading room', 'leeszaal', 'study room'],
        'Interlibrary loan': ['interlibrary loan', 'ibl', 'interbibliothecair leenverkeer'],
        'Digital access': ['digital access', 'online access', 'remote access'],
        'Research support': ['research support', 'onderzoeksondersteuning', 'reference service'],
        'Exhibitions': ['exhibitions', 'tentoonstellingen', 'displays'],
        'Tours': ['tours', 'rondleidingen', 'guided tours'],
        'Events': ['events', 'evenementen', 'lectures', 'workshops'],
        'Scanning services': ['scanning', 'digitization service', 'reproduction'],
        'Wi-Fi': ['wi-fi', 'wifi', 'internet access'],
        'Copying': ['copying', 'kopiëren', 'printing'],
    }

    services_info['services'] = [
        service
        for service, keywords in service_keywords.items()
        if any(
            _contains_term(
                text_lower, kw, whole_word=len(kw) <= _SHORT_KEYWORD_MAX_LEN
            )
            for kw in keywords
        )
    ]

    # Detect accessibility
    accessibility_keywords = ['wheelchair', 'rolstoel', 'accessible', 'toegankelijk', 'disability', 'handicap']
    if any(kw in text_lower for kw in accessibility_keywords):
        services_info['accessibility_info'] = 'Accessibility features available'

    # Detect membership
    if any(term in text_lower for term in ['membership', 'lidmaatschap', 'member', 'lid worden', 'join']):
        services_info['membership_info'] = 'Membership available'

    return services_info


def extract_contact_info(text: str) -> Dict[str, Any]:
    """Extract contact information from website text.

    Returns a dict with ``contact_email``, ``contact_phone`` and ``address``
    (Dutch postal code followed by the place name on the same line); each is
    None when nothing plausible was found.
    """
    contact_info = {
        'contact_email': None,
        'contact_phone': None,
        'address': None,
    }

    text = text or ''

    # Email pattern
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if email_match:
        contact_info['contact_email'] = email_match.group(0)

    # Dutch phone pattern
    phone_patterns = [
        # Bare number: must not be part of a longer digit run (ISBNs, ids)
        # and must stay on one line.
        rf'(?<!\d)(?:\+31|0031|0){_HSPACE}*(?:\d(?:{_HSPACE}|-)*){{9,10}}(?!\d)',
        # Labelled number: \b keeps "hotel" from matching; only the number
        # itself is captured, not the "tel:" label.
        rf'\btel(?:efoon)?\.?[:\s]+((?:[+\d()\-]|{_HSPACE})+)',
    ]

    for pattern in phone_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            raw = match.group(1) if match.re.groups else match.group(0)
            candidate = raw.strip(' \t\r\n-')
            if sum(ch.isdigit() for ch in candidate) >= _MIN_PHONE_DIGITS:
                contact_info['contact_phone'] = candidate
                break
        if contact_info['contact_phone']:
            break

    # Address pattern (Dutch postal code). The place name is restricted to
    # the same line so that following lines are not swallowed.
    address_match = re.search(
        rf"\b\d{{4}}{_HSPACE}?[A-Z]{{2}}\b{_HSPACE}*"
        rf"(?:['’]s-)?{_LETTER}(?:{_LETTER}|['’ \-])*",
        text,
    )
    if address_match:
        contact_info['address'] = address_match.group(0).strip()

    return contact_info


def call_exa_search(query: str, num_results: int = 3) -> Optional[str]:
    """
    Call Exa MCP tool via subprocess (simulated - in practice this would use the MCP client).
    For now, returns None as we'll use the direct API in the enrichment pipeline.
    """
    # This is a placeholder - the actual Exa search will be done via MCP
    return None


def process_exa_results(results: List[Dict[str, Any]]) -> WebsiteEnrichment:
    """Process Exa search results into WebsiteEnrichment structure.

    Args:
        results: Result dicts; the optional ``url`` and ``text`` keys are read.
            The first non-empty ``url`` becomes the homepage.

    Returns:
        A WebsiteEnrichment populated by the ``extract_*`` helpers, with a
        confidence score equal to the share (0.0-1.0) of seven key facts that
        could be extracted.
    """
    enrichment = WebsiteEnrichment()

    source_urls = []
    texts = []

    for result in results or []:
        url = result.get('url') or ''
        text = result.get('text') or ''

        if url:
            source_urls.append(url)
            # Set homepage from first result
            if not enrichment.homepage_url:
                enrichment.homepage_url = url
        if text:
            texts.append(text)

    enrichment.source_urls = source_urls
    all_text = "\n".join(texts)

    if all_text:
        # Extract description (first 500 chars after cleaning); the ellipsis
        # is only added when something was actually cut off.
        clean_text = re.sub(r'\s+', ' ', all_text).strip()
        if len(clean_text) > 100:
            truncated = len(clean_text) > 500
            enrichment.description = clean_text[:500] + ("..." if truncated else "")

        # Extract collections info
        collections_info = extract_collections_from_text(all_text)
        enrichment.collection_types = collections_info['collection_types']
        enrichment.collection_scope = collections_info['collection_scope']
        enrichment.collection_extent = collections_info['collection_extent']
        enrichment.temporal_coverage = collections_info['temporal_coverage']
        enrichment.digitization_status = collections_info['digitization_status']

        # Extract digital platform info
        platform_info = extract_digital_platform_info(all_text, enrichment.homepage_url)
        enrichment.catalog_url = platform_info['catalog_url']
        enrichment.api_endpoints = platform_info['api_endpoints']
        enrichment.sparql_endpoint = platform_info['sparql_endpoint']
        enrichment.oai_pmh_endpoint = platform_info['oai_pmh_endpoint']
        enrichment.iiif_support = platform_info['iiif_support']
        enrichment.linked_data = platform_info['linked_data']
        enrichment.metadata_standards = platform_info['metadata_standards']

        # Extract organization info
        org_info = extract_organization_info(all_text)
        enrichment.parent_organization = org_info['parent_organization']
        enrichment.organizational_units = org_info['organizational_units']
        enrichment.leadership = org_info['leadership']
        enrichment.staff_count = org_info['staff_count']

        # Extract services info
        services_info = extract_services_info(all_text)
        enrichment.services = services_info['services']
        enrichment.accessibility_info = services_info['accessibility_info']
        enrichment.membership_info = services_info['membership_info']

        # Extract contact info
        contact_info = extract_contact_info(all_text)
        enrichment.contact_email = contact_info['contact_email']
        enrichment.contact_phone = contact_info['contact_phone']
        enrichment.address = contact_info['address']

    # Calculate confidence score based on data extracted
    signals = [
        bool(enrichment.description),
        len(enrichment.collection_types) > 0,
        bool(enrichment.collection_extent),
        bool(enrichment.digitization_status),
        len(enrichment.metadata_standards) > 0,
        len(enrichment.services) > 0,
        bool(enrichment.contact_email or enrichment.contact_phone),
    ]
    enrichment.confidence_score = min(sum(signals) / 7.0, 1.0)

    return enrichment


def load_kb_library_files() -> List[Dict[str, Any]]:
    """Load all KB library YAML files.

    Reads every ``*_kb_isil.yaml`` file in ENTRIES_DIR (sorted by path) and
    tags each entry with the internal ``_filepath`` / ``_filename`` keys.
    Unreadable, unparsable or non-mapping files are logged and skipped.
    """
    entries = []

    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            logger.error(f"Error loading {filepath}: {e}")
            continue

        # An empty file parses to None and a top-level list is not an entry.
        if not isinstance(data, dict):
            logger.warning(f"Skipping {filepath}: expected a YAML mapping, got {type(data).__name__}")
            continue

        data['_filepath'] = str(filepath)
        data['_filename'] = filepath.name
        entries.append(data)

    return entries


def save_entry(entry: Dict[str, Any], filepath: str) -> None:
    """Save entry back to YAML file.

    Keys starting with an underscore are internal bookkeeping and are not
    written out.
    """
    # Remove internal fields before saving
    save_data = {k: v for k, v in entry.items() if not k.startswith('_')}

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(save_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


# Attributes exported by enrichment_to_dict, in output order.
_EXPORTED_FIELDS = (
    # Basic info
    'description', 'mission_statement', 'history_summary', 'founding_year',
    # Collections
    'collection_types', 'collection_scope', 'collection_extent',
    'temporal_coverage', 'digitization_status',
    # Digital Platform
    'homepage_url', 'catalog_url', 'api_endpoints', 'sparql_endpoint',
    'oai_pmh_endpoint', 'iiif_support', 'linked_data', 'metadata_standards',
    # Services
    'services', 'accessibility_info', 'membership_info',
    # Contact
    'contact_email', 'contact_phone', 'address',
    # Organization
    'parent_organization', 'organizational_units', 'staff_count', 'leadership',
    # Technical
    'repository_software', 'cms_system', 'programming_languages',
)

# Tri-state booleans: an explicit False is information and must be kept.
_TRISTATE_FIELDS = frozenset({'iiif_support', 'linked_data'})

# Attributes stored under a different key in the YAML output.
_RENAMED_FIELDS = {'address': 'address_from_website'}


def enrichment_to_dict(enrichment: WebsiteEnrichment) -> Dict[str, Any]:
    """Convert WebsiteEnrichment dataclass to dict for YAML storage.

    The four provenance keys are always present; every other field is only
    included when it holds a value (tri-state booleans: when not None).
    """
    data = {
        'extraction_timestamp': enrichment.extraction_timestamp,
        'extraction_method': enrichment.extraction_method,
        'confidence_score': enrichment.confidence_score,
        'source_urls': enrichment.source_urls,
    }

    # Add non-empty fields
    for attr in _EXPORTED_FIELDS:
        value = getattr(enrichment, attr)
        keep = value is not None if attr in _TRISTATE_FIELDS else bool(value)
        if keep:
            data[_RENAMED_FIELDS.get(attr, attr)] = value

    return data


def main():
    """Main function - this provides structure for MCP-based enrichment."""
    parser = argparse.ArgumentParser(description='Enrich KB libraries with website data via Exa')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries to process')
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("KB Netherlands Libraries - Exa Website Enrichment")
    logger.info("=" * 60)

    # Load entries
    entries = load_kb_library_files()
    logger.info(f"Loaded {len(entries)} KB library entries")

    # "is not None" so that an explicit --limit 0 is honoured.
    if args.limit is not None:
        entries = entries[:args.limit]
        logger.info(f"Limited to {len(entries)} entries")

    # Filter entries that need website enrichment
    needs_enrichment = []
    for entry in entries:
        # Check if already has website enrichment
        if 'website_enrichment' in entry:
            continue

        # Check if we have a website URL to search. "or {}" guards against
        # keys that exist in the YAML but hold an explicit null.
        google_enrichment = entry.get('google_maps_enrichment') or {}
        wikidata_enrichment = entry.get('wikidata_enrichment') or {}
        wikidata_identifiers = wikidata_enrichment.get('wikidata_identifiers') or {}

        website_url = (
            google_enrichment.get('website') or
            wikidata_identifiers.get('Website')
        )

        if website_url:
            entry['_website_url'] = website_url
            needs_enrichment.append(entry)

    logger.info(f"Entries needing website enrichment: {len(needs_enrichment)}")

    if args.dry_run:
        logger.info("DRY RUN - No changes will be made")
        for entry in needs_enrichment[:10]:
            name = (entry.get('original_entry') or {}).get('organisatie', 'Unknown')
            website = entry.get('_website_url', 'No URL')
            logger.info(f"  Would enrich: {name} - {website}")
        return

    # Print guidance for manual MCP-based enrichment
    logger.info("\n" + "=" * 60)
    logger.info("MANUAL ENRICHMENT REQUIRED")
    logger.info("=" * 60)
    logger.info("\nThis script identifies entries needing enrichment.")
    logger.info("Use the Exa MCP tool to search each library's website.")
    logger.info("\nExample search queries:")

    for entry in needs_enrichment[:5]:
        original_entry = entry.get('original_entry') or {}
        name = original_entry.get('organisatie', 'Unknown')
        city = original_entry.get('plaatsnaam_bezoekadres', '')
        website = entry.get('_website_url', '')

        if website:
            domain = website.replace('https://', '').replace('http://', '').split('/')[0]
            logger.info(f"\n  Library: {name}")
            logger.info(f"  Website: {website}")
            logger.info(f"  Query: site:{domain} about collections services contact")

    logger.info("\n" + "=" * 60)
    logger.info(f"Total entries to enrich: {len(needs_enrichment)}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()