- Implemented a Python script to enrich KB Netherlands library YAML entries with website data via Exa MCP. - Extracts collections, digital-platform, services, organization, and contact details from website text. - Identifies entries that still need website enrichment and prints example Exa search queries for manual MCP-based enrichment. - Included error handling for YAML file loading. - Records provenance and a confidence score for each extraction.
618 lines
23 KiB
Python
618 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich KB Netherlands library entries with website data using Exa MCP.
|
|
|
|
This script extracts detailed information from library websites to populate
|
|
LinkML schema fields including:
|
|
- Description and mission
|
|
- Collections (types, scope, extent)
|
|
- Digital platforms (APIs, IIIF, linked data)
|
|
- Services and accessibility
|
|
- Organizational structure
|
|
- Contact information
|
|
- Opening hours (from website if available)
|
|
- Metadata standards
|
|
- Staff/leadership (if publicly listed)
|
|
|
|
Usage:
|
|
python scripts/enrich_kb_libraries_exa.py [--dry-run] [--limit N]
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import yaml
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import argparse
|
|
import subprocess
|
|
import re
|
|
|
|
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
# NOTE(review): absolute, user-specific paths — consider CLI flags or env vars
# so the script can run on other machines.
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")

# Rate limiting for Exa API
REQUEST_DELAY = 1.0  # seconds between requests
|
|
|
|
|
|
@dataclass
class WebsiteEnrichment:
    """Container for website-extracted data following the LinkML schema.

    All content fields default to None/empty so that absence of data is
    distinguishable from an explicit value (e.g. ``iiif_support`` stays
    None when the page never mentions IIIF).
    """

    # Basic info
    description: Optional[str] = None  # first ~500 chars of cleaned page text
    mission_statement: Optional[str] = None
    history_summary: Optional[str] = None
    founding_year: Optional[str] = None

    # Collections (CustodianCollection fields)
    collections: List[Dict[str, Any]] = field(default_factory=list)
    collection_types: List[str] = field(default_factory=list)  # e.g. 'books', 'newspapers'
    collection_scope: Optional[str] = None
    collection_extent: Optional[str] = None  # free-text size, e.g. "7 million books"
    temporal_coverage: Optional[str] = None
    digitization_status: Optional[str] = None  # 'COMPLETE' / 'PARTIAL' / 'NOT_DIGITIZED'

    # Digital Platform fields
    homepage_url: Optional[str] = None
    catalog_url: Optional[str] = None
    api_endpoints: List[str] = field(default_factory=list)
    sparql_endpoint: Optional[str] = None
    oai_pmh_endpoint: Optional[str] = None
    iiif_support: Optional[bool] = None  # None means "unknown", not "no"
    linked_data: Optional[bool] = None  # None means "unknown", not "no"
    metadata_standards: List[str] = field(default_factory=list)

    # Services
    services: List[str] = field(default_factory=list)
    accessibility_info: Optional[str] = None
    membership_info: Optional[str] = None

    # Contact & Location
    contact_email: Optional[str] = None
    contact_phone: Optional[str] = None
    address: Optional[str] = None
    opening_hours_text: Optional[str] = None

    # Organization
    parent_organization: Optional[str] = None
    organizational_units: List[str] = field(default_factory=list)
    staff_count: Optional[int] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    # Technical
    repository_software: Optional[str] = None
    cms_system: Optional[str] = None
    programming_languages: List[str] = field(default_factory=list)

    # Provenance
    extraction_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    source_urls: List[str] = field(default_factory=list)
    extraction_method: str = "exa_web_search"
    confidence_score: float = 0.0  # fraction (0..1) of key fields successfully extracted
|
|
|
|
|
|
def extract_collections_from_text(text: str) -> Dict[str, Any]:
    """Scan raw website text for collection hints.

    Returns a dict with keys 'collection_types', 'collection_scope',
    'collection_extent', 'temporal_coverage' and 'digitization_status'.
    Scope and temporal coverage are currently never populated and always
    come back as None.
    """
    lowered = text.lower()

    # English + Dutch keywords that signal each collection type.
    keyword_map = {
        'books': ['books', 'boeken', 'publications', 'publicaties'],
        'newspapers': ['newspapers', 'kranten', 'nieuws'],
        'magazines': ['magazines', 'tijdschriften', 'periodicals'],
        'manuscripts': ['manuscripts', 'handschriften', 'medieval'],
        'digital_born': ['e-books', 'ebooks', 'digital', 'digitaal', 'websites'],
        'photographs': ['photos', 'photographs', 'foto', 'images'],
        'maps': ['maps', 'kaarten', 'cartography'],
        'music': ['music', 'muziek', 'audio', 'sound'],
        'archives': ['archives', 'archieven', 'records'],
    }
    found_types = [
        name for name, words in keyword_map.items()
        if any(word in lowered for word in words)
    ]

    # Size phrases such as "2 million books" or "meer dan 500 items";
    # first pattern that hits wins.
    extent_patterns = (
        r'(\d[\d,\.]*)\s*(million|miljoen)?\s*(books|items|volumes|objects|pieces)',
        r'(\d[\d,\.]*)\s*(kilometer|km|metres|meter)s?\s*(of|aan)\s*(shelves|shelf|materials)',
        r'over\s+(\d[\d,\.]*)\s*(items|volumes|books)',
        r'meer dan\s+(\d[\d,\.]*)\s*(items|stukken|boeken)',
    )
    extent = None
    for pattern in extent_patterns:
        hit = re.search(pattern, lowered)
        if hit:
            extent = hit.group(0)
            break

    # Digitization status: check the strongest claim first.
    status = None
    if any(p in lowered for p in ('fully digitized', 'volledig gedigitaliseerd', 'complete digital')):
        status = 'COMPLETE'
    elif any(p in lowered for p in ('digitization', 'digitalisering', 'being digitized', 'digital collection')):
        status = 'PARTIAL'
    elif any(p in lowered for p in ('no digital', 'niet digitaal', 'physical only')):
        status = 'NOT_DIGITIZED'

    return {
        'collection_types': found_types,
        'collection_scope': None,
        'collection_extent': extent,
        'temporal_coverage': None,
        'digitization_status': status,
    }
|
|
|
|
|
|
def extract_digital_platform_info(text: str, url: str) -> Dict[str, Any]:
    """Extract digital platform information from website text.

    Args:
        text: Raw page text harvested from the library's website.
        url: The page URL, used as the homepage value (may be empty).

    Returns:
        Dict with homepage/catalog/API/SPARQL/OAI-PMH URLs, IIIF and
        linked-data flags (None when undetected), and detected metadata
        standard names.
    """
    platform_info = {
        'homepage_url': url if url else None,
        'catalog_url': None,
        'api_endpoints': [],
        'sparql_endpoint': None,
        'oai_pmh_endpoint': None,
        'iiif_support': None,
        'linked_data': None,
        'metadata_standards': [],
    }

    text_lower = text.lower()

    # Detect IIIF support: any mention counts (flag stays None when absent).
    if 'iiif' in text_lower:
        platform_info['iiif_support'] = True

    # Detect linked data via technology-name mentions.
    if any(term in text_lower for term in ['linked data', 'rdf', 'sparql', 'semantic web', 'json-ld']):
        platform_info['linked_data'] = True

    # Detect metadata standards, keyed by canonical name.
    # NOTE(review): short tokens like 'ead' and 'mods' can match inside
    # unrelated words (e.g. "instead") — consider word-boundary regexes.
    standard_patterns = {
        'Dublin Core': ['dublin core', 'dc:', 'dcterms'],
        'MARC21': ['marc21', 'marc 21', 'marc format'],
        'EAD': ['ead', 'encoded archival description'],
        'LIDO': ['lido'],
        'MODS': ['mods', 'metadata object description'],
        'PREMIS': ['premis', 'preservation metadata'],
        'Schema.org': ['schema.org', 'schema:'],
    }
    for standard, patterns in standard_patterns.items():
        if any(p in text_lower for p in patterns):
            platform_info['metadata_standards'].append(standard)

    # Extract URLs per target key. The loop variable is named 'key' (was
    # 'field') so it no longer shadows the module-level dataclasses.field
    # import.
    url_patterns = {
        'catalog_url': [r'(https?://[^\s<>"]+(?:catalog|catalogue|search|zoeken|collectie)[^\s<>"]*)'],
        'api_endpoints': [r'(https?://[^\s<>"]+(?:api|webservice)[^\s<>"]*)'],
        'sparql_endpoint': [r'(https?://[^\s<>"]+sparql[^\s<>"]*)'],
        'oai_pmh_endpoint': [r'(https?://[^\s<>"]+(?:oai|oai-pmh)[^\s<>"]*)'],
    }
    for key, patterns in url_patterns.items():
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                if key == 'api_endpoints':
                    platform_info[key].extend(matches[:3])  # cap at 3 endpoints
                else:
                    platform_info[key] = matches[0]
                break

    return platform_info
|
|
|
|
|
|
def extract_organization_info(text: str) -> Dict[str, Any]:
    """Pull organizational facts (parent org, units, staff count) from website text.

    'leadership' is returned for schema compatibility but is never
    populated by this extractor.
    """
    result = {
        'parent_organization': None,
        'organizational_units': [],
        'leadership': [],
        'staff_count': None,
    }

    lowered = text.lower()

    # Parent organization: "part of X" / "onderdeel van X" style phrases.
    for pattern in (
        r'(?:part of|onderdeel van|under|onder)\s+(?:the\s+)?([A-Z][^,\.\n]+)',
        r'(?:ministry|ministerie)\s+(?:of|van)\s+([A-Z][^,\.\n]+)',
    ):
        hit = re.search(pattern, text, re.IGNORECASE)
        if hit:
            result['parent_organization'] = hit.group(1).strip()
            break

    # Departments / units: keep at most five per pattern.
    for pattern in (r'(?:department|afdeling|team|unit)\s+(?:of|voor)?\s*([A-Za-z\s]+)',):
        found = re.findall(pattern, text, re.IGNORECASE)
        result['organizational_units'].extend(unit.strip() for unit in found[:5])

    # Staff count: the first numeric mention wins.
    for pattern in (r'(\d+)\s*(?:staff|employees|medewerkers|fte)', r'team of\s*(\d+)'):
        hit = re.search(pattern, lowered)
        if hit:
            try:
                result['staff_count'] = int(hit.group(1))
            except ValueError:
                pass
            break

    return result
|
|
|
|
|
|
def extract_services_info(text: str) -> Dict[str, Any]:
    """Detect offered services plus accessibility and membership hints in website text."""
    lowered = text.lower()

    # Service label -> substring triggers (English + Dutch).
    service_keywords = {
        'Reading room': ['reading room', 'leeszaal', 'study room'],
        'Interlibrary loan': ['interlibrary loan', 'ibl', 'interbibliothecair leenverkeer'],
        'Digital access': ['digital access', 'online access', 'remote access'],
        'Research support': ['research support', 'onderzoeksondersteuning', 'reference service'],
        'Exhibitions': ['exhibitions', 'tentoonstellingen', 'displays'],
        'Tours': ['tours', 'rondleidingen', 'guided tours'],
        'Events': ['events', 'evenementen', 'lectures', 'workshops'],
        'Scanning services': ['scanning', 'digitization service', 'reproduction'],
        'Wi-Fi': ['wi-fi', 'wifi', 'internet access'],
        'Copying': ['copying', 'kopiëren', 'printing'],
    }
    offered = [
        label for label, triggers in service_keywords.items()
        if any(trigger in lowered for trigger in triggers)
    ]

    # Accessibility / membership: presence of any keyword sets a fixed summary string.
    has_accessibility = any(
        kw in lowered
        for kw in ('wheelchair', 'rolstoel', 'accessible', 'toegankelijk', 'disability', 'handicap')
    )
    has_membership = any(
        kw in lowered
        for kw in ('membership', 'lidmaatschap', 'member', 'lid worden', 'join')
    )

    return {
        'services': offered,
        'accessibility_info': 'Accessibility features available' if has_accessibility else None,
        'membership_info': 'Membership available' if has_membership else None,
    }
|
|
|
|
|
|
def extract_contact_info(text: str) -> Dict[str, Any]:
    """Find an email address, phone number and Dutch postal address in website text."""
    details = {'contact_email': None, 'contact_phone': None, 'address': None}

    # First plausible e-mail address on the page.
    email = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if email is not None:
        details['contact_email'] = email.group(0)

    # Dutch phone numbers: +31/0031/0 prefix, or an explicit "tel:" label.
    for pattern in (
        r'(?:\+31|0031|0)\s*(?:\d[\s-]*){9,10}',
        r'tel(?:efoon)?[:\s]+([+\d\s\-()]+)',
    ):
        phone = re.search(pattern, text, re.IGNORECASE)
        if phone is not None:
            details['contact_phone'] = phone.group(0).strip()
            break

    # Dutch postal code (e.g. "2595 BE") followed by a place name.
    postal = re.search(r'(\d{4}\s*[A-Z]{2})\s*([A-Za-z\s]+)', text)
    if postal is not None:
        details['address'] = postal.group(0)

    return details
|
|
|
|
|
|
def call_exa_search(query: str, num_results: int = 3) -> Optional[str]:
    """Placeholder for an Exa MCP search call.

    The real search is performed via the MCP client in the enrichment
    pipeline; this stub ignores its arguments and always returns None.
    """
    return None
|
|
|
|
|
|
def process_exa_results(results: List[Dict[str, Any]]) -> WebsiteEnrichment:
    """Fold a list of Exa search results into a WebsiteEnrichment.

    Each result is expected to be a dict with 'url' and 'text' keys
    (missing keys are tolerated). All result texts are concatenated and
    run through the field extractors; the first result with a URL
    supplies the homepage. The unused 'title' lookup from the original
    implementation has been removed.

    Args:
        results: Exa search result dicts.

    Returns:
        A populated WebsiteEnrichment with confidence_score in [0, 1],
        computed as the fraction of seven key field groups extracted.
    """
    enrichment = WebsiteEnrichment()

    all_text = ""
    source_urls = []

    for result in results:
        url = result.get('url', '')
        text = result.get('text', '')

        if url:
            source_urls.append(url)
        if text:
            all_text += f"\n{text}"

        # Set homepage from the first result that carries a URL.
        if not enrichment.homepage_url and url:
            enrichment.homepage_url = url

    enrichment.source_urls = source_urls

    if all_text:
        # Description: first 500 chars of whitespace-normalized text,
        # only when there is enough material to be meaningful.
        clean_text = re.sub(r'\s+', ' ', all_text).strip()
        if len(clean_text) > 100:
            enrichment.description = clean_text[:500] + "..."

    # Extract collections info
    collections_info = extract_collections_from_text(all_text)
    enrichment.collection_types = collections_info['collection_types']
    enrichment.collection_scope = collections_info['collection_scope']
    enrichment.collection_extent = collections_info['collection_extent']
    enrichment.temporal_coverage = collections_info['temporal_coverage']
    enrichment.digitization_status = collections_info['digitization_status']

    # Extract digital platform info
    platform_info = extract_digital_platform_info(all_text, enrichment.homepage_url)
    enrichment.catalog_url = platform_info['catalog_url']
    enrichment.api_endpoints = platform_info['api_endpoints']
    enrichment.sparql_endpoint = platform_info['sparql_endpoint']
    enrichment.oai_pmh_endpoint = platform_info['oai_pmh_endpoint']
    enrichment.iiif_support = platform_info['iiif_support']
    enrichment.linked_data = platform_info['linked_data']
    enrichment.metadata_standards = platform_info['metadata_standards']

    # Extract organization info
    org_info = extract_organization_info(all_text)
    enrichment.parent_organization = org_info['parent_organization']
    enrichment.organizational_units = org_info['organizational_units']
    enrichment.leadership = org_info['leadership']
    enrichment.staff_count = org_info['staff_count']

    # Extract services info
    services_info = extract_services_info(all_text)
    enrichment.services = services_info['services']
    enrichment.accessibility_info = services_info['accessibility_info']
    enrichment.membership_info = services_info['membership_info']

    # Extract contact info
    contact_info = extract_contact_info(all_text)
    enrichment.contact_email = contact_info['contact_email']
    enrichment.contact_phone = contact_info['contact_phone']
    enrichment.address = contact_info['address']

    # Confidence: fraction of the seven key field groups that were filled.
    extracted_fields = sum([
        bool(enrichment.description),
        len(enrichment.collection_types) > 0,
        bool(enrichment.collection_extent),
        bool(enrichment.digitization_status),
        len(enrichment.metadata_standards) > 0,
        len(enrichment.services) > 0,
        bool(enrichment.contact_email or enrichment.contact_phone),
    ])
    enrichment.confidence_score = min(extracted_fields / 7.0, 1.0)

    return enrichment
|
|
|
|
|
|
def load_kb_library_files() -> List[Dict[str, Any]]:
    """Load all KB library YAML entries from ENTRIES_DIR.

    Files matching ``*_kb_isil.yaml`` are parsed with ``yaml.safe_load``.
    Each entry dict gets internal ``_filepath`` / ``_filename`` keys so it
    can be saved back later. Empty or non-mapping YAML files are skipped
    with a warning instead of surfacing as a generic load error.

    Returns:
        List of entry dicts, in sorted filename order.
    """
    entries = []

    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if not isinstance(data, dict):
                # safe_load returns None for empty files and scalars for
                # malformed ones; item assignment would raise TypeError.
                logger.warning(f"Skipping {filepath}: not a YAML mapping")
                continue
            data['_filepath'] = str(filepath)
            data['_filename'] = filepath.name
            entries.append(data)
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")

    return entries
|
|
|
|
|
|
def save_entry(entry: Dict[str, Any], filepath: str):
    """Write an entry back to its YAML file, dropping internal '_'-prefixed keys."""
    # Internal bookkeeping keys (added at load time) must not be persisted.
    persistable = {key: value for key, value in entry.items() if not key.startswith('_')}

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(persistable, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
|
|
def enrichment_to_dict(enrichment: 'WebsiteEnrichment') -> Dict[str, Any]:
    """Convert a WebsiteEnrichment to a dict for YAML storage.

    Provenance fields are always included; content fields only when they
    carry data. ``iiif_support`` and ``linked_data`` are kept even when
    explicitly False (only None is dropped), and the website-derived
    address is stored under ``address_from_website`` so it cannot clobber
    address data from other sources. Insertion order is preserved so the
    YAML output (``sort_keys=False``) stays stable.

    Args:
        enrichment: The extraction result to serialize.

    Returns:
        Ordered dict ready for ``yaml.dump``.
    """

    def _present(value: Any) -> bool:
        """Truthiness test used for most fields (drops None, '', [], 0)."""
        return bool(value)

    def _explicit(value: Any) -> bool:
        """Keep explicit booleans, including False; drop only None."""
        return value is not None

    # Provenance is always recorded.
    data = {
        'extraction_timestamp': enrichment.extraction_timestamp,
        'extraction_method': enrichment.extraction_method,
        'confidence_score': enrichment.confidence_score,
        'source_urls': enrichment.source_urls,
    }

    # (attribute, output key, include-predicate) in output order.
    field_specs = (
        # Basic info
        ('description', 'description', _present),
        ('mission_statement', 'mission_statement', _present),
        ('history_summary', 'history_summary', _present),
        ('founding_year', 'founding_year', _present),
        # Collections
        ('collection_types', 'collection_types', _present),
        ('collection_scope', 'collection_scope', _present),
        ('collection_extent', 'collection_extent', _present),
        ('temporal_coverage', 'temporal_coverage', _present),
        ('digitization_status', 'digitization_status', _present),
        # Digital Platform
        ('homepage_url', 'homepage_url', _present),
        ('catalog_url', 'catalog_url', _present),
        ('api_endpoints', 'api_endpoints', _present),
        ('sparql_endpoint', 'sparql_endpoint', _present),
        ('oai_pmh_endpoint', 'oai_pmh_endpoint', _present),
        ('iiif_support', 'iiif_support', _explicit),
        ('linked_data', 'linked_data', _explicit),
        ('metadata_standards', 'metadata_standards', _present),
        # Services
        ('services', 'services', _present),
        ('accessibility_info', 'accessibility_info', _present),
        ('membership_info', 'membership_info', _present),
        # Contact (address stored under a website-specific key)
        ('contact_email', 'contact_email', _present),
        ('contact_phone', 'contact_phone', _present),
        ('address', 'address_from_website', _present),
        # Organization
        ('parent_organization', 'parent_organization', _present),
        ('organizational_units', 'organizational_units', _present),
        ('staff_count', 'staff_count', _present),
        ('leadership', 'leadership', _present),
        # Technical
        ('repository_software', 'repository_software', _present),
        ('cms_system', 'cms_system', _present),
        ('programming_languages', 'programming_languages', _present),
    )

    for attr, key, keep in field_specs:
        value = getattr(enrichment, attr)
        if keep(value):
            data[key] = value

    return data
|
|
|
|
|
|
def main():
    """Main function - this provides structure for MCP-based enrichment.

    Loads KB library entries, selects those that lack 'website_enrichment'
    but have a website URL (from Google Maps or Wikidata enrichment), and
    either previews the candidates (--dry-run) or prints example Exa
    search queries for manual MCP-based enrichment. The unused 'city'
    lookup from the original query loop has been removed.
    """
    parser = argparse.ArgumentParser(description='Enrich KB libraries with website data via Exa')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries to process')
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("KB Netherlands Libraries - Exa Website Enrichment")
    logger.info("=" * 60)

    # Load entries
    entries = load_kb_library_files()
    logger.info(f"Loaded {len(entries)} KB library entries")

    if args.limit:
        entries = entries[:args.limit]
        logger.info(f"Limited to {len(entries)} entries")

    # Filter entries that need website enrichment
    needs_enrichment = []
    for entry in entries:
        # Check if already has website enrichment
        if 'website_enrichment' not in entry:
            # Check if we have a website URL to search
            google_enrichment = entry.get('google_maps_enrichment', {})
            wikidata_enrichment = entry.get('wikidata_enrichment', {})

            # Prefer the Google Maps website; fall back to Wikidata's.
            website_url = (
                google_enrichment.get('website') or
                wikidata_enrichment.get('wikidata_identifiers', {}).get('Website')
            )

            if website_url:
                entry['_website_url'] = website_url
                needs_enrichment.append(entry)

    logger.info(f"Entries needing website enrichment: {len(needs_enrichment)}")

    if args.dry_run:
        logger.info("DRY RUN - No changes will be made")
        # Preview only the first 10 candidates.
        for entry in needs_enrichment[:10]:
            name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
            website = entry.get('_website_url', 'No URL')
            logger.info(f" Would enrich: {name} - {website}")
        return

    # Print guidance for manual MCP-based enrichment
    logger.info("\n" + "=" * 60)
    logger.info("MANUAL ENRICHMENT REQUIRED")
    logger.info("=" * 60)
    logger.info("\nThis script identifies entries needing enrichment.")
    logger.info("Use the Exa MCP tool to search each library's website.")
    logger.info("\nExample search queries:")

    # Show example queries for the first five candidates.
    for entry in needs_enrichment[:5]:
        name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
        website = entry.get('_website_url', '')

        if website:
            # Reduce the URL to its bare domain for a site: query.
            domain = website.replace('https://', '').replace('http://', '').split('/')[0]
            logger.info(f"\n Library: {name}")
            logger.info(f" Website: {website}")
            logger.info(f" Query: site:{domain} about collections services contact")

    logger.info("\n" + "=" * 60)
    logger.info(f"Total entries to enrich: {len(needs_enrichment)}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
|