glam/scripts/enrich_kb_libraries_exa.py
kempersc 30162e6526 Add script to validate KB library entries and generate enrichment report
- Implemented a Python script to validate KB library YAML files for required fields and data quality.
- Analyzed enrichment coverage from Wikidata and Google Maps, generating statistics.
- Created a comprehensive markdown report summarizing validation results and enrichment quality.
- Included error handling for file loading and validation processes.
- Generated JSON statistics for further analysis.
2025-11-28 14:48:33 +01:00

618 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Enrich KB Netherlands library entries with website data using Exa MCP.
This script extracts detailed information from library websites to populate
LinkML schema fields including:
- Description and mission
- Collections (types, scope, extent)
- Digital platforms (APIs, IIIF, linked data)
- Services and accessibility
- Organizational structure
- Contact information
- Opening hours (from website if available)
- Metadata standards
- Staff/leadership (if publicly listed)
Usage:
python scripts/enrich_kb_libraries_exa.py [--dry-run] [--limit N]
"""
import os
import sys
import json
import yaml
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import logging
import argparse
import subprocess
import re
# Set up logging: INFO level with timestamped messages on stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths — hard-coded absolute, machine-specific locations.
# ENTRIES_DIR holds one YAML file per library entry; REPORTS_DIR is not
# referenced in the visible code of this script (presumably used by sibling
# report scripts — TODO confirm).
ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
REPORTS_DIR = Path("/Users/kempersc/apps/glam/reports")

# Rate limiting for Exa API
# NOTE(review): REQUEST_DELAY is defined but not used in the visible code —
# likely intended for the MCP-driven enrichment loop; confirm before removing.
REQUEST_DELAY = 1.0  # seconds between requests
@dataclass
class WebsiteEnrichment:
    """Container for website-extracted data following the LinkML schema.

    Every field defaults to empty/None; the extract_* helpers fill in
    whatever the website text yields, and process_exa_results() assembles
    one instance per library. The provenance section records how and when
    the data was obtained so downstream consumers can judge its quality.
    """
    # Basic info
    description: Optional[str] = None
    mission_statement: Optional[str] = None
    history_summary: Optional[str] = None
    founding_year: Optional[str] = None

    # Collections (CustodianCollection fields)
    # NOTE(review): `collections` is never populated or serialized by the
    # visible code — possibly reserved for future structured extraction.
    collections: List[Dict[str, Any]] = field(default_factory=list)
    collection_types: List[str] = field(default_factory=list)
    collection_scope: Optional[str] = None
    collection_extent: Optional[str] = None
    temporal_coverage: Optional[str] = None
    digitization_status: Optional[str] = None  # 'COMPLETE' / 'PARTIAL' / 'NOT_DIGITIZED'

    # Digital Platform fields
    homepage_url: Optional[str] = None
    catalog_url: Optional[str] = None
    api_endpoints: List[str] = field(default_factory=list)
    sparql_endpoint: Optional[str] = None
    oai_pmh_endpoint: Optional[str] = None
    iiif_support: Optional[bool] = None  # tri-state: None means "unknown"
    linked_data: Optional[bool] = None   # tri-state: None means "unknown"
    metadata_standards: List[str] = field(default_factory=list)

    # Services
    services: List[str] = field(default_factory=list)
    accessibility_info: Optional[str] = None
    membership_info: Optional[str] = None

    # Contact & Location
    contact_email: Optional[str] = None
    contact_phone: Optional[str] = None
    address: Optional[str] = None
    opening_hours_text: Optional[str] = None

    # Organization
    parent_organization: Optional[str] = None
    organizational_units: List[str] = field(default_factory=list)
    staff_count: Optional[int] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    # Technical
    repository_software: Optional[str] = None
    cms_system: Optional[str] = None
    programming_languages: List[str] = field(default_factory=list)

    # Provenance
    extraction_timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    source_urls: List[str] = field(default_factory=list)
    extraction_method: str = "exa_web_search"
    confidence_score: float = 0.0  # 0.0-1.0, fraction of field groups extracted
def extract_collections_from_text(text: str) -> Dict[str, Any]:
    """Scan free-form website text for collection-related facts.

    Returns a dict with keys collection_types (list of labels),
    collection_scope (always None here), collection_extent (first size
    phrase found, e.g. "2 million books"), temporal_coverage (always
    None here) and digitization_status ('COMPLETE'/'PARTIAL'/
    'NOT_DIGITIZED' or None).
    """
    lowered = text.lower()

    # Collection-type detection: label -> trigger keywords (English + Dutch).
    trigger_map = {
        'books': ['books', 'boeken', 'publications', 'publicaties'],
        'newspapers': ['newspapers', 'kranten', 'nieuws'],
        'magazines': ['magazines', 'tijdschriften', 'periodicals'],
        'manuscripts': ['manuscripts', 'handschriften', 'medieval'],
        'digital_born': ['e-books', 'ebooks', 'digital', 'digitaal', 'websites'],
        'photographs': ['photos', 'photographs', 'foto', 'images'],
        'maps': ['maps', 'kaarten', 'cartography'],
        'music': ['music', 'muziek', 'audio', 'sound'],
        'archives': ['archives', 'archieven', 'records'],
    }
    detected_types = [
        label
        for label, triggers in trigger_map.items()
        if any(trigger in lowered for trigger in triggers)
    ]

    # Extent: first pattern with a hit wins; the whole matched phrase is kept.
    size_patterns = (
        r'(\d[\d,\.]*)\s*(million|miljoen)?\s*(books|items|volumes|objects|pieces)',
        r'(\d[\d,\.]*)\s*(kilometer|km|metres|meter)s?\s*(of|aan)\s*(shelves|shelf|materials)',
        r'over\s+(\d[\d,\.]*)\s*(items|volumes|books)',
        r'meer dan\s+(\d[\d,\.]*)\s*(items|stukken|boeken)',
    )
    extent = None
    for size_rx in size_patterns:
        hit = re.search(size_rx, lowered)
        if hit:
            extent = hit.group(0)
            break

    # Digitization status: checks ordered from strongest claim downwards.
    if any(phrase in lowered for phrase in ['fully digitized', 'volledig gedigitaliseerd', 'complete digital']):
        status = 'COMPLETE'
    elif any(phrase in lowered for phrase in ['digitization', 'digitalisering', 'being digitized', 'digital collection']):
        status = 'PARTIAL'
    elif any(phrase in lowered for phrase in ['no digital', 'niet digitaal', 'physical only']):
        status = 'NOT_DIGITIZED'
    else:
        status = None

    return {
        'collection_types': detected_types,
        'collection_scope': None,
        'collection_extent': extent,
        'temporal_coverage': None,
        'digitization_status': status,
    }
def extract_digital_platform_info(text: str, url: str) -> Dict[str, Any]:
    """Extract digital platform information from website text.

    Args:
        text: Raw website text (any case).
        url: Homepage URL to record, or empty/None if unknown.

    Returns:
        Dict with homepage_url, catalog_url, api_endpoints (max 3),
        sparql_endpoint, oai_pmh_endpoint, iiif_support / linked_data
        (True or None, never False) and metadata_standards (list).

    Fixes over the original: short acronym keywords are matched with word
    boundaries so e.g. 'ead' no longer fires on "read"/"heads"/"already"
    and 'rdf' no longer fires inside ordinary words; the URL loop variable
    no longer shadows the module-level `dataclasses.field` import.
    """
    platform_info = {
        'homepage_url': url if url else None,
        'catalog_url': None,
        'api_endpoints': [],
        'sparql_endpoint': None,
        'oai_pmh_endpoint': None,
        'iiif_support': None,
        'linked_data': None,
        'metadata_standards': [],
    }
    text_lower = text.lower()

    # Detect IIIF support ('iiif' is distinctive enough for a substring test).
    if 'iiif' in text_lower:
        platform_info['iiif_support'] = True

    # Detect linked data; \b guards the short acronyms against false hits.
    linked_data_patterns = [r'linked data', r'\brdf\b', r'\bsparql\b', r'semantic web', r'json-ld']
    if any(re.search(p, text_lower) for p in linked_data_patterns):
        platform_info['linked_data'] = True

    # Detect metadata standards. Word boundaries keep short acronyms such as
    # "ead", "mods" and "lido" from matching inside unrelated words.
    standard_patterns = {
        'Dublin Core': [r'dublin core', r'\bdc:', r'\bdcterms\b'],
        'MARC21': [r'marc21', r'marc 21', r'marc format'],
        'EAD': [r'\bead\b', r'encoded archival description'],
        'LIDO': [r'\blido\b'],
        'MODS': [r'\bmods\b', r'metadata object description'],
        'PREMIS': [r'\bpremis\b', r'preservation metadata'],
        'Schema.org': [r'schema\.org', r'\bschema:'],
    }
    for standard, patterns in standard_patterns.items():
        if any(re.search(p, text_lower) for p in patterns):
            platform_info['metadata_standards'].append(standard)

    # Extract URLs from the original-case text; first matching pattern wins
    # per target field, api_endpoints keeps up to 3 hits.
    url_patterns = {
        'catalog_url': [r'(https?://[^\s<>"]+(?:catalog|catalogue|search|zoeken|collectie)[^\s<>"]*)'],
        'api_endpoints': [r'(https?://[^\s<>"]+(?:api|webservice)[^\s<>"]*)'],
        'sparql_endpoint': [r'(https?://[^\s<>"]+sparql[^\s<>"]*)'],
        'oai_pmh_endpoint': [r'(https?://[^\s<>"]+(?:oai|oai-pmh)[^\s<>"]*)'],
    }
    for target, patterns in url_patterns.items():  # renamed from 'field' (shadowed dataclasses.field)
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                if target == 'api_endpoints':
                    platform_info[target].extend(matches[:3])  # Max 3
                else:
                    platform_info[target] = matches[0]
                break
    return platform_info
def extract_organization_info(text: str) -> Dict[str, Any]:
    """Pull organizational facts (parent body, units, staff size) from text.

    Returns a dict with parent_organization (str or None),
    organizational_units (up to 5 strings), leadership (always empty here)
    and staff_count (int or None).
    """
    lowered = text.lower()

    # Parent organization: first matching phrase pattern wins.
    parent = None
    parent_rxs = (
        r'(?:part of|onderdeel van|under|onder)\s+(?:the\s+)?([A-Z][^,\.\n]+)',
        r'(?:ministry|ministerie)\s+(?:of|van)\s+([A-Z][^,\.\n]+)',
    )
    for rx in parent_rxs:
        hit = re.search(rx, text, re.IGNORECASE)
        if hit:
            parent = hit.group(1).strip()
            break

    # Departments / units: keep at most five per pattern.
    units = []
    unit_rxs = (r'(?:department|afdeling|team|unit)\s+(?:of|voor)?\s*([A-Za-z\s]+)',)
    for rx in unit_rxs:
        hits = re.findall(rx, text, re.IGNORECASE)
        units.extend(found.strip() for found in hits[:5])

    # Staff count: first numeric mention wins; non-integers are ignored.
    staff = None
    staff_rxs = (
        r'(\d+)\s*(?:staff|employees|medewerkers|fte)',
        r'team of\s*(\d+)',
    )
    for rx in staff_rxs:
        hit = re.search(rx, lowered)
        if hit:
            try:
                staff = int(hit.group(1))
            except ValueError:
                pass
            break

    return {
        'parent_organization': parent,
        'organizational_units': units,
        'leadership': [],
        'staff_count': staff,
    }
def extract_services_info(text: str) -> Dict[str, Any]:
    """Extract services, accessibility and membership hints from text.

    Returns a dict with services (list of labels), accessibility_info and
    membership_info (fixed marker strings or None).

    Fix over the original: keywords are matched with a leading word
    boundary instead of a plain substring test. The substring test
    reported interlibrary loan for any text containing "possible"
    ('ibl' is a substring) and membership for "remember" ('member').
    Suffixes are still allowed, so 'member' matches "members"/"membership".
    """
    services_info = {
        'services': [],
        'accessibility_info': None,
        'membership_info': None,
    }
    text_lower = text.lower()

    def _mentions(keyword: str) -> bool:
        # Keyword must start at a word boundary; anything may follow.
        return re.search(r'\b' + re.escape(keyword), text_lower) is not None

    # Service label -> trigger keywords (English + Dutch).
    service_keywords = {
        'Reading room': ['reading room', 'leeszaal', 'study room'],
        'Interlibrary loan': ['interlibrary loan', 'ibl', 'interbibliothecair leenverkeer'],
        'Digital access': ['digital access', 'online access', 'remote access'],
        'Research support': ['research support', 'onderzoeksondersteuning', 'reference service'],
        'Exhibitions': ['exhibitions', 'tentoonstellingen', 'displays'],
        'Tours': ['tours', 'rondleidingen', 'guided tours'],
        'Events': ['events', 'evenementen', 'lectures', 'workshops'],
        'Scanning services': ['scanning', 'digitization service', 'reproduction'],
        'Wi-Fi': ['wi-fi', 'wifi', 'internet access'],
        'Copying': ['copying', 'kopiëren', 'printing'],
    }
    for service, keywords in service_keywords.items():
        if any(_mentions(kw) for kw in keywords):
            services_info['services'].append(service)

    # Accessibility and membership are reduced to fixed marker strings.
    accessibility_keywords = ['wheelchair', 'rolstoel', 'accessible', 'toegankelijk', 'disability', 'handicap']
    if any(_mentions(kw) for kw in accessibility_keywords):
        services_info['accessibility_info'] = 'Accessibility features available'
    if any(_mentions(kw) for kw in ['membership', 'lidmaatschap', 'member', 'lid worden', 'join']):
        services_info['membership_info'] = 'Membership available'
    return services_info
def extract_contact_info(text: str) -> Dict[str, Any]:
    """Pull an email address, Dutch phone number and postal address from text.

    Returns a dict with contact_email, contact_phone and address, each a
    string or None. The address match keys on a Dutch postal code
    (four digits + two capitals) followed by a place name.
    """
    email_hit = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)

    # Phone: first pattern with a hit wins.
    phone = None
    phone_rxs = (
        r'(?:\+31|0031|0)\s*(?:\d[\s-]*){9,10}',  # bare Dutch number
        r'tel(?:efoon)?[:\s]+([+\d\s\-()]+)',      # number behind a "tel:" label
    )
    for rx in phone_rxs:
        hit = re.search(rx, text, re.IGNORECASE)
        if hit:
            phone = hit.group(0).strip()
            break

    postal_hit = re.search(r'(\d{4}\s*[A-Z]{2})\s*([A-Za-z\s]+)', text)

    return {
        'contact_email': email_hit.group(0) if email_hit else None,
        'contact_phone': phone,
        'address': postal_hit.group(0) if postal_hit else None,
    }
def call_exa_search(query: str, num_results: int = 3) -> Optional[str]:
    """Placeholder for the Exa MCP search call.

    The real search is performed through the MCP client in the enrichment
    pipeline; this stub always returns None so callers can be wired up
    before that integration exists.
    """
    return None
def process_exa_results(results: List[Dict[str, Any]]) -> WebsiteEnrichment:
    """Fold a list of Exa search hits into one WebsiteEnrichment record.

    Each hit is a dict with optional 'url' and 'text' keys. All hit texts
    are concatenated and run through the extract_* helpers; the first URL
    seen becomes the homepage. The confidence score is the fraction (of 7
    field groups) for which anything was extracted.
    """
    enrichment = WebsiteEnrichment()
    combined = ""
    urls = []

    for hit in results:
        hit_url = hit.get('url', '')
        hit_text = hit.get('text', '')
        if hit_url:
            urls.append(hit_url)
            # First URL encountered doubles as the homepage.
            if not enrichment.homepage_url:
                enrichment.homepage_url = hit_url
        if hit_text:
            combined += f"\n{hit_text}"

    enrichment.source_urls = urls

    if combined:
        # Description: whitespace-collapsed text, truncated to 500 chars.
        condensed = re.sub(r'\s+', ' ', combined).strip()
        if len(condensed) > 100:
            enrichment.description = condensed[:500] + "..."

        # Collections.
        coll = extract_collections_from_text(combined)
        enrichment.collection_types = coll['collection_types']
        enrichment.collection_scope = coll['collection_scope']
        enrichment.collection_extent = coll['collection_extent']
        enrichment.temporal_coverage = coll['temporal_coverage']
        enrichment.digitization_status = coll['digitization_status']

        # Digital platform.
        platform = extract_digital_platform_info(combined, enrichment.homepage_url)
        for key in ('catalog_url', 'api_endpoints', 'sparql_endpoint',
                    'oai_pmh_endpoint', 'iiif_support', 'linked_data',
                    'metadata_standards'):
            setattr(enrichment, key, platform[key])

        # Organization.
        org = extract_organization_info(combined)
        for key in ('parent_organization', 'organizational_units',
                    'leadership', 'staff_count'):
            setattr(enrichment, key, org[key])

        # Services.
        svc = extract_services_info(combined)
        enrichment.services = svc['services']
        enrichment.accessibility_info = svc['accessibility_info']
        enrichment.membership_info = svc['membership_info']

        # Contact.
        contact = extract_contact_info(combined)
        enrichment.contact_email = contact['contact_email']
        enrichment.contact_phone = contact['contact_phone']
        enrichment.address = contact['address']

        # Confidence: fraction of the 7 extraction groups that produced data.
        signals = (
            bool(enrichment.description),
            bool(enrichment.collection_types),
            bool(enrichment.collection_extent),
            bool(enrichment.digitization_status),
            bool(enrichment.metadata_standards),
            bool(enrichment.services),
            bool(enrichment.contact_email or enrichment.contact_phone),
        )
        enrichment.confidence_score = min(sum(signals) / 7.0, 1.0)

    return enrichment
def load_kb_library_files() -> List[Dict[str, Any]]:
    """Load every KB library YAML entry from ENTRIES_DIR.

    Returns a list of dicts, each augmented with '_filepath' and
    '_filename' bookkeeping keys (stripped again by save_entry()).

    Fix over the original: yaml.safe_load returns None for empty files,
    which previously raised a TypeError on item assignment and was logged
    as a misleading generic "Error loading"; non-mapping YAML roots are
    now skipped with an explicit warning.
    """
    entries = []
    for filepath in sorted(ENTRIES_DIR.glob("*_kb_isil.yaml")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")
            continue
        if not isinstance(data, dict):
            # Empty file -> None; scalar or list roots are equally unusable.
            logger.warning(f"Skipping {filepath}: YAML root is not a mapping")
            continue
        data['_filepath'] = str(filepath)
        data['_filename'] = filepath.name
        entries.append(data)
    return entries
def save_entry(entry: Dict[str, Any], filepath: str):
    """Write an entry back to disk as YAML.

    Internal bookkeeping keys (those starting with '_') are dropped before
    dumping; key order is preserved and unicode is written as-is.
    """
    public_fields = {key: value for key, value in entry.items() if not key.startswith('_')}
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(public_fields, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def enrichment_to_dict(enrichment: WebsiteEnrichment) -> Dict[str, Any]:
    """Convert a WebsiteEnrichment dataclass to a dict for YAML storage.

    Provenance fields are always written. Every other field is written
    only when it holds a value; the two tri-state booleans (iiif_support,
    linked_data) are kept when explicitly True or False and omitted only
    while still None. The scraped address is stored as
    'address_from_website' so it cannot clobber an existing address key.

    Improvement over the original: the 30 near-identical `if` statements
    are replaced by one data-driven loop; output key order is unchanged.
    (The `collections` list was never serialized before and still isn't.)
    """
    data = {
        'extraction_timestamp': enrichment.extraction_timestamp,
        'extraction_method': enrichment.extraction_method,
        'confidence_score': enrichment.confidence_score,
        'source_urls': enrichment.source_urls,
    }
    # (attribute name, output key) — order defines the key order in the dump.
    optional_fields = (
        ('description', 'description'),
        ('mission_statement', 'mission_statement'),
        ('history_summary', 'history_summary'),
        ('founding_year', 'founding_year'),
        ('collection_types', 'collection_types'),
        ('collection_scope', 'collection_scope'),
        ('collection_extent', 'collection_extent'),
        ('temporal_coverage', 'temporal_coverage'),
        ('digitization_status', 'digitization_status'),
        ('homepage_url', 'homepage_url'),
        ('catalog_url', 'catalog_url'),
        ('api_endpoints', 'api_endpoints'),
        ('sparql_endpoint', 'sparql_endpoint'),
        ('oai_pmh_endpoint', 'oai_pmh_endpoint'),
        ('iiif_support', 'iiif_support'),
        ('linked_data', 'linked_data'),
        ('metadata_standards', 'metadata_standards'),
        ('services', 'services'),
        ('accessibility_info', 'accessibility_info'),
        ('membership_info', 'membership_info'),
        ('contact_email', 'contact_email'),
        ('contact_phone', 'contact_phone'),
        ('address', 'address_from_website'),
        ('parent_organization', 'parent_organization'),
        ('organizational_units', 'organizational_units'),
        ('staff_count', 'staff_count'),
        ('leadership', 'leadership'),
        ('repository_software', 'repository_software'),
        ('cms_system', 'cms_system'),
        ('programming_languages', 'programming_languages'),
    )
    tri_state = {'iiif_support', 'linked_data'}  # explicit False must survive
    for attr, key in optional_fields:
        value = getattr(enrichment, attr)
        if attr in tri_state:
            if value is not None:
                data[key] = value
        elif value:
            data[key] = value
    return data
def main():
    """CLI entry point: report KB entries that still need website enrichment.

    This does not call Exa itself. It selects entries that have no
    'website_enrichment' block yet but do have a website URL (from the
    Google Maps or Wikidata enrichment), then either lists them (--dry-run)
    or prints example Exa MCP search queries for the first few.
    """
    parser = argparse.ArgumentParser(description='Enrich KB libraries with website data via Exa')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, default=None, help='Limit number of entries to process')
    args = parser.parse_args()

    banner = "=" * 60
    logger.info(banner)
    logger.info("KB Netherlands Libraries - Exa Website Enrichment")
    logger.info(banner)

    # Load entries, optionally truncated by --limit.
    entries = load_kb_library_files()
    logger.info(f"Loaded {len(entries)} KB library entries")
    if args.limit:
        entries = entries[:args.limit]
        logger.info(f"Limited to {len(entries)} entries")

    # Keep entries that lack enrichment but have a usable website URL.
    needs_enrichment = []
    for entry in entries:
        if 'website_enrichment' in entry:
            continue
        website_url = (
            entry.get('google_maps_enrichment', {}).get('website') or
            entry.get('wikidata_enrichment', {}).get('wikidata_identifiers', {}).get('Website')
        )
        if not website_url:
            continue
        entry['_website_url'] = website_url
        needs_enrichment.append(entry)
    logger.info(f"Entries needing website enrichment: {len(needs_enrichment)}")

    if args.dry_run:
        logger.info("DRY RUN - No changes will be made")
        for entry in needs_enrichment[:10]:
            name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
            website = entry.get('_website_url', 'No URL')
            logger.info(f" Would enrich: {name} - {website}")
        return

    # Print guidance for manual MCP-based enrichment of the first few entries.
    logger.info("\n" + banner)
    logger.info("MANUAL ENRICHMENT REQUIRED")
    logger.info(banner)
    logger.info("\nThis script identifies entries needing enrichment.")
    logger.info("Use the Exa MCP tool to search each library's website.")
    logger.info("\nExample search queries:")
    for entry in needs_enrichment[:5]:
        name = entry.get('original_entry', {}).get('organisatie', 'Unknown')
        website = entry.get('_website_url', '')
        if website:
            domain = website.replace('https://', '').replace('http://', '').split('/')[0]
            logger.info(f"\n Library: {name}")
            logger.info(f" Website: {website}")
            logger.info(f" Query: site:{domain} about collections services contact")
    logger.info("\n" + banner)
    logger.info(f"Total entries to enrich: {len(needs_enrichment)}")
    logger.info(banner)
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()