glam/scripts/enrich_digital_platforms.py
2025-12-05 15:30:23 +01:00

414 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Batch enrichment script for digital_platforms metadata.
This script:
1. Finds entries with websites but no digital_platforms section
2. Extracts digital platform metadata from existing web claims
3. Infers platform type and metadata from available data
4. Adds the digital_platforms section to each entry
Usage:
python scripts/enrich_digital_platforms.py [--limit N] [--start-index N] [--dry-run]
"""
import argparse
import os
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
# Directory holding one YAML file per heritage institution entry.
ENTRIES_DIR = Path("data/nde/enriched/entries")
# Platform type detection patterns: keyword substrings (Dutch and English),
# matched case-insensitively against web-claim values in detect_platform_type().
PLATFORM_TYPE_PATTERNS: dict[str, list[str]] = {
    "DIGITAL_ARCHIVE": ["digitaal archief", "digital archive", "online archief", "archiefbank"],
    "DISCOVERY_PORTAL": ["collectie", "collection", "zoeken", "search", "database"],
    "WEBSITE": ["website", "homepage", "info", "contact", "over ons", "about"],
    "WEB_PORTAL": ["portal", "portaal", "platform"],
    "ONLINE_CATALOG": ["catalogus", "catalog", "bibliotheek", "library"],
    "VIRTUAL_MUSEUM": ["virtueel", "virtual", "3d", "rondleiding", "tour"],
    "EDUCATIONAL_PLATFORM": ["educatie", "education", "lesmateriaal", "leren"],
}
# CMS detection patterns: substrings matched against claim html_file /
# claim_value in detect_technology_stack().
CMS_PATTERNS: dict[str, list[str]] = {
    "WordPress": ["wp-content", "wp-includes", "wordpress"],
    "Drupal": ["drupal", "sites/default", "modules/system"],
    "Joomla": ["joomla", "components/com_"],
    # Empty pattern list: any([]) is False, so "Custom CMS" can never be
    # detected by the pattern loop — it only documents a possible label.
    "Custom CMS": [],
}
# Data standard patterns.
# NOTE(review): this table appears unused in this file — detect_data_standards()
# hard-codes its own checks instead of consulting it. Verify before removing.
DATA_STANDARD_PATTERNS: dict[str, list[str]] = {
    "Schema.org": ["schema.org", "itemtype", "itemscope"],
    "Dublin Core": ["dc:", "dcterms:", "dublin core"],
    "Open Graph": ["og:", "og:title", "og:description"],
    "IIIF": ["iiif", "manifest.json", "image-api"],
    "Linked Data": ["application/ld+json", "@context", "rdf"],
}
def load_entry(filepath: Path) -> dict:
    """Read and parse one YAML entry file.

    Args:
        filepath: Path to the .yaml entry file.

    Returns:
        The parsed document (expected to be a dict for entry files).
    """
    with filepath.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_entry(filepath: Path, data: dict):
    """Serialize *data* back to *filepath* as human-readable YAML.

    Preserves insertion order (sort_keys=False) and writes Unicode
    characters directly instead of escaping them.
    """
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def has_website(entry: dict) -> bool:
    """Return True if any enrichment source in *entry* supplies a website URL.

    Sources checked: original_entry.webadres_organisatie,
    wikidata_enrichment.wikidata_official_website, google_maps_enrichment.website.
    """
    url_sources = (
        (entry.get('original_entry', {}), 'webadres_organisatie'),
        (entry.get('wikidata_enrichment', {}), 'wikidata_official_website'),
        (entry.get('google_maps_enrichment', {}), 'website'),
    )
    return any(section.get(key) for section, key in url_sources)
def has_digital_platforms(entry: dict) -> bool:
    """Return True if *entry* already has a non-empty digital_platforms section.

    Fix: the previous body returned `entry['digital_platforms']` itself (a
    truthy list) rather than a bool, contradicting the declared `-> bool`;
    callers only use it in boolean context, so bool() coercion is safe.
    """
    return bool(entry.get('digital_platforms'))
def get_website_url(entry: dict) -> Optional[str]:
    """Return the best available website URL for *entry*, or None.

    Sources are consulted in priority order:
    original_entry > wikidata_enrichment > google_maps_enrichment.
    """
    lookup_order = (
        ('original_entry', 'webadres_organisatie'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    )
    for section_key, url_key in lookup_order:
        url = entry.get(section_key, {}).get(url_key)
        if url:
            return url
    return None
def detect_platform_type(entry: dict) -> str:
    """Classify the kind of digital platform behind the entry's website.

    Primary signal is the institution type (type_organisatie / type codes);
    falls back to keyword matching over harvested web claims, and finally
    defaults to "WEBSITE".

    Fix: hoist `institution_type.lower()` — the original recomputed it in
    every branch (up to 8 calls per invocation).
    """
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])
    # Institution-type heuristics, checked in priority order.
    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        return "DISCOVERY_PORTAL"
    if 'museum' in institution_type or 'M' in types:
        return "WEBSITE"
    if 'library' in institution_type or 'bibliotheek' in institution_type or 'L' in types:
        return "ONLINE_CATALOG"
    if 'research' in institution_type or 'R' in types:
        return "WEB_PORTAL"
    # Fall back to keyword patterns in the harvested web claims.
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_value = str(claim.get('claim_value', '')).lower()
        for platform_type, patterns in PLATFORM_TYPE_PATTERNS.items():
            if any(p in claim_value for p in patterns):
                return platform_type
    return "WEBSITE"  # Default
def detect_technology_stack(entry: dict) -> str:
    """Describe the website's technology stack from web claims and metadata.

    Returns a comma-separated list of detected CMS names plus any known
    back-office system from the source register, or a generic fallback
    when nothing is detected.
    """
    detected = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        html_file = str(claim.get('html_file', '')).lower()
        claim_value = str(claim.get('claim_value', '')).lower()
        for cms, patterns in CMS_PATTERNS.items():
            matched = any(p in html_file or p in claim_value for p in patterns)
            if matched and cms not in detected:
                detected.append(cms)
    # A known back-office system (e.g. from the source register) counts too.
    system = entry.get('original_entry', {}).get('systeem', '')
    if system and system not in detected:
        detected.append(system)
    return ", ".join(detected) if detected else "Standard web technology"
def detect_data_standards(entry: dict) -> list:
    """Detect metadata standards referenced by the entry's web claims.

    Fixes:
    - `list(set(...))` returned a different ordering on every run (string
      hash randomization), making re-runs rewrite identical YAML files
      with shuffled lists; a sorted list is deterministic.
    - Removed the unused `claim_value` local.

    Returns:
        Sorted list of detected standard names, or ["HTML5"] as a fallback.
    """
    detected = set()
    for claim in entry.get('web_claims', {}).get('claims', []):
        extraction_method = str(claim.get('extraction_method', '')).lower()
        claim_type = str(claim.get('claim_type', '')).lower()
        # Structured-data extraction implies Schema.org / JSON-LD markup.
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected.add("Schema.org")
        # og_* extraction implies Open Graph meta tags.
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected.add("Open Graph")
        # Collection/catalog claims suggest Dublin Core style description.
        if 'collection' in claim_type or 'catalog' in claim_type:
            detected.add("Dublin Core")
    return sorted(detected) if detected else ["HTML5"]
def extract_user_services(entry: dict) -> str:
    """Summarize the website's user-facing services as a comma-separated string.

    Services are inferred from web-claim types first (in a fixed priority
    order), then supplemented from the institution type; defaults to generic
    services when nothing matches.

    Fix: the original recomputed `claim_type.lower()` up to 8 times per claim
    and repeated the same if/append boilerplate for every service; a
    data-driven keyword table keeps the exact same detection order.
    """
    # (keywords matched in claim_type) -> service label; checked in order.
    claim_service_table = (
        (('search',), "Search"),
        (('gallery', 'image'), "Image gallery"),
        (('video',), "Video content"),
        (('social',), "Social media integration"),
        (('login', 'signup'), "User accounts"),
        (('email', 'phone'), "Contact information"),
    )
    services = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_type = str(claim.get('claim_type', '')).lower()
        for keywords, service in claim_service_table:
            if any(k in claim_type for k in keywords) and service not in services:
                services.append(service)
    # Add baseline services implied by the institution type.
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])
    if 'museum' in institution_type or 'M' in types:
        for service in ("Exhibition information", "Visit planning"):
            if service not in services:
                services.append(service)
    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        if "Collection search" not in services:
            services.append("Collection search")
    if not services:
        services = ["General information", "Contact"]
    return ", ".join(services)
def extract_sustainability_model(entry: dict) -> str:
    """Infer how the organization is funded from Wikidata and register data."""
    # The Wikidata legal form claim (P1454) is the strongest signal.
    legal_form = (
        entry.get('wikidata_enrichment', {})
        .get('wikidata_claims', {})
        .get('P1454_legal_form', {})
    )
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            # Prefer the Dutch label, fall back to English.
            label = value.get('label_nl', value.get('label_en', '')).lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"
    # Registered museums receive some form of government support.
    if entry.get('original_entry', {}).get('museum_register', '') == 'ja':
        return "Registered museum (government supported)"
    return "Institutional funding"
def extract_digital_collections(entry: dict) -> str:
    """Build a short description of the entry's digital collections.

    Combines signals from web claims, the English Wikidata description, and
    the Google Maps editorial summary, joined with "; ".

    Fix: the original joined `set(descriptions[:2])`, whose iteration order
    varies between runs (string hash randomization), so identical input
    produced differently ordered output strings. `dict.fromkeys` deduplicates
    while preserving insertion order, making the result deterministic.
    """
    descriptions = []
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_type = claim.get('claim_type')
        if claim_type == 'collection_page':
            descriptions.append("Online collection access")
        elif claim_type == 'description_short' and not descriptions:
            # Use the first short description (truncated to 200 chars) as a basis.
            descriptions.append(claim.get('claim_value', '')[:200])
    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_description_en'):
        descriptions.append(wikidata['wikidata_description_en'])
    google = entry.get('google_maps_enrichment', {})
    if google.get('editorial_summary'):
        descriptions.append(google['editorial_summary'])
    if descriptions:
        # Combine the first two, deduplicated in insertion order.
        return "; ".join(dict.fromkeys(descriptions[:2]))
    return "Organizational website with heritage information"
def get_platform_name(entry: dict) -> str:
    """Derive a human-readable platform name from the organization's name.

    Tries the source register name first, then the Dutch and English
    Wikidata labels, appending " Website"; falls back to a generic name.
    """
    candidates = (
        entry.get('original_entry', {}).get('organisatie'),
        entry.get('wikidata_enrichment', {}).get('wikidata_label_nl'),
        entry.get('wikidata_enrichment', {}).get('wikidata_label_en'),
    )
    for name in candidates:
        if name:
            return f"{name} Website"
    return "Official Website"
def create_digital_platform(entry: dict) -> Optional[dict]:
    """Assemble the digital_platforms record for one entry.

    Returns:
        The platform dict, or None when no website URL could be found.
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None
    platform_type = detect_platform_type(entry)
    # Base category plus an extra label keyed on the detected platform type.
    categories = ["Organizational website"]
    extra = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if extra:
        categories.append(extra)
    return {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': platform_type,
        'platform_category': categories,
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }
def enrich_entry(entry: dict) -> dict:
    """Attach a digital_platforms section to *entry* (mutates and returns it).

    Leaves the entry untouched when no platform could be constructed.
    """
    platform = create_digital_platform(entry)
    if platform is not None:
        entry['digital_platforms'] = [platform]
    return entry
def find_entries_to_enrich() -> list[Path]:
    """Scan ENTRIES_DIR for entries that have a website but no digital_platforms.

    Files whose names start with '_' are skipped; unreadable files are
    reported and ignored rather than aborting the scan.
    """
    candidates = []
    for filepath in sorted(ENTRIES_DIR.glob("*.yaml")):
        if filepath.name.startswith('_'):
            continue
        try:
            entry = load_entry(filepath)
            if has_website(entry) and not has_digital_platforms(entry):
                candidates.append(filepath)
        except Exception as e:
            # Best-effort scan: report and keep going.
            print(f"Error reading {filepath}: {e}")
    return candidates
def main():
    """CLI entry point: find candidate entries, enrich, and optionally save.

    Flags: --limit N caps the batch size, --start-index N skips ahead,
    --dry-run reports without writing, --verbose prints per-entry details.

    Fix: the limit check was `if args.limit:`, which silently treated an
    explicit `--limit 0` as "no limit"; `is not None` honors 0 as
    "process nothing".
    """
    parser = argparse.ArgumentParser(description="Enrich entries with digital_platforms metadata")
    parser.add_argument('--limit', type=int, default=None, help="Maximum number of entries to process")
    parser.add_argument('--start-index', type=int, default=0, help="Start index for batch processing")
    parser.add_argument('--dry-run', action='store_true', help="Print changes without writing")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()
    print("Finding entries to enrich...")
    entries = find_entries_to_enrich()
    print(f"Found {len(entries)} entries with websites but no digital_platforms")
    # Apply start index and limit to select this batch.
    entries = entries[args.start_index:]
    if args.limit is not None:
        entries = entries[:args.limit]
    print(f"Processing {len(entries)} entries...")
    enriched_count = 0
    for filepath in entries:
        try:
            entry = load_entry(filepath)
            entry = enrich_entry(entry)
            if 'digital_platforms' in entry:
                enriched_count += 1
                if args.verbose:
                    platform = entry['digital_platforms'][0]
                    print(f"\n{filepath.name}:")
                    print(f" Platform: {platform['platform_name']}")
                    print(f" URL: {platform['platform_url']}")
                    print(f" Type: {platform['platform_type']}")
                    print(f" Tech: {platform['technology_stack']}")
                # Only write back entries that were actually enriched.
                # NOTE(review): indentation was stripped in the source; this
                # placement (save inside the enriched branch) matches intent —
                # confirm against version control.
                if not args.dry_run:
                    save_entry(filepath, entry)
        except Exception as e:
            # Keep the batch going even if a single entry fails.
            print(f"Error processing {filepath}: {e}")
    print(f"\n{'Would enrich' if args.dry_run else 'Enriched'} {enriched_count} entries")
if __name__ == "__main__":
    main()