#!/usr/bin/env python3
|
|
"""
|
|
Batch enrichment script for digital_platforms metadata.
|
|
|
|
This script:
|
|
1. Finds entries with websites but no digital_platforms section
|
|
2. Extracts digital platform metadata from existing web claims
|
|
3. Infers platform type and metadata from available data
|
|
4. Adds the digital_platforms section to each entry
|
|
|
|
Usage:
|
|
python scripts/enrich_digital_platforms.py [--limit N] [--start-index N] [--dry-run]
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
|
|
# Directory holding the per-institution YAML entry files to enrich.
ENTRIES_DIR = Path("data/nde/enriched/entries")

# Platform type detection patterns
# Maps a platform-type code to lowercase substrings (Dutch and English)
# searched for in scraped web-claim values; first match wins in
# detect_platform_type().
PLATFORM_TYPE_PATTERNS = {
    "DIGITAL_ARCHIVE": ["digitaal archief", "digital archive", "online archief", "archiefbank"],
    "DISCOVERY_PORTAL": ["collectie", "collection", "zoeken", "search", "database"],
    "WEBSITE": ["website", "homepage", "info", "contact", "over ons", "about"],
    "WEB_PORTAL": ["portal", "portaal", "platform"],
    "ONLINE_CATALOG": ["catalogus", "catalog", "bibliotheek", "library"],
    "VIRTUAL_MUSEUM": ["virtueel", "virtual", "3d", "rondleiding", "tour"],
    "EDUCATIONAL_PLATFORM": ["educatie", "education", "lesmateriaal", "leren"],
}

# CMS detection patterns
# CMS name -> lowercase fingerprints looked for in claim HTML filenames and
# values. NOTE: "Custom CMS" has no patterns, so it is never auto-detected.
CMS_PATTERNS = {
    "WordPress": ["wp-content", "wp-includes", "wordpress"],
    "Drupal": ["drupal", "sites/default", "modules/system"],
    "Joomla": ["joomla", "components/com_"],
    "Custom CMS": [],
}

# Data standard patterns
# Metadata-standard name -> indicative markers. Kept for reference;
# detect_data_standards() currently uses its own inline heuristics.
DATA_STANDARD_PATTERNS = {
    "Schema.org": ["schema.org", "itemtype", "itemscope"],
    "Dublin Core": ["dc:", "dcterms:", "dublin core"],
    "Open Graph": ["og:", "og:title", "og:description"],
    "IIIF": ["iiif", "manifest.json", "image-api"],
    "Linked Data": ["application/ld+json", "@context", "rdf"],
}
|
|
|
|
|
|
def load_entry(filepath: Path) -> dict:
    """Read and parse one YAML entry file into a dict."""
    with filepath.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
|
|
|
|
|
def save_entry(filepath: Path, data: dict):
    """Serialise an entry back to its YAML file, preserving key order."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
|
|
|
|
|
def has_website(entry: dict) -> bool:
    """Return True if any known enrichment source records a website URL."""
    # Each enrichment section paired with the key that may hold a URL,
    # in the same order the original sources are consulted elsewhere.
    url_sources = (
        ('original_entry', 'webadres_organisatie'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    )
    return any(entry.get(section, {}).get(key) for section, key in url_sources)
|
|
|
|
|
|
def has_digital_platforms(entry: dict) -> bool:
    """Return True if the entry already has a non-empty digital_platforms section.

    Fix: the previous body (`'digital_platforms' in entry and entry[...]`)
    returned the section's raw value (e.g. `[]` or a list) despite the
    `-> bool` annotation; `bool()` makes the return type honest.
    """
    return bool(entry.get('digital_platforms'))
|
|
|
|
|
|
def get_website_url(entry: dict) -> Optional[str]:
    """Return the entry's website URL, or None if no source provides one.

    Sources are consulted in priority order:
    original_entry > wikidata_enrichment > google_maps_enrichment.
    """
    for section, key in (
        ('original_entry', 'webadres_organisatie'),
        ('wikidata_enrichment', 'wikidata_official_website'),
        ('google_maps_enrichment', 'website'),
    ):
        url = entry.get(section, {}).get(key)
        if url:
            return url
    return None
|
|
|
|
|
|
def detect_platform_type(entry: dict) -> str:
    """Infer a platform-type code for the entry's website.

    First maps the recorded institution type (archive / museum / library /
    research, or its single-letter code in `type`) to a fixed platform type;
    otherwise scans web-claim values against PLATFORM_TYPE_PATTERNS;
    falls back to "WEBSITE".

    Fix: the original lower-cased `institution_type` on every one of the
    seven comparisons; it is now hoisted and computed once.
    """
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])

    # Map institution type to platform type.
    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        return "DISCOVERY_PORTAL"
    if 'museum' in institution_type or 'M' in types:
        return "WEBSITE"
    if 'library' in institution_type or 'bibliotheek' in institution_type or 'L' in types:
        return "ONLINE_CATALOG"
    if 'research' in institution_type or 'R' in types:
        return "WEB_PORTAL"

    # Fall back to keyword matching over the scraped web claims;
    # first matching pattern wins.
    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_value = str(claim.get('claim_value', '')).lower()
        for platform_type, patterns in PLATFORM_TYPE_PATTERNS.items():
            if any(p in claim_value for p in patterns):
                return platform_type

    return "WEBSITE"  # Default
|
|
|
|
|
|
def detect_technology_stack(entry: dict) -> str:
    """Describe the website's technology stack as a comma-separated string.

    Matches CMS fingerprints (CMS_PATTERNS) against web-claim HTML filenames
    and values, then appends the collection-management system recorded in the
    source data (original_entry.systeem), if any.
    """
    stack: list = []

    for claim in entry.get('web_claims', {}).get('claims', []):
        html_name = str(claim.get('html_file', '')).lower()
        value_text = str(claim.get('claim_value', '')).lower()

        for cms, patterns in CMS_PATTERNS.items():
            if cms in stack:
                continue  # already detected for an earlier claim
            if any(p in html_name or p in value_text for p in patterns):
                stack.append(cms)

    # The directory data may record a known system (e.g. Atlantis) directly.
    system = entry.get('original_entry', {}).get('systeem', '')
    if system and system not in stack:
        stack.append(system)

    return ", ".join(stack) if stack else "Standard web technology"
|
|
|
|
|
|
def detect_data_standards(entry: dict) -> list:
    """Return a sorted list of data standards detected in the web claims.

    Heuristics: schema/JSON-LD extraction methods imply Schema.org; `og_`
    methods imply Open Graph; collection/catalog claim types are taken as a
    hint of Dublin Core. Falls back to ["HTML5"].

    Fixes: the original returned `list(set(...))`, whose order varied
    between runs and produced spurious diffs in the saved YAML — the result
    is now sorted. An unused `claim_value` local was also dropped.
    """
    detected_standards = set()

    for claim in entry.get('web_claims', {}).get('claims', []):
        extraction_method = str(claim.get('extraction_method', '')).lower()
        claim_type = str(claim.get('claim_type', '')).lower()

        # Schema.org markup is betrayed by schema/JSON-LD extraction methods.
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected_standards.add("Schema.org")

        # Open Graph meta tags.
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected_standards.add("Open Graph")

        # Collection/catalog pages are assumed to expose Dublin Core —
        # NOTE(review): heuristic, not verified against the actual markup.
        if 'collection' in claim_type or 'catalog' in claim_type:
            detected_standards.add("Dublin Core")

    if not detected_standards:
        detected_standards.add("HTML5")

    return sorted(detected_standards)
|
|
|
|
|
|
def extract_user_services(entry: dict) -> str:
    """Summarise user-facing services as a comma-separated string.

    Services are inferred from web-claim types first, then from the
    institution type; duplicates are suppressed while preserving the
    order in which services were first seen.
    """
    services: list = []

    def note(service: str) -> None:
        # Append once, keeping first-seen order.
        if service not in services:
            services.append(service)

    # Claim-type keywords -> service label, checked in a fixed order
    # for each claim (matches the original if-chain ordering).
    keyword_map = (
        (('search',), "Search"),
        (('gallery', 'image'), "Image gallery"),
        (('video',), "Video content"),
        (('social',), "Social media integration"),
        (('login', 'signup'), "User accounts"),
        (('email', 'phone'), "Contact information"),
    )

    for claim in entry.get('web_claims', {}).get('claims', []):
        claim_type = claim.get('claim_type', '').lower()
        for keywords, service in keyword_map:
            if any(k in claim_type for k in keywords):
                note(service)

    # Add basic services based on institution type.
    original = entry.get('original_entry', {})
    institution_type = original.get('type_organisatie', '').lower()
    types = original.get('type', [])

    if 'museum' in institution_type or 'M' in types:
        note("Exhibition information")
        note("Visit planning")

    if 'archive' in institution_type or 'archief' in institution_type or 'A' in types:
        note("Collection search")

    if not services:
        services = ["General information", "Contact"]

    return ", ".join(services)
|
|
|
|
|
|
def extract_sustainability_model(entry: dict) -> str:
    """Derive a funding/sustainability model description for the platform.

    Checks the Wikidata legal form first (Dutch legal forms: 'stichting' =
    foundation, 'vereniging' = association), then Dutch museum-register
    membership, and falls back to generic institutional funding.
    """
    legal_form = (
        entry.get('wikidata_enrichment', {})
        .get('wikidata_claims', {})
        .get('P1454_legal_form', {})
    )
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            label = value.get('label_nl', value.get('label_en', '')).lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"

    # Museum-register membership implies government backing.
    if entry.get('original_entry', {}).get('museum_register', '') == 'ja':
        return "Registered museum (government supported)"

    return "Institutional funding"
|
|
|
|
|
|
def extract_digital_collections(entry: dict) -> str:
    """Build a short description of the entry's digital collections.

    Combines (in priority order) web-claim hints, the Wikidata English
    description, and the Google Maps editorial summary, keeping at most two
    and deduplicating them.

    Fix: the original joined `set(descriptions[:2])`, which produced a
    nondeterministic order between runs; `dict.fromkeys` now deduplicates
    while preserving insertion order.
    """
    descriptions = []

    # Check for collection claims in the scraped web data.
    for claim in entry.get('web_claims', {}).get('claims', []):
        if claim.get('claim_type') == 'collection_page':
            descriptions.append("Online collection access")
        if claim.get('claim_type') == 'description_short' and not descriptions:
            # Use the first short description (truncated) as a basis.
            descriptions.append(claim.get('claim_value', '')[:200])

    wikidata = entry.get('wikidata_enrichment', {})
    if wikidata.get('wikidata_description_en'):
        descriptions.append(wikidata['wikidata_description_en'])

    google = entry.get('google_maps_enrichment', {})
    if google.get('editorial_summary'):
        descriptions.append(google['editorial_summary'])

    if descriptions:
        # Keep at most two, deduplicated in insertion order.
        return "; ".join(dict.fromkeys(descriptions[:2]))

    return "Organizational website with heritage information"
|
|
|
|
|
|
def get_platform_name(entry: dict) -> str:
    """Return a display name for the platform.

    Uses the first available of: organisation name, Dutch Wikidata label,
    English Wikidata label; otherwise a generic fallback.
    """
    wikidata = entry.get('wikidata_enrichment', {})
    candidates = (
        entry.get('original_entry', {}).get('organisatie'),
        wikidata.get('wikidata_label_nl'),
        wikidata.get('wikidata_label_en'),
    )
    for name in candidates:
        if name:
            return f"{name} Website"
    return "Official Website"
|
|
|
|
|
|
def create_digital_platform(entry: dict) -> Optional[dict]:
    """Assemble a digital_platforms record for one entry.

    Returns None when no website URL is available; otherwise a dict whose
    key order matches the desired YAML output order.
    """
    url = get_website_url(entry)
    if not url:
        return None

    platform_type = detect_platform_type(entry)

    # Base category plus an optional secondary one keyed on platform type.
    categories = ["Organizational website"]
    secondary = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if secondary:
        categories.append(secondary)

    return {
        'platform_name': get_platform_name(entry),
        'platform_url': url,
        'platform_type': platform_type,
        'platform_category': categories,
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }
|
|
|
|
|
|
def enrich_entry(entry: dict) -> dict:
    """Attach a digital_platforms section to the entry (mutated in place).

    If no platform record can be built (no website URL), the entry is
    returned unchanged.
    """
    platform = create_digital_platform(entry)
    if platform is not None:
        entry['digital_platforms'] = [platform]
    return entry
|
|
|
|
|
|
def find_entries_to_enrich() -> list[Path]:
    """Scan ENTRIES_DIR for YAML entries with a website but no
    digital_platforms section yet.

    Unreadable files are reported and skipped rather than aborting the scan.
    """
    candidates: list[Path] = []

    for filepath in sorted(ENTRIES_DIR.glob("*.yaml")):
        # Files starting with '_' are meta/index files, not entries.
        if filepath.name.startswith('_'):
            continue

        try:
            entry = load_entry(filepath)
            if has_website(entry) and not has_digital_platforms(entry):
                candidates.append(filepath)
        except Exception as e:
            print(f"Error reading {filepath}: {e}")

    return candidates
|
|
|
|
|
|
def main():
    """CLI entry point: find candidate entries and enrich them in batch.

    Flags: --limit N caps the number processed, --start-index N skips the
    first N candidates, --dry-run reports without writing, --verbose prints
    per-entry platform details.
    """
    parser = argparse.ArgumentParser(description="Enrich entries with digital_platforms metadata")
    parser.add_argument('--limit', type=int, default=None, help="Maximum number of entries to process")
    parser.add_argument('--start-index', type=int, default=0, help="Start index for batch processing")
    parser.add_argument('--dry-run', action='store_true', help="Print changes without writing")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    args = parser.parse_args()

    print("Finding entries to enrich...")
    entries = find_entries_to_enrich()
    print(f"Found {len(entries)} entries with websites but no digital_platforms")

    # Apply start index and limit. Fix: compare against None (not truthiness)
    # so `--limit 0` means "process nothing" instead of "no limit".
    entries = entries[args.start_index:]
    if args.limit is not None:
        entries = entries[:args.limit]

    print(f"Processing {len(entries)} entries...")

    enriched_count = 0
    for filepath in entries:
        try:
            entry = load_entry(filepath)
            entry = enrich_entry(entry)

            if 'digital_platforms' in entry:
                enriched_count += 1

                if args.verbose:
                    platform = entry['digital_platforms'][0]
                    print(f"\n{filepath.name}:")
                    print(f"  Platform: {platform['platform_name']}")
                    print(f"  URL: {platform['platform_url']}")
                    print(f"  Type: {platform['platform_type']}")
                    print(f"  Tech: {platform['technology_stack']}")

            if not args.dry_run:
                save_entry(filepath, entry)

        except Exception as e:
            # Keep the batch going on per-entry failures; report and continue.
            print(f"Error processing {filepath}: {e}")

    print(f"\n{'Would enrich' if args.dry_run else 'Enriched'} {enriched_count} entries")
|
|
|
|
|
|
# Script entry point: run the batch enrichment only when executed directly.
if __name__ == "__main__":
    main()
|