#!/usr/bin/env python3
"""
Fast batch enrichment script for digital_platforms metadata.
Reads from a pre-generated list file for speed.

Usage:
    # First generate the list:
    find data/nde/enriched/entries -name "*.yaml" -exec grep -L "digital_platforms:" {} \; > /tmp/entries_to_enrich.txt

    # Then run:
    python scripts/enrich_digital_platforms_fast.py --input /tmp/entries_to_enrich.txt --batch 100
"""

import argparse
|
|
import sys
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
def load_entry(filepath: Path) -> dict:
    """Parse and return the YAML document stored at *filepath*."""
    with filepath.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)

def save_entry(filepath: Path, data: dict):
    """Serialize *data* back to *filepath* as human-readable YAML."""
    with filepath.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)

def get_website_url(entry: dict) -> Optional[str]:
    """Return the best-known website URL for *entry*, or None.

    Sources are consulted from most explicit to least explicit; the
    first non-empty value wins.
    """
    lookup_order = [
        # 1. Original entry webadres
        ('original_entry', 'webadres_organisatie'),
        # 2. Contact section website
        ('contact', 'website'),
        # 3. Digital presence website
        ('digital_presence', 'website'),
        # 4. Wikidata official website
        ('wikidata_enrichment', 'wikidata_official_website'),
        # 5. Google Maps website
        ('google_maps_enrichment', 'website'),
    ]
    for section, field in lookup_order:
        url = entry.get(section, {}).get(field)
        if url:
            return url
    return None

def detect_platform_type(entry: dict) -> str:
    """Map the institution's type to a platform-type constant.

    Checks the (Dutch/English) organisation-type string and the short
    type codes ('A' archive, 'M' museum, 'L' library, 'R' research);
    the first matching rule wins, defaulting to "WEBSITE".
    """
    original = entry.get('original_entry', {})
    # Lowercase once up front — the original recomputed .lower() in
    # every comparison of every rule.
    kind = original.get('type_organisatie', '').lower()
    codes = original.get('type', [])

    if 'archive' in kind or 'archief' in kind or 'A' in codes:
        return "DISCOVERY_PORTAL"
    if 'museum' in kind or 'M' in codes:
        return "WEBSITE"
    if 'library' in kind or 'bibliotheek' in kind or 'L' in codes:
        return "ONLINE_CATALOG"
    if 'research' in kind or 'R' in codes:
        return "WEB_PORTAL"

    return "WEBSITE"

def detect_technology_stack(entry: dict) -> str:
    """Report the recorded collection-management system, if any."""
    recorded = entry.get('original_entry', {}).get('systeem', '')
    # Fall back to a generic label when no system name is present.
    return recorded or "Standard web technology"

def detect_data_standards(entry: dict) -> list:
    """Infer data standards from the entry's web-claim extraction methods.

    Returns a sorted list so repeated runs write identical YAML — the
    original returned list(set(...)), whose element order varies between
    processes and caused spurious file diffs. Falls back to ["HTML5"]
    when nothing specific is detected.
    """
    detected_standards = set()

    for claim in entry.get('web_claims', {}).get('claims', []):
        extraction_method = str(claim.get('extraction_method', '')).lower()
        if 'schema' in extraction_method or 'jsonld' in extraction_method:
            detected_standards.add("Schema.org")
        if 'og_' in extraction_method or 'open graph' in extraction_method:
            detected_standards.add("Open Graph")

    if not detected_standards:
        detected_standards.add("HTML5")

    return sorted(detected_standards)

def extract_user_services(entry: dict) -> str:
    """Describe user-facing services implied by the institution type.

    Returns a comma-separated string; defaults to general services
    when no type-specific rule matches.
    """
    original = entry.get('original_entry', {})
    # Lowercase once up front — the original recomputed .lower() in
    # every comparison of every rule.
    kind = original.get('type_organisatie', '').lower()
    codes = original.get('type', [])

    services = []
    if 'museum' in kind or 'M' in codes:
        services.extend(["Exhibition information", "Visit planning"])
    if 'archive' in kind or 'archief' in kind or 'A' in codes:
        services.append("Collection search")
    if 'library' in kind or 'bibliotheek' in kind or 'L' in codes:
        services.append("Catalog search")

    if not services:
        services = ["General information", "Contact"]

    return ", ".join(services)

def extract_sustainability_model(entry: dict) -> str:
    """Infer how the institution is funded or sustained.

    Checks the museum register first, then the Wikidata legal-form
    claim (P1454); falls back to "Institutional funding".
    """
    original = entry.get('original_entry', {})
    if original.get('museum_register') == 'ja':
        return "Registered museum (government supported)"

    wikidata = entry.get('wikidata_enrichment', {})
    claims = wikidata.get('wikidata_claims', {})
    legal_form = claims.get('P1454_legal_form', {})
    if legal_form:
        value = legal_form.get('value', {})
        if isinstance(value, dict):
            # `or ''` guards against an explicit null label (YAML
            # `label_nl: null`), which crashed the original's .lower().
            label = (value.get('label_nl') or value.get('label_en') or '').lower()
            if 'stichting' in label:
                return "Non-profit foundation"
            if 'vereniging' in label:
                return "Membership association"

    return "Institutional funding"

def extract_digital_collections(entry: dict) -> str:
    """Return a short description of the institution's digital collections."""
    wikidata = entry.get('wikidata_enrichment', {})
    # Prefer Wikidata descriptions (English, then Dutch), then the
    # Google Maps editorial summary, then a generic fallback.
    candidates = (
        wikidata.get('wikidata_description_en'),
        wikidata.get('wikidata_description_nl'),
        entry.get('google_maps_enrichment', {}).get('editorial_summary'),
    )
    for description in candidates:
        if description:
            return description
    return "Organizational website with heritage information"

def get_platform_name(entry: dict) -> str:
    """Build a display name for the platform ("<organisation> Website")."""
    original = entry.get('original_entry', {})
    wikidata = entry.get('wikidata_enrichment', {})
    # Prefer the registry's organisation name, then Wikidata labels
    # (Dutch before English).
    for name in (
        original.get('organisatie'),
        wikidata.get('wikidata_label_nl'),
        wikidata.get('wikidata_label_en'),
    ):
        if name:
            return f"{name} Website"
    return "Official Website"

def create_digital_platform(entry: dict) -> Optional[dict]:
    """Assemble the digital_platforms record for *entry*.

    Returns None when no website URL can be found, since a platform
    record without a URL is not useful.
    """
    website_url = get_website_url(entry)
    if not website_url:
        return None

    platform_type = detect_platform_type(entry)

    # Base category, plus a type-specific secondary category when one
    # applies to the detected platform type.
    categories = ["Organizational website"]
    extra_category = {
        'DISCOVERY_PORTAL': "Heritage discovery",
        'ONLINE_CATALOG': "Collection catalog",
        'VIRTUAL_MUSEUM': "Virtual exhibition",
    }.get(platform_type)
    if extra_category:
        categories.append(extra_category)

    return {
        'platform_name': get_platform_name(entry),
        'platform_url': website_url,
        'platform_type': platform_type,
        'platform_category': categories,
        'digital_collections': extract_digital_collections(entry),
        'technology_stack': detect_technology_stack(entry),
        'data_standards': detect_data_standards(entry),
        'user_services': extract_user_services(entry),
        'sustainability_model': extract_sustainability_model(entry),
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'source_method': 'automated_extraction',
    }

def process_entry(filepath: Path) -> tuple[bool, str]:
    """Enrich one entry file in place.

    Returns (True, message) when the file was updated, otherwise
    (False, reason). Exceptions are caught and reported as a reason
    string so a batch run keeps going past broken files.
    """
    try:
        entry = load_entry(filepath)

        # Nothing to do when the section is already populated.
        if 'digital_platforms' in entry and entry['digital_platforms']:
            return False, "already enriched"

        platform = create_digital_platform(entry)
        if platform is None:
            return False, "no website URL"

        entry['digital_platforms'] = [platform]
        save_entry(filepath, entry)
        return True, f"added {platform['platform_type']}"
    except Exception as exc:
        return False, f"error: {exc}"

def main():
    """CLI entry point: enrich one batch of entry files from a list file.

    Reads file paths from --input, windows them with --skip/--batch so
    runs can be resumed, and prints a summary plus the next --skip value.
    """
    parser = argparse.ArgumentParser(description="Fast batch enrich entries with digital_platforms")
    parser.add_argument('--input', '-i', type=str, required=True, help="Input file with list of entries")
    parser.add_argument('--batch', '-b', type=int, default=100, help="Batch size to process")
    parser.add_argument('--skip', '-s', type=int, default=0, help="Number of entries to skip")
    parser.add_argument('--dry-run', action='store_true', help="Don't actually write files")
    args = parser.parse_args()

    # Read list of files to process. Explicit encoding: the original
    # relied on the platform default, which can fail to decode UTF-8
    # paths on non-UTF-8 locales.
    with open(args.input, 'r', encoding='utf-8') as f:
        files = [Path(line.strip()) for line in f if line.strip()]

    print(f"Total entries in list: {len(files)}")

    # Apply skip and batch window
    files = files[args.skip:args.skip + args.batch]
    print(f"Processing {len(files)} entries (skip={args.skip}, batch={args.batch})")

    enriched = 0
    skipped = 0
    errors = 0

    for i, filepath in enumerate(files):
        if args.dry_run:
            print(f"[DRY] {filepath.name}")
            continue

        success, msg = process_entry(filepath)
        if success:
            enriched += 1
            # Periodic progress line so long batches show signs of life.
            if enriched % 10 == 0:
                print(f"Progress: {enriched} enriched, {i+1}/{len(files)}")
        elif "error" in msg:
            errors += 1
            print(f"ERROR {filepath.name}: {msg}")
        else:
            skipped += 1

    print(f"\nDone: {enriched} enriched, {skipped} skipped, {errors} errors")
    print(f"Next batch: --skip {args.skip + args.batch}")

if __name__ == "__main__":
|
|
main()
|