glam/scripts/update_nde_yaml_with_wikidata_test_batch.py

#!/usr/bin/env python3
"""
Update NDE YAML file with Wikidata IDs - Test Batch (First 10 Records)

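This script updates the first 10 records in the NDE YAML file based on the
enrichment results from our test batch: matched records receive a wikidata_id
field, and records without a match are marked with
wikidata_enrichment_status: no_match_found.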

Test Batch Results:
1. Stichting Herinneringscentrum Kamp Westerbork → Q22246632
2. Stichting Hunebedcentrum → Q2679819
3. Regionaal Historisch Centrum (RHC) Drents Archief → Q1978308
4. Stichting Drents Museum → Q1258370
5. Stichting Drents Museum De Buitenplaats → No match
6. Gemeente Aa en Hunze → Q300665
7. Gemeente Borger-Odoorn → Q835118
8. Gemeente Coevorden → Q60453
9. Gemeente De Wolden → Q835108
10. Samenwerkingsorganisatie De Wolden/Hoogeveen → No match

Success Rate: 8/10 (80%)
"""

import yaml
from pathlib import Path
from datetime import datetime, timezone
import json

# Wikidata mapping for test batch (first 10 records).
#
# Keys are compared for exact equality against each record's 'organisatie'
# value, so they must match the YAML spelling character for character.
# Values are the Wikidata entity ID to store as 'wikidata_id'; None means the
# organisation was looked up but no entity was found, in which case the record
# is tagged with wikidata_enrichment_status = 'no_match_found' instead.
# The whole mapping is also copied verbatim into the enrichment log.
WIKIDATA_MAPPING = {
    "Stichting Herinneringscentrum Kamp Westerbork": "Q22246632",
    "Stichting Hunebedcentrum": "Q2679819",
    "Regionaal Historisch Centrum (RHC) Drents Archief": "Q1978308",
    "Stichting Drents Museum": "Q1258370",
    "Stichting Drents Museum De Buitenplaats": None,  # No match found
    "Gemeente Aa en Hunze": "Q300665",
    "Gemeente Borger-Odoorn": "Q835118",
    "Gemeente Coevorden": "Q60453",
    "Gemeente De Wolden": "Q835108",
    "Samenwerkingsorganisatie De Wolden/Hoogeveen": None,  # No match found
}

def _apply_wikidata_mapping(records, mapping):
    """Annotate *records* in place from *mapping* and report the outcome.

    Args:
        records: Sequence of parsed YAML records (dicts) to annotate.
        mapping: Organisation name -> Wikidata ID, or ``None`` when the
            organisation was looked up without finding a match.

    Returns:
        An ``(enriched, no_match)`` tuple with the number of records that
        received a ``wikidata_id`` and the number tagged as ``no_match_found``.
    """
    enriched = 0
    no_match = 0

    for position, org in enumerate(records, start=1):
        # A malformed entry (e.g. a bare string) has no ``.get``; report it
        # and move on rather than aborting the whole batch.
        if not isinstance(org, dict):
            print(f"{position}. ? {org!r} → Not a mapping record, skipped")
            continue

        org_name = org.get('organisatie', '')

        if org_name not in mapping:
            print(f"{position}. ? {org_name} → Not in mapping")
            continue

        wikidata_id = mapping[org_name]
        if wikidata_id:
            org['wikidata_id'] = wikidata_id
            enriched += 1
            print(f"{position}. ✓ {org_name} → {wikidata_id}")
        else:
            # Record that the lookup happened so the record is not retried
            # blindly later.
            org['wikidata_enrichment_status'] = 'no_match_found'
            no_match += 1
            print(f"{position}. ✗ {org_name} → No match")

    return enriched, no_match


def main():
    """Update YAML file with Wikidata IDs.

    Loads the NDE organisation list, copies the untouched file to a
    timestamped backup, applies ``WIKIDATA_MAPPING`` to the first 10 records,
    writes the updated list back to the same path, and stores a JSON
    enrichment log under ``data/nde/sparql``.

    Raises:
        TypeError: If the top level of the YAML document is not a list
            (this includes an empty file, which parses to ``None``).
        OSError: If the YAML file cannot be read, or the backup, YAML or log
            file cannot be written.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained with respect to the file's import section.
    import shutil

    batch_size = 10

    # Take a single timestamp so the backup and the log of one run share the
    # same suffix and can be correlated afterwards.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "nde"
    yaml_path = data_dir / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    backup_path = data_dir / f"voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.{timestamp}.yaml"
    log_path = data_dir / "sparql" / f"enrichment_log_test_batch_{timestamp}.json"

    print(f"Reading YAML file: {yaml_path}")

    # Load YAML file
    with open(yaml_path, 'r', encoding='utf-8') as f:
        organizations = yaml.safe_load(f)

    # safe_load returns None for an empty document; fail with a clear message
    # instead of an opaque error from len() or slicing further down.
    if not isinstance(organizations, list):
        raise TypeError(
            f"Expected a top-level list of organizations in {yaml_path}, "
            f"got {type(organizations).__name__}"
        )

    print(f"Loaded {len(organizations)} organizations")

    # Make sure the log directory exists *before* the source file is touched,
    # so a missing or unwritable directory cannot leave a half-finished run.
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Create backup as a byte-for-byte copy: re-dumping the parsed data would
    # drop comments and original formatting and so would not be a real backup.
    print(f"Creating backup: {backup_path}")
    shutil.copy2(yaml_path, backup_path)

    # Update the first records with Wikidata IDs. The file may hold fewer
    # records than the batch size, so count what was actually processed.
    batch = organizations[:batch_size]
    processed = len(batch)
    enriched_count, no_match_count = _apply_wikidata_mapping(batch, WIKIDATA_MAPPING)

    # Guard against division by zero when the list is empty.
    success_rate = f"{enriched_count / processed * 100:.1f}%" if processed else "0.0%"

    # Save updated YAML (the batch holds references to the same record dicts,
    # so the in-place changes are part of ``organizations``).
    print(f"\nSaving updated YAML to: {yaml_path}")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(organizations, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Create enrichment log
    enrichment_log = {
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": "Wikidata MCP service - manual search and SPARQL queries",
        "records_processed": processed,
        "records_enriched": enriched_count,
        "records_no_match": no_match_count,
        "success_rate": success_rate,
        "wikidata_mapping": dict(WIKIDATA_MAPPING),
        "notes": [
            "Test batch of first 10 records from NDE CSV",
            "Used Wikidata search API and SPARQL queries for verification",
            "No match found for 'De Buitenplaats' museum (branch location, likely not in Wikidata)",
            "No match found for 'Samenwerkingsorganisatie' (collaborative organization, likely not in Wikidata)"
        ]
    }

    print(f"\nCreating enrichment log: {log_path}")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(enrichment_log, f, indent=2, ensure_ascii=False)

    print("\n" + "="*60)
    print("ENRICHMENT COMPLETE")
    print("="*60)
    print(f"Records processed: {processed}")
    print(f"Records enriched: {enriched_count}")
    print(f"Records with no match: {no_match_count}")
    print(f"Success rate: {success_rate}")
    print(f"\nBackup saved to: {backup_path}")
    print(f"Enrichment log: {log_path}")

# Run the update only when this file is executed as a script; importing the
# module must not read or rewrite the YAML data file.
if __name__ == "__main__":
    main()