#!/usr/bin/env python3
"""
Update NDE YAML file with Wikidata IDs - Test Batch (First 10 Records)

This script adds wikidata_id fields to the first 10 records in the NDE YAML file
based on the enrichment results from our test batch.

Test Batch Results:
1. Stichting Herinneringscentrum Kamp Westerbork → Q22246632
2. Stichting Hunebedcentrum → Q2679819
3. Regionaal Historisch Centrum (RHC) Drents Archief → Q1978308
4. Stichting Drents Museum → Q1258370
5. Stichting Drents Museum De Buitenplaats → No match
6. Gemeente Aa en Hunze → Q300665
7. Gemeente Borger-Odoorn → Q835118
8. Gemeente Coevorden → Q60453
9. Gemeente De Wolden → Q835108
10. Samenwerkingsorganisatie De Wolden/Hoogeveen → No match

Success Rate: 8/10 (80%)
"""

import json
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Number of leading records in the YAML file that belong to the test batch.
BATCH_SIZE = 10

# Wikidata mapping for test batch (first 10 records).
# A value of None records that a lookup was done and no Wikidata item matched.
WIKIDATA_MAPPING = {
    "Stichting Herinneringscentrum Kamp Westerbork": "Q22246632",
    "Stichting Hunebedcentrum": "Q2679819",
    "Regionaal Historisch Centrum (RHC) Drents Archief": "Q1978308",
    "Stichting Drents Museum": "Q1258370",
    "Stichting Drents Museum De Buitenplaats": None,  # No match found
    "Gemeente Aa en Hunze": "Q300665",
    "Gemeente Borger-Odoorn": "Q835118",
    "Gemeente Coevorden": "Q60453",
    "Gemeente De Wolden": "Q835108",
    "Samenwerkingsorganisatie De Wolden/Hoogeveen": None,  # No match found
}


def _apply_wikidata_ids(organizations: list, mapping: dict, limit: int) -> tuple:
    """Annotate the first *limit* records in place and report the outcome.

    For each record whose ``organisatie`` value is a key of *mapping*:
    a truthy mapped value is stored under ``wikidata_id``; a falsy one
    (``None``) sets ``wikidata_enrichment_status`` to ``'no_match_found'``.
    Records whose name is not in *mapping* are left untouched. One progress
    line is printed per record.

    Args:
        organizations: Parsed YAML records (expected to be dicts).
        mapping: Organisation name -> Wikidata ID, or None for "no match".
        limit: Maximum number of leading records to look at.

    Returns:
        A ``(processed, enriched, no_match)`` tuple of counts, where
        ``processed`` is the number of records actually examined (it is
        smaller than *limit* when the list is shorter).
    """
    batch = organizations[:limit]
    enriched_count = 0
    no_match_count = 0

    for position, org in enumerate(batch, start=1):
        if not isinstance(org, dict):
            # A malformed entry must not abort the run before anything is saved.
            print(f"{position}. ? {org!r} → Not a mapping record, skipped")
            continue

        org_name = org.get('organisatie', '')
        if org_name not in mapping:
            print(f"{position}. ? {org_name} → Not in mapping")
            continue

        wikidata_id = mapping[org_name]
        if wikidata_id:
            org['wikidata_id'] = wikidata_id
            enriched_count += 1
            print(f"{position}. ✓ {org_name} → {wikidata_id}")
        else:
            # Add enrichment note for records with no match
            org['wikidata_enrichment_status'] = 'no_match_found'
            no_match_count += 1
            print(f"{position}. ✗ {org_name} → No match")

    return len(batch), enriched_count, no_match_count


def main():
    """Update YAML file with Wikidata IDs.

    Steps, in order: load the NDE YAML file, copy the untouched file to a
    timestamped backup, annotate the first ``BATCH_SIZE`` records using
    ``WIKIDATA_MAPPING``, overwrite the YAML file with the updated records,
    and write a JSON enrichment log under ``data/nde/sparql``.

    Raises:
        ValueError: If the YAML document is not a list of records.
        OSError: If any of the files cannot be read or written.
    """
    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "nde"
    yaml_path = data_dir / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    # One timestamp for the whole run so backup and log file names line up.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    backup_path = data_dir / f"voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.{timestamp}.yaml"
    log_path = data_dir / "sparql" / f"enrichment_log_test_batch_{timestamp}.json"

    print(f"Reading YAML file: {yaml_path}")

    # Load YAML file
    with open(yaml_path, 'r', encoding='utf-8') as f:
        organizations = yaml.safe_load(f)

    # safe_load yields None for an empty file and a dict/scalar for other
    # shapes; fail with a clear message instead of an obscure TypeError later.
    if not isinstance(organizations, list):
        raise ValueError(
            f"Expected a list of organizations in {yaml_path}, "
            f"got {type(organizations).__name__}"
        )

    print(f"Loaded {len(organizations)} organizations")

    # Create backup as a byte-for-byte copy: re-dumping the parsed data would
    # drop comments and original formatting, so it would not be a true backup.
    print(f"Creating backup: {backup_path}")
    shutil.copy2(yaml_path, backup_path)

    # Update first BATCH_SIZE records with Wikidata IDs
    processed_count, enriched_count, no_match_count = _apply_wikidata_ids(
        organizations, WIKIDATA_MAPPING, BATCH_SIZE
    )
    success_pct = enriched_count / processed_count * 100 if processed_count else 0.0

    # Make sure the log directory exists *before* the YAML file is rewritten,
    # so a missing directory cannot leave an updated file without its log.
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Save updated YAML
    print(f"\nSaving updated YAML to: {yaml_path}")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(organizations, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Create enrichment log
    enrichment_log = {
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": "Wikidata MCP service - manual search and SPARQL queries",
        "records_processed": processed_count,
        "records_enriched": enriched_count,
        "records_no_match": no_match_count,
        "success_rate": f"{success_pct:.1f}%",
        "wikidata_mapping": dict(WIKIDATA_MAPPING),
        "notes": [
            "Test batch of first 10 records from NDE CSV",
            "Used Wikidata search API and SPARQL queries for verification",
            "No match found for 'De Buitenplaats' museum (branch location, likely not in Wikidata)",
            "No match found for 'Samenwerkingsorganisatie' (collaborative organization, likely not in Wikidata)"
        ]
    }

    print(f"\nCreating enrichment log: {log_path}")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(enrichment_log, f, indent=2, ensure_ascii=False)

    print("\n" + "="*60)
    print("ENRICHMENT COMPLETE")
    print("="*60)
    print(f"Records processed: {processed_count}")
    print(f"Records enriched: {enriched_count}")
    print(f"Records with no match: {no_match_count}")
    print(f"Success rate: {success_pct:.1f}%")
    print(f"\nBackup saved to: {backup_path}")
    print(f"Enrichment log: {log_path}")


if __name__ == "__main__":
    main()