#!/usr/bin/env python3
"""
Manual batch processing helper for NDE Wikidata enrichment.

This script helps process the next batch of records interactively,
allowing me to search Wikidata and update records systematically.
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Batch to process
BATCH_START = 10  # Start after test batch
BATCH_END = 60  # Process 50 records

# Wikidata Q-numbers found (manual searches)
WIKIDATA_MATCHES = {
    "Gemeente Hoogeveen": "Q208012",
    "Gemeente Emmen": "Q14641",
    "Gemeente Meppel": "Q60425",  # Already correct from search
    "Gemeente Midden-Drenthe": "Q835125",
    "Gemeente Noordenveld": "Q835083",
    # Add more as found...
}

# Default location of the YAML list that main() reads and rewrites.
DEFAULT_DATA_PATH = Path(
    "/Users/kempersc/apps/glam/data/nde/"
    "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
)


def _enrich_batch(orgs, matches, start, end):
    """Set missing ``wikidata_id`` values on ``orgs[start:end]``.

    Prints one status line per record (numbered from 1) and mutates the
    matching records in place.

    Args:
        orgs: List of record dicts loaded from the YAML file.
        matches: Mapping of organisation name to Wikidata Q-number.
        start: Index of the first record to process (inclusive).
        end: Index one past the last record to process; must already be
            clamped to ``len(orgs)``.

    Returns:
        The number of records that received a new ``wikidata_id``.
    """
    updated_count = 0
    for number, record in enumerate(orgs[start:end], start=start + 1):
        if not isinstance(record, dict):
            # A malformed entry should not abort the rest of the batch.
            print(f"{number:3d}. ? {'':50s} (skipped: not a mapping)")
            continue

        # The key may be present with a null value, so fall back to '' for
        # both "missing" and "None" before slicing/formatting the name.
        org_name = str(record.get('organisatie') or '')
        q_number = matches.get(org_name)

        if q_number is None:
            print(f"{number:3d}. ? {org_name[:50]:50s} (not in mapping)")
        elif record.get('wikidata_id'):
            # Never overwrite an identifier that is already present.
            print(f"{number:3d}. ○ {org_name[:50]:50s} (already enriched)")
        else:
            record['wikidata_id'] = q_number
            updated_count += 1
            print(f"{number:3d}. ✓ {org_name[:50]:50s} → {q_number}")
    return updated_count


def main(data_path=None, batch_start=BATCH_START, batch_end=BATCH_END,
         matches=None):
    """Update YAML with batch results.

    Loads the organisation list, makes a timestamped backup copy next to it,
    fills in ``wikidata_id`` for the records in the requested index range
    whose ``organisatie`` name appears in the mapping, rewrites the file if
    anything changed, and prints a summary.

    Args:
        data_path: YAML file to process. Defaults to ``DEFAULT_DATA_PATH``.
        batch_start: Index of the first record to process (inclusive).
        batch_end: Index one past the last record to process. Clamped to the
            number of records in the file.
        matches: Mapping of organisation name to Wikidata Q-number. Defaults
            to ``WIKIDATA_MATCHES``.

    Raises:
        TypeError: If the top level of the YAML document is not a list.
    """
    data_path = Path(DEFAULT_DATA_PATH if data_path is None else data_path)
    if matches is None:
        matches = WIKIDATA_MATCHES

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        orgs = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat that as "no records".
    if orgs is None:
        orgs = []
    if not isinstance(orgs, list):
        raise TypeError(
            f"Expected a list of records in {data_path.name}, "
            f"got {type(orgs).__name__}"
        )

    # Create backup as a byte-for-byte copy so comments and formatting of the
    # original file survive (re-dumping the parsed data would lose them).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    backup_path = data_path.parent / f"{data_path.stem}.backup.{timestamp}.yaml"
    print(f"Creating backup: {backup_path.name}")
    shutil.copy2(data_path, backup_path)

    # Clamp the requested range to the records that actually exist so the
    # summary below reports what was really processed.
    start = max(0, min(batch_start, len(orgs)))
    end = max(start, min(batch_end, len(orgs)))
    processed = end - start

    # Update records
    updated_count = _enrich_batch(orgs, matches, start, end)

    # Save updated YAML
    if updated_count > 0:
        print(f"\nSaving {updated_count} updates...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False,
                      sort_keys=False)
        print("✅ Saved!")
    else:
        print("\n⚠️ No updates to save")

    # Show summary
    success_rate = updated_count / processed * 100 if processed else 0.0
    print(f"\nBatch {start+1}-{end} Summary:")
    print(f" Updated: {updated_count}")
    print(f" Batch size: {processed}")
    print(f" Success rate: {success_rate:.1f}%")


if __name__ == "__main__":
    main()