glam/scripts/batch_update_helper.py
2025-11-19 23:25:22 +01:00

74 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
Manual batch-processing helper for NDE Wikidata enrichment.
This script applies Wikidata Q-numbers that were found through manual
searches to the next batch of records, so the list is enriched systematically.
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Record-index window handled by this run; main() iterates
# range(BATCH_START, BATCH_END), so the end index is exclusive.
BATCH_START = 10  # start after the test batch
BATCH_END = BATCH_START + 50  # process 50 records

# Organisation name -> Wikidata Q-number, as found by manual searches.
# Keys are compared verbatim against each record's 'organisatie' value.
WIKIDATA_MATCHES = {
    "Gemeente Hoogeveen": "Q208012",
    "Gemeente Emmen": "Q14641",
    "Gemeente Meppel": "Q60425",  # already correct from search
    "Gemeente Midden-Drenthe": "Q835125",
    "Gemeente Noordenveld": "Q835083",
    # Add more as found...
}
def main():
    """Apply the manually collected Wikidata matches to one batch of records.

    Loads the organisation list from the YAML file, copies that file to a
    timestamped backup next to it, and then walks the records with index in
    ``range(BATCH_START, min(BATCH_END, len(orgs)))``. A record gets its
    ``wikidata_id`` set from ``WIKIDATA_MATCHES`` when its ``organisatie``
    name is a key of the mapping and it has no (truthy) ``wikidata_id`` yet.
    The YAML file is rewritten only when at least one record changed. One
    status line is printed per record, followed by a batch summary.

    Raises:
        SystemExit: If the top level of the YAML document is not a list
            (for example, the file is empty).
    """
    import shutil  # local import: only needed for the backup copy below

    data_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml")

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        orgs = yaml.safe_load(f)

    # safe_load yields None for an empty document; fail with a clear message
    # instead of an opaque TypeError on len()/indexing further down.
    if not isinstance(orgs, list):
        raise SystemExit(
            f"Expected a list of records in {data_path.name}, "
            f"got {type(orgs).__name__}"
        )

    # Create backup as a byte-exact copy, so comments and original formatting
    # survive (re-dumping the parsed data would discard them).
    backup_path = data_path.parent / f"{data_path.stem}.backup.{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.yaml"
    print(f"Creating backup: {backup_path.name}")
    shutil.copy2(data_path, backup_path)

    # Update records
    batch_stop = min(BATCH_END, len(orgs))
    # Number of records actually visited; may be smaller than the configured
    # window when the file is short, and is 0 for an empty window.
    processed_count = max(0, batch_stop - BATCH_START)
    updated_count = 0
    for i in range(BATCH_START, batch_stop):
        record = orgs[i]
        if not isinstance(record, dict):
            print(f"{i+1:3d}. ! (skipped: record is not a mapping)")
            continue

        # The key may be present with a null value; normalise to a string so
        # the slicing and formatting below cannot fail.
        raw_name = record.get('organisatie')
        org_name = '' if raw_name is None else str(raw_name)
        label = f"{org_name[:50]:50s}"

        q_number = WIKIDATA_MATCHES.get(org_name)
        if q_number is None:
            print(f"{i+1:3d}. ? {label} (not in mapping)")
        elif record.get('wikidata_id'):
            # Never overwrite an existing identifier.
            print(f"{i+1:3d}. ○ {label} (already enriched)")
        else:
            record['wikidata_id'] = q_number
            updated_count += 1
            # Space before the Q-number keeps 50-character names readable.
            print(f"{i+1:3d}. ✓ {label} {q_number}")

    # Save updated YAML
    if updated_count > 0:
        print(f"\nSaving {updated_count} updates...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print("✅ Saved!")
    else:
        print("\n⚠️ No updates to save")

    # Show summary. The header names the configured window; size and rate are
    # based on the records that were actually processed.
    success_rate = updated_count / processed_count * 100 if processed_count else 0.0
    print(f"\nBatch {BATCH_START+1}-{BATCH_END} Summary:")
    print(f" Updated: {updated_count}")
    print(f" Batch size: {processed_count}")
    print(f" Success rate: {success_rate:.1f}%")


if __name__ == "__main__":
    main()