74 lines
2.6 KiB
Python
74 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Manual batch processing helper for NDE Wikidata enrichment.
|
|
|
|
This script helps process the next batch of records interactively,
|
|
allowing me to search Wikidata and update records systematically.
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Batch to process: record indices in the half-open window
# [BATCH_START, BATCH_END) of the loaded list.
BATCH_START = 10  # Start after test batch
BATCH_END = BATCH_START + 50  # Process 50 records
|
|
|
|
# Organisation name (as it appears in the 'organisatie' field) mapped to the
# Wikidata Q-number found for it by manual search.
WIKIDATA_MATCHES = {
    "Gemeente Hoogeveen": "Q208012",
    "Gemeente Emmen": "Q14641",
    "Gemeente Meppel": "Q60425",  # Already correct from search
    "Gemeente Midden-Drenthe": "Q835125",
    "Gemeente Noordenveld": "Q835083",
    # Add more as found...
}
|
|
|
|
def main():
    """Update YAML with batch results.

    Loads the organisation list from the YAML data file, writes a
    timestamped backup copy next to it, and then walks the records in the
    ``[BATCH_START, BATCH_END)`` window. A record whose ``organisatie`` name
    is a key of ``WIKIDATA_MATCHES`` and that has no ``wikidata_id`` yet gets
    the mapped Q-number. The file is rewritten only when at least one record
    changed. A per-record log line and a final summary are printed.

    The summary's batch size and success rate are based on the number of
    records actually present in the window, so a file shorter than
    ``BATCH_END`` (or an empty window) is reported correctly instead of
    skewing the rate or dividing by zero.
    """
    data_path = Path("/Users/kempersc/apps/glam/data/nde/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml")

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat that as "no records".
        orgs = yaml.safe_load(f) or []

    # Create backup
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    backup_path = data_path.parent / f"{data_path.stem}.backup.{timestamp}.yaml"
    print(f"Creating backup: {backup_path.name}")
    # Copy the file byte-for-byte: re-dumping the parsed data would not be a
    # faithful backup, since the dump does not reproduce the original text.
    backup_path.write_bytes(data_path.read_bytes())

    # Update records
    # Slicing clamps to the list length, so a short file is handled safely.
    batch = orgs[BATCH_START:BATCH_END]
    updated_count = 0
    for number, org in enumerate(batch, start=BATCH_START + 1):
        # `or ''` also covers an explicit null value for the key.
        org_name = org.get('organisatie') or ''

        q_number = WIKIDATA_MATCHES.get(org_name)
        if q_number is None:
            print(f"{number:3d}. ? {org_name[:50]:50s} (not in mapping)")
            continue

        if org.get('wikidata_id'):
            # Never overwrite an ID that is already present.
            print(f"{number:3d}. ○ {org_name[:50]:50s} (already enriched)")
            continue

        org['wikidata_id'] = q_number
        updated_count += 1
        print(f"{number:3d}. ✓ {org_name[:50]:50s} → {q_number}")

    # Save updated YAML
    if updated_count > 0:
        print(f"\nSaving {updated_count} updates...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print("✅ Saved!")
    else:
        print("\n⚠️ No updates to save")

    # Show summary
    processed_count = len(batch)
    success_rate = updated_count / processed_count * 100 if processed_count else 0.0
    print(f"\nBatch {BATCH_START+1}-{BATCH_END} Summary:")
    print(f" Updated: {updated_count}")
    print(f" Batch size: {processed_count}")
    print(f" Success rate: {success_rate:.1f}%")
|
|
|
|
# Run the batch update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|