glam/scripts/update_nde_batch_2.py
2025-11-19 23:25:22 +01:00

100 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
Update NDE YAML with Batch 2 enrichment results (records 11-17).
Wikidata Q-numbers found through manual searches and SPARQL queries.
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 2 results: records 11-17 (municipalities in Drenthe).
# Maps the organisation name, exactly as it appears in a record's
# 'organisatie' field, to the Wikidata Q-number to store for that record.
BATCH_2_MAPPING = {
    "Gemeente Hoogeveen": "Q208012",
    "Gemeente Emmen": "Q14641",
    "Gemeente Meppel": "Q60425",
    "Gemeente Midden-Drenthe": "Q835125",
    "Gemeente Noordenveld": "Q835083",
    "Gemeente Westerveld": "Q747920",
    "Gemeente Tynaarlo": "Q840457",
}
def _count_enriched(orgs):
    """Return how many records in *orgs* carry a truthy ``wikidata_id``."""
    return sum(1 for o in orgs if isinstance(o, dict) and o.get('wikidata_id'))


def main():
    """Apply the Batch 2 Wikidata Q-numbers to the NDE organisations YAML.

    Steps:
      1. Load the organisation list from ``data/nde/`` (resolved relative to
         the parent of this script's directory).
      2. Write a timestamped, byte-exact backup copy next to the data file.
      3. For every record whose ``organisatie`` value is a key of
         ``BATCH_2_MAPPING`` and that has no ``wikidata_id`` yet, set
         ``wikidata_id`` to the mapped Q-number. Records that already have
         one are left untouched and only counted.
      4. Rewrite the data file only if at least one record changed.
      5. Print a summary, including mapping names that matched no record.

    Raises:
        ValueError: if the YAML document is not a list of records.
    """
    import shutil  # local import: only needed for the backup copy below

    base_dir = Path(__file__).parent.parent
    data_path = base_dir / "data" / "nde" / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"

    print("=" * 80)
    print("NDE WIKIDATA ENRICHMENT - BATCH 2 UPDATE")
    print("=" * 80)
    print()

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        orgs = yaml.safe_load(f)
    if orgs is None:
        # An empty YAML document parses to None; treat it as "no records".
        orgs = []
    if not isinstance(orgs, list):
        raise ValueError(
            f"Expected a YAML list of organizations in {data_path.name}, "
            f"got {type(orgs).__name__}"
        )
    print(f"Total organizations: {len(orgs)}")
    print()

    # Create backup. Copy the file itself rather than re-dumping the parsed
    # data, so the backup keeps the original bytes (comments, formatting).
    backup_path = data_path.parent / f"{data_path.stem}.backup.{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.yaml"
    print(f"Creating backup: {backup_path.name}")
    shutil.copy2(data_path, backup_path)
    print()

    # Update records
    updated_count = 0
    already_enriched = 0
    matched_names = set()
    print("Updating records:")
    print("-" * 80)
    for position, org in enumerate(orgs, start=1):
        if not isinstance(org, dict):
            # Malformed entry (not a mapping): nothing to enrich.
            continue
        org_name = org.get('organisatie', '')
        if org_name not in BATCH_2_MAPPING:
            continue
        matched_names.add(org_name)
        q_number = BATCH_2_MAPPING[org_name]
        if org.get('wikidata_id'):
            # Never overwrite an existing identifier.
            already_enriched += 1
            print(f"{position:3d}. ○ {org_name[:50]:50s} (already enriched: {org['wikidata_id']})")
        else:
            org['wikidata_id'] = q_number
            updated_count += 1
            print(f"{position:3d}. ✓ {org_name[:50]:50s}{q_number}")
    print()
    print("=" * 80)

    # Save if updates made
    # NOTE: yaml.dump re-serialises the parsed data, so comments and the
    # original layout of the data file are not preserved on save.
    if updated_count > 0:
        print(f"Saving {updated_count} updates to {data_path.name}...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print("✅ Saved successfully!")
    else:
        print("⚠️ No new updates to save")
    print()

    # Count once; the summary and the progress line both use it.
    total_enriched = _count_enriched(orgs)
    unmatched = [name for name in BATCH_2_MAPPING if name not in matched_names]

    print("=" * 80)
    print("BATCH 2 SUMMARY")
    print("=" * 80)
    print(f"Records in mapping: {len(BATCH_2_MAPPING)}")
    print(f"Newly enriched: {updated_count}")
    print(f"Already enriched: {already_enriched}")
    if unmatched:
        # Surface mapping entries that matched no record (e.g. a name typo).
        print(f"Not found in file: {len(unmatched)}")
        for name in unmatched:
            print(f"  - {name}")
    print(f"Total enriched so far: {total_enriched}")
    print(f"Remaining records: {len(orgs) - total_enriched}")
    print()

    # Calculate overall progress (guard against an empty list).
    progress_pct = (total_enriched / len(orgs)) * 100 if orgs else 0.0
    print(f"OVERALL PROGRESS: {total_enriched}/{len(orgs)} ({progress_pct:.1f}%)")
    print()
# Run the update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()