#!/usr/bin/env python3
"""
Update NDE YAML with Batch 3 enrichment results (records 18-26).

Batch 3 focuses on specialty museums and historical societies in Drenthe.
Success rate: 33% (3/9 organizations matched).
"""

from datetime import datetime, timezone
from pathlib import Path

import yaml

# Batch 3 results: Records 18-26 (specialty museums + historical societies)
BATCH_3_MAPPING = {
    # Museums - 3 matches found
    "Stichting Exploitatie Industrieel Smalspoormuseum": "Q1911968",  # Record 23
    "Stichting Zeemuseum Miramar": "Q22006174",  # Record 24
    "Stichting Cultuurhistorisch Streek- en Handkarren Museum De Wemme": "Q56461228",  # Record 26
}

# Organizations with no Wikidata match (will mark as no_match_found)
NO_MATCH_ORGS = [
    "Stichting Harmonium Museum Nederland",  # Record 18 - museum closed/relocated
    "Historische Vereniging Carspel Oderen",  # Record 19
    "Historische Vereniging De Wijk Koekange",  # Record 20
    "Historische Kring Hoogeveen",  # Record 21
    "Historische Vereniging Nijeveen",  # Record 22
    "Museum de Proefkolonie",  # Record 25 - couldn't find specific museum
]


def main():
    """Apply the Batch 3 Wikidata results to the NDE organization YAML file.

    Steps, in order:

    1. Load the organization list from ``data/nde/`` (resolved relative to
       the parent of this script's directory).
    2. Copy the untouched file to a timestamped ``.backup.<UTC time>.yaml``
       sibling.
    3. For every record whose ``organisatie`` name is a key of
       ``BATCH_3_MAPPING``, set ``wikidata_id`` unless one is already present.
       For every name listed in ``NO_MATCH_ORGS``, set
       ``wikidata_enrichment_status`` to ``'no_match_found'`` unless a status
       is already present.
    4. Rewrite the data file only when at least one record changed.
    5. Print a batch summary and the overall enrichment progress.

    Raises:
        SystemExit: if the YAML document is not a list (this includes an
            empty file, for which ``yaml.safe_load`` returns ``None``).
    """
    base_dir = Path(__file__).parent.parent
    data_path = (
        base_dir
        / "data"
        / "nde"
        / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    )

    print("=" * 80)
    print("NDE WIKIDATA ENRICHMENT - BATCH 3 UPDATE")
    print("=" * 80)
    print()

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        orgs = yaml.safe_load(f)

    # safe_load returns None for an empty document and a dict for a mapping
    # root; fail with a clear message instead of an opaque TypeError or
    # AttributeError further down.
    if not isinstance(orgs, list):
        raise SystemExit(
            f"Expected a YAML list of organizations in {data_path.name}, "
            f"got {type(orgs).__name__}"
        )

    print(f"Total organizations: {len(orgs)}")
    print()

    # Create backup
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    backup_path = data_path.parent / f"{data_path.stem}.backup.{timestamp}.yaml"
    print(f"Creating backup: {backup_path.name}")
    # Copy the raw bytes so the backup is identical to the original file;
    # re-dumping the parsed data would drop comments and original formatting.
    backup_path.write_bytes(data_path.read_bytes())
    print()

    # Update records
    updated_count = 0
    already_enriched = 0
    no_match_count = 0
    no_match_names = set(NO_MATCH_ORGS)

    print("Updating records:")
    print("-" * 80)

    for i, org in enumerate(orgs):
        # Tolerate malformed entries (e.g. a bare `-` that parses to None).
        if not isinstance(org, dict):
            continue
        org_name = org.get('organisatie', '')
        if not isinstance(org_name, str):
            continue

        # Add Q-numbers for matches
        if org_name in BATCH_3_MAPPING:
            q_number = BATCH_3_MAPPING[org_name]
            if org.get('wikidata_id'):
                # Never overwrite an identifier that is already recorded.
                already_enriched += 1
                print(f"{i+1:3d}. ○ {org_name[:50]:50s} (already: {org['wikidata_id']})")
            else:
                org['wikidata_id'] = q_number
                updated_count += 1
                print(f"{i+1:3d}. ✓ {org_name[:50]:50s} → {q_number}")

        # Mark no-matches
        elif org_name in no_match_names:
            if not org.get('wikidata_enrichment_status'):
                org['wikidata_enrichment_status'] = 'no_match_found'
                no_match_count += 1
                print(f"{i+1:3d}. ✗ {org_name[:50]:50s} (no match)")

    print()
    print("=" * 80)

    # Save if updates made
    total_changes = updated_count + no_match_count
    if total_changes > 0:
        print(f"Saving {total_changes} updates to {data_path.name}...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print("✅ Saved successfully!")
    else:
        print("⚠️ No new updates to save")

    batch_size = len(BATCH_3_MAPPING) + len(NO_MATCH_ORGS)
    success_pct = updated_count / batch_size * 100 if batch_size else 0.0

    print()
    print("=" * 80)
    print("BATCH 3 SUMMARY")
    print("=" * 80)
    print(f"Organizations processed: {batch_size}")
    print(f" Newly enriched: {updated_count}")
    print(f" No matches found: {no_match_count}")
    print(f" Already enriched: {already_enriched}")
    print()
    print(f"Batch 3 success rate: {updated_count}/{batch_size} ({success_pct:.1f}%)")
    print()

    # Calculate overall progress
    total_enriched = sum(
        1 for o in orgs if isinstance(o, dict) and o.get('wikidata_id')
    )
    # An empty list would otherwise raise ZeroDivisionError.
    progress_pct = (total_enriched / len(orgs)) * 100 if orgs else 0.0
    print(f"OVERALL PROGRESS: {total_enriched}/{len(orgs)} ({progress_pct:.1f}%)")
    print(f"Remaining records: {len(orgs) - total_enriched}")
    print()


if __name__ == "__main__":
    main()