120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Update NDE YAML with Batch 3 enrichment results (records 18-26).

Batch 3 focuses on specialty museums and historical societies in Drenthe.
Success rate: 33% (3/9 organizations matched).
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Wikidata matches from batch 3 (records 18-26: specialty museums and
# historical societies). Only three museums were matched.
# Each source row is (record number, organisation name, Wikidata Q-number);
# the resulting dict maps organisation name -> Q-number.
BATCH_3_MAPPING = {
    name: q_number
    for _record, name, q_number in (
        (23, "Stichting Exploitatie Industrieel Smalspoormuseum", "Q1911968"),
        (24, "Stichting Zeemuseum Miramar", "Q22006174"),
        (26, "Stichting Cultuurhistorisch Streek- en Handkarren Museum De Wemme", "Q56461228"),
    )
}
|
|
|
|
# Batch 3 organisations for which no Wikidata entry was found; these get
# marked as no_match_found.
# Each source row is (record number, organisation name); the resulting list
# holds the names only, in record order.
NO_MATCH_ORGS = [
    name
    for _record, name in (
        (18, "Stichting Harmonium Museum Nederland"),  # museum closed/relocated
        (19, "Historische Vereniging Carspel Oderen"),
        (20, "Historische Vereniging De Wijk Koekange"),
        (21, "Historische Kring Hoogeveen"),
        (22, "Historische Vereniging Nijeveen"),
        (25, "Museum de Proefkolonie"),  # couldn't find specific museum
    )
]
|
|
|
|
def main():
    """Apply the Batch 3 Wikidata enrichment results to the NDE YAML file.

    Loads the organisation list from ``data/nde/`` under the parent of this
    script's directory, stores a timestamped backup next to it, and then:

    * sets ``wikidata_id`` on every record whose ``organisatie`` name is a
      key of ``BATCH_3_MAPPING`` and that has no ``wikidata_id`` yet;
    * sets ``wikidata_enrichment_status`` to ``'no_match_found'`` on every
      record named in ``NO_MATCH_ORGS`` that has no status yet.

    The data file is rewritten only when at least one record changed. A
    batch summary and the overall enrichment progress are printed either way.

    Raises:
        SystemExit: if the YAML document is not a list (for example when the
            file is empty and ``yaml.safe_load`` returns ``None``).
    """
    import shutil  # local import: only needed for the backup copy below

    base_dir = Path(__file__).parent.parent
    data_path = base_dir / "data" / "nde" / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"

    print("=" * 80)
    print("NDE WIKIDATA ENRICHMENT - BATCH 3 UPDATE")
    print("=" * 80)
    print()

    # Load data
    print(f"Loading {data_path.name}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        orgs = yaml.safe_load(f)

    # safe_load yields None for an empty document; fail with a clear message
    # instead of a TypeError from len() further down.
    if not isinstance(orgs, list):
        raise SystemExit(
            f"Expected a list of organizations in {data_path.name}, "
            f"got {type(orgs).__name__}"
        )

    print(f"Total organizations: {len(orgs)}")
    print()

    # Create backup as a byte-for-byte copy of the file on disk, so comments
    # and the original formatting survive (a yaml.dump round-trip loses them).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    backup_path = data_path.parent / f"{data_path.stem}.backup.{timestamp}.yaml"
    print(f"Creating backup: {backup_path.name}")
    shutil.copy2(data_path, backup_path)
    print()

    # Update records
    updated_count = 0
    already_enriched = 0
    no_match_count = 0

    print("Updating records:")
    print("-" * 80)

    for position, org in enumerate(orgs, start=1):
        # Skip malformed entries rather than crashing on .get().
        if not isinstance(org, dict):
            continue

        org_name = org.get('organisatie', '')
        if not isinstance(org_name, str):
            continue

        # Add Q-numbers for matches
        if org_name in BATCH_3_MAPPING:
            q_number = BATCH_3_MAPPING[org_name]

            if org.get('wikidata_id'):
                # Never overwrite an ID that is already present.
                already_enriched += 1
                print(f"{position:3d}. ○ {org_name[:50]:50s} (already: {org['wikidata_id']})")
            else:
                org['wikidata_id'] = q_number
                updated_count += 1
                print(f"{position:3d}. ✓ {org_name[:50]:50s} → {q_number}")

        # Mark no-matches (only when no status has been recorded yet)
        elif org_name in NO_MATCH_ORGS:
            if not org.get('wikidata_enrichment_status'):
                org['wikidata_enrichment_status'] = 'no_match_found'
                no_match_count += 1
                print(f"{position:3d}. ✗ {org_name[:50]:50s} (no match)")

    print()
    print("=" * 80)

    # Save if updates made
    total_changes = updated_count + no_match_count
    if total_changes > 0:
        print(f"Saving {total_changes} updates to {data_path.name}...")
        with open(data_path, 'w', encoding='utf-8') as f:
            yaml.dump(orgs, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print("✅ Saved successfully!")
    else:
        print("⚠️ No new updates to save")

    # Batch summary; the success rate counts only records enriched in this run.
    batch_size = len(BATCH_3_MAPPING) + len(NO_MATCH_ORGS)
    success_pct = updated_count / batch_size * 100 if batch_size else 0.0

    print()
    print("=" * 80)
    print("BATCH 3 SUMMARY")
    print("=" * 80)
    print(f"Organizations processed: {batch_size}")
    print(f" Newly enriched: {updated_count}")
    print(f" No matches found: {no_match_count}")
    print(f" Already enriched: {already_enriched}")
    print()
    print(f"Batch 3 success rate: {updated_count}/{batch_size} ({success_pct:.1f}%)")
    print()

    # Calculate overall progress (guarded against an empty organisation list)
    total_enriched = sum(
        1 for o in orgs if isinstance(o, dict) and o.get('wikidata_id')
    )
    progress_pct = (total_enriched / len(orgs)) * 100 if orgs else 0.0

    print(f"OVERALL PROGRESS: {total_enriched}/{len(orgs)} ({progress_pct:.1f}%)")
    print(f"Remaining records: {len(orgs) - total_enriched}")
    print()
|
|
|
|
# Script entry point: run the batch update only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|