glam/scripts/update_nde_yaml_with_wikidata_test_batch.py
2025-11-19 23:25:22 +01:00

125 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
Update NDE YAML file with Wikidata IDs - Test Batch (First 10 Records)
This script adds wikidata_id fields to the first 10 records in the NDE YAML file
based on the enrichment results from our test batch.
Test Batch Results:
1. Stichting Herinneringscentrum Kamp Westerbork → Q22246632
2. Stichting Hunebedcentrum → Q2679819
3. Regionaal Historisch Centrum (RHC) Drents Archief → Q1978308
4. Stichting Drents Museum → Q1258370
5. Stichting Drents Museum De Buitenplaats → No match
6. Gemeente Aa en Hunze → Q300665
7. Gemeente Borger-Odoorn → Q835118
8. Gemeente Coevorden → Q60453
9. Gemeente De Wolden → Q835108
10. Samenwerkingsorganisatie De Wolden/Hoogeveen → No match
Success Rate: 8/10 (80%)
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone
import json
# Organisation name (as it appears under the 'organisatie' key of a record)
# mapped to its Wikidata Q-identifier for the ten-record test batch.
# A value of None marks an organisation that was looked up but had no match.
# Insertion order is kept as-is because the mapping is copied into the
# enrichment log.
WIKIDATA_MAPPING = {
    "Stichting Herinneringscentrum Kamp Westerbork": "Q22246632",
    "Stichting Hunebedcentrum": "Q2679819",
    "Regionaal Historisch Centrum (RHC) Drents Archief": "Q1978308",
    "Stichting Drents Museum": "Q1258370",
    # Looked up, nothing found.
    "Stichting Drents Museum De Buitenplaats": None,
    "Gemeente Aa en Hunze": "Q300665",
    "Gemeente Borger-Odoorn": "Q835118",
    "Gemeente Coevorden": "Q60453",
    "Gemeente De Wolden": "Q835108",
    # Looked up, nothing found.
    "Samenwerkingsorganisatie De Wolden/Hoogeveen": None,
}
def main():
    """Add Wikidata IDs to the first records of the NDE YAML file.

    Steps, in order:

    1. Load the organisation list from the YAML file under ``data/nde``
       (resolved relative to this script's parent directory).
    2. Write a timestamped byte-for-byte backup of that file next to it.
    3. Annotate the first ``batch_size`` records using ``WIKIDATA_MAPPING``.
    4. Overwrite the YAML file with the updated list.
    5. Write a JSON enrichment log under ``data/nde/sparql`` and print a
       summary.

    Raises:
        ValueError: if the YAML document is not a list of records.
    """
    batch_size = 10

    # One timestamp for the whole run so the backup and the log written by
    # the same invocation carry matching names.
    run_time = datetime.now(timezone.utc)
    file_stamp = run_time.strftime('%Y%m%d_%H%M%S')

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "nde"
    yaml_path = data_dir / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    backup_path = data_dir / f"voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.{file_stamp}.yaml"
    log_path = data_dir / "sparql" / f"enrichment_log_test_batch_{file_stamp}.json"

    print(f"Reading YAML file: {yaml_path}")

    # Load YAML file
    with open(yaml_path, 'r', encoding='utf-8') as f:
        organizations = yaml.safe_load(f)

    # Fail early with a clear message, before any file is written, instead
    # of an opaque TypeError from len()/slicing further down.
    if not isinstance(organizations, list):
        raise ValueError(
            f"Expected a list of organizations in {yaml_path}, "
            f"got {type(organizations).__name__}"
        )

    print(f"Loaded {len(organizations)} organizations")

    # Create backup as an exact copy of the file on disk; re-dumping the
    # parsed data would drop comments and the original formatting.
    print(f"Creating backup: {backup_path}")
    backup_path.write_bytes(yaml_path.read_bytes())

    # Update the first records with Wikidata IDs. The slice may be shorter
    # than batch_size, so all statistics use the real batch length.
    batch = organizations[:batch_size]
    enriched_count, no_match_count = _apply_wikidata_ids(batch, WIKIDATA_MAPPING)
    processed_count = len(batch)
    success_pct = enriched_count / processed_count * 100 if processed_count else 0.0

    # Make sure the log directory exists before the source file is
    # overwritten, so a missing directory cannot abort the run half-way.
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Save updated YAML
    print(f"\nSaving updated YAML to: {yaml_path}")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(organizations, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Create enrichment log
    enrichment_log = {
        "enrichment_date": run_time.isoformat(),
        "enrichment_method": "Wikidata MCP service - manual search and SPARQL queries",
        "records_processed": processed_count,
        "records_enriched": enriched_count,
        "records_no_match": no_match_count,
        "success_rate": f"{success_pct:.1f}%",
        "wikidata_mapping": dict(WIKIDATA_MAPPING),
        "notes": [
            "Test batch of first 10 records from NDE CSV",
            "Used Wikidata search API and SPARQL queries for verification",
            "No match found for 'De Buitenplaats' museum (branch location, likely not in Wikidata)",
            "No match found for 'Samenwerkingsorganisatie' (collaborative organization, likely not in Wikidata)"
        ]
    }

    print(f"\nCreating enrichment log: {log_path}")
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(enrichment_log, f, indent=2, ensure_ascii=False)

    print("\n" + "="*60)
    print("ENRICHMENT COMPLETE")
    print("="*60)
    print(f"Records processed: {processed_count}")
    print(f"Records enriched: {enriched_count}")
    print(f"Records with no match: {no_match_count}")
    print(f"Success rate: {success_pct:.1f}%")
    print(f"\nBackup saved to: {backup_path}")
    print(f"Enrichment log: {log_path}")


def _apply_wikidata_ids(batch, mapping):
    """Annotate the records in *batch* in place and report the outcome.

    For each record the name is read from its 'organisatie' key:

    * name mapped to an ID   -> sets ``wikidata_id`` on the record;
    * name mapped to ``None`` -> sets ``wikidata_enrichment_status`` to
      ``'no_match_found'``;
    * name not in *mapping*  -> record is left unchanged.

    A progress line is printed for every record.

    Args:
        batch: list of record dicts to annotate.
        mapping: organisation name -> Wikidata ID, or ``None`` for no match.

    Returns:
        Tuple ``(enriched_count, no_match_count)``.
    """
    enriched_count = 0
    no_match_count = 0

    for position, org in enumerate(batch, start=1):
        org_name = org.get('organisatie', '')

        if org_name not in mapping:
            print(f"{position}. ? {org_name} → Not in mapping")
            continue

        wikidata_id = mapping[org_name]
        if wikidata_id:
            org['wikidata_id'] = wikidata_id
            enriched_count += 1
            print(f"{position}. ✓ {org_name} → {wikidata_id}")
        else:
            # Add enrichment note for records with no match
            org['wikidata_enrichment_status'] = 'no_match_found'
            no_match_count += 1
            print(f"{position}. ✗ {org_name} → No match")

    return enriched_count, no_match_count
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()