125 lines
4.8 KiB
Python
125 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Update NDE YAML file with Wikidata IDs - Test Batch (First 10 Records)

This script adds wikidata_id fields to the first 10 records in the NDE YAML file
based on the enrichment results from our test batch.

Test Batch Results:
1. Stichting Herinneringscentrum Kamp Westerbork → Q22246632
2. Stichting Hunebedcentrum → Q2679819
3. Regionaal Historisch Centrum (RHC) Drents Archief → Q1978308
4. Stichting Drents Museum → Q1258370
5. Stichting Drents Museum De Buitenplaats → No match
6. Gemeente Aa en Hunze → Q300665
7. Gemeente Borger-Odoorn → Q835118
8. Gemeente Coevorden → Q60453
9. Gemeente De Wolden → Q835108
10. Samenwerkingsorganisatie De Wolden/Hoogeveen → No match

Success Rate: 8/10 (80%)
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import json
|
|
|
|
# Lookup table for the test batch (first 10 records): organisation name as it
# appears in the NDE list -> Wikidata QID. A value of None marks a record that
# was searched for but has no matching Wikidata entity. Insertion order follows
# the order of the records in the batch.
WIKIDATA_MAPPING = dict(
    (
        ("Stichting Herinneringscentrum Kamp Westerbork", "Q22246632"),
        ("Stichting Hunebedcentrum", "Q2679819"),
        ("Regionaal Historisch Centrum (RHC) Drents Archief", "Q1978308"),
        ("Stichting Drents Museum", "Q1258370"),
        ("Stichting Drents Museum De Buitenplaats", None),  # no match found
        ("Gemeente Aa en Hunze", "Q300665"),
        ("Gemeente Borger-Odoorn", "Q835118"),
        ("Gemeente Coevorden", "Q60453"),
        ("Gemeente De Wolden", "Q835108"),
        ("Samenwerkingsorganisatie De Wolden/Hoogeveen", None),  # no match found
    )
)
|
|
|
|
def main():
    """Add Wikidata IDs to the first records of the NDE YAML file.

    Steps, in order:

    1. Load the organisation list from the NDE YAML file.
    2. Save a timestamped, byte-for-byte backup copy next to it.
    3. For each of the first 10 records, look up its ``organisatie`` value in
       ``WIKIDATA_MAPPING``: a QID is stored under ``wikidata_id``; a ``None``
       entry sets ``wikidata_enrichment_status`` to ``'no_match_found'``;
       names missing from the mapping are only reported.
    4. Write the updated list back to the original path.
    5. Write a JSON enrichment log into the ``sparql`` sub-directory.

    Raises:
        TypeError: If the YAML document is not a list (e.g. an empty file).
        OSError: If a file cannot be read or written.
    """
    # Size of the test batch; the mapping only covers these leading records.
    batch_size = 10

    # One timestamp per run so the backup name, log name and log date agree.
    run_time = datetime.now(timezone.utc)
    stamp = run_time.strftime('%Y%m%d_%H%M%S')

    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "nde"
    yaml_path = data_dir / "voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.yaml"
    backup_path = data_dir / f"voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.backup.{stamp}.yaml"
    log_path = data_dir / "sparql" / f"enrichment_log_test_batch_{stamp}.json"

    print(f"Reading YAML file: {yaml_path}")

    # Load YAML file
    with open(yaml_path, 'r', encoding='utf-8') as f:
        organizations = yaml.safe_load(f)

    # safe_load returns None for an empty document; fail before touching any
    # file rather than crashing later with an unrelated error.
    if not isinstance(organizations, list):
        raise TypeError(
            f"Expected a list of organizations in {yaml_path}, "
            f"got {type(organizations).__name__}"
        )

    print(f"Loaded {len(organizations)} organizations")

    # Create backup as an exact copy of the file on disk. Re-serialising the
    # parsed data instead would drop comments and the original formatting.
    print(f"Creating backup: {backup_path}")
    backup_path.write_bytes(yaml_path.read_bytes())

    # Update the first records with Wikidata IDs. The slice holds the same
    # dict objects as `organizations`, so the edits land in the full list.
    batch = organizations[:batch_size]
    enriched_count = 0
    no_match_count = 0

    for position, org in enumerate(batch, start=1):
        org_name = org.get('organisatie', '')

        if org_name not in WIKIDATA_MAPPING:
            print(f"{position}. ? {org_name} → Not in mapping")
            continue

        wikidata_id = WIKIDATA_MAPPING[org_name]
        if wikidata_id:
            org['wikidata_id'] = wikidata_id
            enriched_count += 1
            print(f"{position}. ✓ {org_name} → {wikidata_id}")
        else:
            # Add enrichment note for records with no match
            org['wikidata_enrichment_status'] = 'no_match_found'
            no_match_count += 1
            print(f"{position}. ✗ {org_name} → No match")

    # The file may hold fewer records than the batch size, so report what was
    # actually processed and guard the rate against an empty batch.
    processed_count = len(batch)
    success_pct = enriched_count / processed_count * 100 if processed_count else 0.0

    # Save updated YAML
    print(f"\nSaving updated YAML to: {yaml_path}")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(organizations, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Create enrichment log
    enrichment_log = {
        "enrichment_date": run_time.isoformat(),
        "enrichment_method": "Wikidata MCP service - manual search and SPARQL queries",
        "records_processed": processed_count,
        "records_enriched": enriched_count,
        "records_no_match": no_match_count,
        "success_rate": f"{success_pct:.1f}%",
        "wikidata_mapping": dict(WIKIDATA_MAPPING),
        "notes": [
            "Test batch of first 10 records from NDE CSV",
            "Used Wikidata search API and SPARQL queries for verification",
            "No match found for 'De Buitenplaats' museum (branch location, likely not in Wikidata)",
            "No match found for 'Samenwerkingsorganisatie' (collaborative organization, likely not in Wikidata)",
        ],
    }

    print(f"\nCreating enrichment log: {log_path}")
    # The log directory may not exist yet on a fresh checkout.
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(log_path, 'w', encoding='utf-8') as f:
        json.dump(enrichment_log, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 60)
    print("ENRICHMENT COMPLETE")
    print("=" * 60)
    print(f"Records processed: {processed_count}")
    print(f"Records enriched: {enriched_count}")
    print(f"Records with no match: {no_match_count}")
    print(f"Success rate: {success_pct:.1f}%")
    print(f"\nBackup saved to: {backup_path}")
    print(f"Enrichment log: {log_path}")
|
|
|
|
# Run the enrichment only when this file is executed as a script, so that
# importing the module has no side effects on the data files.
if __name__ == "__main__":
    main()
|