#!/usr/bin/env python3
"""
Apply Wikidata links to extracted entities in custodian files.

This script reads the wikidata_entity_links.yaml reference file and
updates custodian files to add wikidata_id to matching entities.
"""

import re
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Use C-based yaml loader for speed
try:
    from yaml import CSafeLoader as SafeLoader, CSafeDumper as SafeDumper
except ImportError:
    from yaml import SafeLoader, SafeDumper


# Top-level sections of the reference file that are scanned for entities.
LINK_CATEGORIES = (
    'countries', 'provinces', 'regions', 'settlements',
    'heritage_institutions', 'historical_events', 'persons',
    'associations', 'government',
)


def _build_link_index(data) -> dict:
    """Flatten parsed reference data into a lowercase-name -> info lookup.

    Only the sections named in LINK_CATEGORIES are read. Malformed parts
    (non-mapping document or section, non-string names, non-mapping
    entries, non-string aliases) are skipped instead of raising.
    """
    links = {}
    if not isinstance(data, dict):
        # Empty file (None) or a document that is not a mapping.
        return links

    for category in LINK_CATEGORIES:
        entries = data.get(category)
        if not isinstance(entries, dict):
            continue
        for name, info in entries.items():
            if not isinstance(name, str) or not isinstance(info, dict):
                continue

            # A primary name always wins, even over an alias registered
            # earlier by another entry.
            links[name.lower()] = info

            aliases = info.get('aliases')
            if isinstance(aliases, str):
                # A single scalar alias; iterating it would yield characters.
                aliases = [aliases]
            if not isinstance(aliases, (list, tuple)):
                continue
            for alias in aliases:
                if isinstance(alias, str):
                    # Aliases never override an existing mapping.
                    links.setdefault(alias.lower(), info)

    return links


def _link_entities(entities, links: dict) -> tuple[int, int]:
    """Add Wikidata fields, in place, to entities that match *links*.

    An entity is linked when its 'entity' value is a string whose
    lowercase form is a key of *links*, it has no 'wikidata_id' yet, and
    the matched info provides a 'wikidata_id'.

    Returns (entities_checked, entities_linked); every item of *entities*
    counts as checked, including items that are skipped.
    """
    checked = 0
    linked = 0
    for entity in entities:
        checked += 1
        if not isinstance(entity, dict):
            continue
        name = entity.get('entity', '')
        if not isinstance(name, str) or 'wikidata_id' in entity:
            continue

        link_info = links.get(name.lower())
        if not isinstance(link_info, dict) or 'wikidata_id' not in link_info:
            # No mapping, or a reference entry without an id: nothing to add.
            continue

        entity['wikidata_id'] = link_info['wikidata_id']
        entity['wikidata_label'] = link_info.get('label', name)
        linked += 1

    return checked, linked


def load_wikidata_links(ref_file: Path) -> dict:
    """Load Wikidata entity links from reference file.

    Returns a dict keyed by lowercase entity name or alias; each value is
    the info mapping of the corresponding reference entry.
    """
    # Read as UTF-8 explicitly instead of depending on the locale default.
    with open(ref_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    return _build_link_index(data)


def apply_links_to_file(custodian_file: Path, links: dict) -> tuple[int, int]:
    """Apply Wikidata links to entities in a custodian file.

    The file is rewritten only when at least one entity was linked; in
    that case 'wikidata_linking_timestamp' is also set on the
    'validated_entity_claims' mapping.

    Returns (entities_checked, entities_linked)
    """
    with open(custodian_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    if not isinstance(data, dict):
        return 0, 0

    claims = data.get('validated_entity_claims')
    if not isinstance(claims, dict):
        return 0, 0

    entities = claims.get('claims')
    if not entities or not isinstance(entities, list):
        return 0, 0

    entities_checked, entities_linked = _link_entities(entities, links)

    if entities_linked:
        # Update timestamp (naive local time, as produced by datetime.now()).
        claims['wikidata_linking_timestamp'] = datetime.now().isoformat()

        # Serialize before opening the file for writing, so a dump error
        # cannot leave a truncated custodian file behind.
        text = yaml.dump(
            data,
            Dumper=SafeDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        # allow_unicode=True emits raw non-ASCII characters, so the file
        # must be written as UTF-8 regardless of the locale.
        with open(custodian_file, 'w', encoding='utf-8') as f:
            f.write(text)

    return entities_checked, entities_linked


def main():
    """Link entities in every NL-*.yaml custodian file and print a summary."""
    ref_file = Path('data/reference/wikidata_entity_links.yaml')
    custodian_dir = Path('data/custodian')

    print("Loading Wikidata entity links...")
    links = load_wikidata_links(ref_file)
    print(f"  Loaded {len(links)} entity mappings")

    print("\nApplying links to custodian files...")

    total_checked = 0
    total_linked = 0
    files_modified = 0

    files = sorted(custodian_dir.glob('NL-*.yaml'))
    total_files = len(files)

    for i, f in enumerate(files):
        if i % 200 == 0:
            print(f"  Processing {i}/{total_files}...")

        # Top-level boundary: report a failing file and carry on with the rest.
        try:
            checked, linked = apply_links_to_file(f, links)
        except Exception as e:
            print(f"  Error processing {f.name}: {e}")
            continue

        total_checked += checked
        total_linked += linked
        if linked > 0:
            files_modified += 1

    print("\n=== WIKIDATA LINKING COMPLETE ===")
    print(f"Files processed: {total_files}")
    print(f"Files modified: {files_modified}")
    print(f"Entities checked: {total_checked}")
    print(f"Entities linked: {total_linked}")


if __name__ == '__main__':
    main()