glam/scripts/apply_wikidata_links.py
2025-12-14 17:09:55 +01:00

128 lines
4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Apply Wikidata links to extracted entities in custodian files.
This script reads the wikidata_entity_links.yaml reference file and
updates custodian files to add wikidata_id to matching entities.
"""
import yaml
from pathlib import Path
from datetime import datetime
import sys
import re
# Use C-based yaml loader for speed
try:
from yaml import CSafeLoader as SafeLoader, CSafeDumper as SafeDumper
except ImportError:
from yaml import SafeLoader, SafeDumper
def load_wikidata_links(ref_file: Path) -> dict:
    """Load Wikidata entity links from the reference file into one lookup.

    The reference YAML is read category by category (``countries``,
    ``provinces``, ...). Inside each category every entity name maps to an
    info mapping, which may carry an ``aliases`` entry.

    Args:
        ref_file: Path to the YAML reference file.

    Returns:
        A dict mapping lower-cased entity names and aliases to the info
        mapping of the entity they belong to. A primary name always
        replaces an existing key; an alias is only registered when its key
        is still free, so aliases never shadow a primary name that was
        already loaded. Malformed entries (non-string names, non-mapping
        info, non-string aliases) are skipped instead of aborting the load.
    """
    categories = (
        'countries', 'provinces', 'regions', 'settlements',
        'heritage_institutions', 'historical_events',
        'persons', 'associations', 'government',
    )

    # Decode explicitly as UTF-8: the platform default encoding is not
    # UTF-8 everywhere, and names may contain non-ASCII characters.
    with open(ref_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    # Flatten all categories into a single lookup dict.
    links = {}

    # An empty YAML document loads as None; anything but a mapping has no
    # categories to read.
    if not isinstance(data, dict):
        return links

    for category in categories:
        entries = data.get(category)
        # A category key with no content loads as None.
        if not isinstance(entries, dict):
            continue

        for name, info in entries.items():
            if not isinstance(name, str) or not isinstance(info, dict):
                continue

            # Primary name
            links[name.lower()] = info

            # Aliases, if present. A single scalar alias is treated as a
            # one-element list so it is not iterated character by character.
            aliases = info.get('aliases') or []
            if isinstance(aliases, str):
                aliases = [aliases]
            elif not isinstance(aliases, (list, tuple)):
                aliases = []

            for alias in aliases:
                if isinstance(alias, str):
                    # setdefault keeps whichever entry claimed the key first.
                    links.setdefault(alias.lower(), info)

    return links
def apply_links_to_file(custodian_file: Path, links: dict) -> tuple[int, int]:
    """Apply Wikidata links to entities in a custodian file.

    Reads ``validated_entity_claims.claims`` from the YAML file and, for
    every claim whose ``entity`` name (lower-cased) is a key of ``links``
    and which has no ``wikidata_id`` yet, sets ``wikidata_id`` and
    ``wikidata_label``. The file is rewritten only when at least one claim
    was linked; in that case ``wikidata_linking_timestamp`` is also set on
    the ``validated_entity_claims`` mapping.

    Args:
        custodian_file: Path to the custodian YAML file to update in place.
        links: Lookup from lower-cased entity name to an info mapping that
            provides ``wikidata_id`` and optionally ``label``.

    Returns:
        ``(entities_checked, entities_linked)``. Every item of the claims
        list counts as checked, including malformed ones that are skipped.
    """
    # Explicit UTF-8 for reading and writing: the dump below uses
    # allow_unicode=True, so non-ASCII text is written verbatim and must
    # not depend on the platform default encoding.
    with open(custodian_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    if not isinstance(data, dict):
        return 0, 0

    claims = data.get('validated_entity_claims')
    if not isinstance(claims, dict):
        return 0, 0

    claim_list = claims.get('claims')
    if not isinstance(claim_list, list) or not claim_list:
        return 0, 0

    entities_checked = 0
    entities_linked = 0

    for entity in claim_list:
        entities_checked += 1

        # Skip malformed claims rather than failing the whole file.
        if not isinstance(entity, dict):
            continue

        name = entity.get('entity', '')
        if not isinstance(name, str):
            continue

        # Never overwrite a link that is already present.
        if 'wikidata_id' in entity:
            continue

        link_info = links.get(name.lower())
        if not isinstance(link_info, dict) or 'wikidata_id' not in link_info:
            continue

        entity['wikidata_id'] = link_info['wikidata_id']
        entity['wikidata_label'] = link_info.get('label', name)
        entities_linked += 1

    if entities_linked:
        # Update timestamp (naive local time, ISO 8601).
        claims['wikidata_linking_timestamp'] = datetime.now().isoformat()

        # Serialise to a string first: opening the file with 'w' truncates
        # it, so a dump error after that point would destroy the original.
        serialised = yaml.dump(
            data,
            Dumper=SafeDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        with open(custodian_file, 'w', encoding='utf-8') as f:
            f.write(serialised)

    return entities_checked, entities_linked
def main():
    """Link entities in every ``NL-*.yaml`` custodian file and print a summary.

    The reference file and the custodian directory are resolved relative to
    the current working directory. A failure in one custodian file is
    reported on stderr and counted, and processing continues with the next
    file.
    """
    ref_file = Path('data/reference/wikidata_entity_links.yaml')
    custodian_dir = Path('data/custodian')

    print("Loading Wikidata entity links...")
    links = load_wikidata_links(ref_file)
    print(f" Loaded {len(links)} entity mappings")

    print("\nApplying links to custodian files...")
    total_checked = 0
    total_linked = 0
    files_modified = 0
    files_failed = 0

    files = sorted(custodian_dir.glob('NL-*.yaml'))
    total_files = len(files)

    for i, path in enumerate(files):
        # Progress line every 200 files.
        if i % 200 == 0:
            print(f" Processing {i}/{total_files}...")
        try:
            checked, linked = apply_links_to_file(path, links)
        except Exception as e:
            # Best effort per file: report on stderr so errors are not lost
            # among the progress lines, then keep going.
            files_failed += 1
            print(f" Error processing {path.name}: {e}", file=sys.stderr)
            continue

        total_checked += checked
        total_linked += linked
        if linked > 0:
            files_modified += 1

    print("\n=== WIKIDATA LINKING COMPLETE ===")
    print(f"Files processed: {total_files}")
    print(f"Files modified: {files_modified}")
    print(f"Files failed: {files_failed}")
    print(f"Entities checked: {total_checked}")
    print(f"Entities linked: {total_linked}")
# Run the linking pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()