128 lines
4 KiB
Python
Executable file
128 lines
4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Apply Wikidata links to extracted entities in custodian files.
|
|
|
|
This script reads the wikidata_entity_links.yaml reference file and
|
|
updates custodian files to add wikidata_id to matching entities.
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import sys
|
|
import re
|
|
|
|
# Use C-based yaml loader for speed
|
|
try:
|
|
from yaml import CSafeLoader as SafeLoader, CSafeDumper as SafeDumper
|
|
except ImportError:
|
|
from yaml import SafeLoader, SafeDumper
|
|
|
|
def load_wikidata_links(ref_file: Path) -> dict:
    """Load Wikidata entity links from the reference file into one lookup dict.

    The top level of the YAML file is read as a mapping of category name to
    a mapping of ``entity name -> info``. Every known category is flattened
    into a single dict keyed by the lower-cased entity name. Names listed
    under an info mapping's ``aliases`` key are added as additional keys that
    point at the same info object, unless that key is already taken.

    Args:
        ref_file: Path to the YAML reference file.

    Returns:
        Dict mapping lower-cased names and aliases to their info mapping.
        Empty when the file is empty or contains none of the known
        categories.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be UTF-8 and entity names may contain non-ASCII text.
    with open(ref_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    # Flatten into a single lookup dict
    links = {}
    if not isinstance(data, dict):
        # Empty file (None) or an unexpected top-level type: nothing to index.
        return links

    categories = [
        'countries', 'provinces', 'regions', 'settlements',
        'heritage_institutions', 'historical_events',
        'persons', 'associations', 'government'
    ]

    for category in categories:
        entries = data.get(category)
        if not isinstance(entries, dict):
            # Category missing, or present but empty/null in the YAML.
            continue

        for name, info in entries.items():
            if not isinstance(info, dict):
                # An entry without a mapping cannot carry a wikidata_id.
                continue

            # Primary name. YAML keys are not always strings (e.g. a bare
            # number), so normalise through str() before lower-casing.
            links[str(name).lower()] = info

            # Also add aliases if present; `aliases: null` counts as none
            # and a single scalar alias is treated as a one-element list.
            aliases = info.get('aliases') or ()
            if isinstance(aliases, str):
                aliases = (aliases,)
            for alias in aliases:
                # Never let an alias displace a key that is already mapped.
                links.setdefault(str(alias).lower(), info)

    return links
|
|
|
|
def apply_links_to_file(custodian_file: Path, links: dict) -> tuple[int, int]:
    """Apply Wikidata links to entities in a custodian file.

    Reads the YAML file, walks ``validated_entity_claims.claims`` and, for
    every entity whose lower-cased ``entity`` name is a key of ``links`` and
    which has no ``wikidata_id`` key yet, sets ``wikidata_id`` and
    ``wikidata_label``. When at least one entity was linked, a
    ``wikidata_linking_timestamp`` is stored on the claims mapping and the
    file is rewritten in place.

    Args:
        custodian_file: Path to the custodian YAML file to update.
        links: Lookup of lower-cased entity name to an info mapping that
            provides ``wikidata_id`` and optionally ``label``.

    Returns:
        (entities_checked, entities_linked). ``(0, 0)`` when the file has no
        usable ``validated_entity_claims.claims`` list.
    """
    # Read as UTF-8 explicitly; the platform default may differ.
    with open(custodian_file, encoding='utf-8') as f:
        data = yaml.load(f, Loader=SafeLoader)

    if not isinstance(data, dict):
        return 0, 0

    claims = data.get('validated_entity_claims')
    if not isinstance(claims, dict):
        return 0, 0

    entities = claims.get('claims')
    if not isinstance(entities, list) or not entities:
        return 0, 0

    entities_checked = 0
    entities_linked = 0

    for entity in entities:
        entities_checked += 1

        # A malformed list item must not abort the whole file.
        if not isinstance(entity, dict):
            continue

        name = entity.get('entity', '')
        if not isinstance(name, str):
            continue

        # Entities that already carry a wikidata_id key are left untouched.
        if 'wikidata_id' in entity:
            continue

        # Check if we have a Wikidata link for this entity
        link_info = links.get(name.lower())
        if not isinstance(link_info, dict):
            continue

        # A reference entry without an id has nothing to link to.
        wikidata_id = link_info.get('wikidata_id')
        if not wikidata_id:
            continue

        entity['wikidata_id'] = wikidata_id
        entity['wikidata_label'] = link_info.get('label', name)
        entities_linked += 1

    if entities_linked:
        # Update timestamp
        claims['wikidata_linking_timestamp'] = datetime.now().isoformat()

        # Serialise fully before opening the target for writing: opening
        # with 'w' truncates the file, so a failure during dumping would
        # otherwise leave an empty or half-written custodian file behind.
        serialized = yaml.dump(
            data,
            Dumper=SafeDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
        # allow_unicode=True emits raw non-ASCII characters, so the output
        # encoding has to be UTF-8 regardless of the platform default.
        with open(custodian_file, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return entities_checked, entities_linked
|
|
|
|
def main():
    """Link entities in every ``NL-*.yaml`` custodian file and print a summary.

    Both input paths are relative to the current working directory. The
    reference links are loaded once, then each custodian file is processed
    in sorted order with a progress line every 200 files. A failure in one
    file is reported on stderr and counted, and processing continues with
    the next file.
    """
    ref_file = Path('data/reference/wikidata_entity_links.yaml')
    custodian_dir = Path('data/custodian')

    print("Loading Wikidata entity links...")
    links = load_wikidata_links(ref_file)
    print(f" Loaded {len(links)} entity mappings")

    print("\nApplying links to custodian files...")

    total_checked = 0
    total_linked = 0
    files_modified = 0
    files_failed = 0

    files = sorted(custodian_dir.glob('NL-*.yaml'))
    total_files = len(files)

    for i, custodian_file in enumerate(files):
        if i % 200 == 0:
            print(f" Processing {i}/{total_files}...")

        try:
            checked, linked = apply_links_to_file(custodian_file, links)
        except Exception as e:
            # Best-effort batch run: report the failure and keep going, but
            # remember it so the summary does not look like a clean run.
            files_failed += 1
            print(f" Error processing {custodian_file.name}: {e}", file=sys.stderr)
            continue

        total_checked += checked
        total_linked += linked
        if linked > 0:
            files_modified += 1

    print("\n=== WIKIDATA LINKING COMPLETE ===")
    print(f"Files processed: {total_files}")
    print(f"Files modified: {files_modified}")
    print(f"Entities checked: {total_checked}")
    print(f"Entities linked: {total_linked}")
    if files_failed:
        # Only shown on failure so a clean run prints the same summary as before.
        print(f"Files with errors: {files_failed}")
|
|
|
|
# Script entry point: run the linker only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|