glam/scripts/link_entities_to_wikidata.py
2025-12-14 17:09:55 +01:00

94 lines
3.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Link extracted entities to Wikidata.
This script finds high-frequency entities and attempts to link them to Wikidata
using the Wikidata MCP server's search functionality.
"""
import yaml
import json
from pathlib import Path
from collections import Counter
from datetime import datetime
import sys
def collect_entities(custodian_dir: Path, entity_types: list[str], min_count: int = 2) -> dict[str, Counter]:
    """Count entity mentions per entity type across custodian YAML files.

    Scans every ``NL-*.yaml`` file directly inside *custodian_dir*, reads the
    list stored under ``validated_entity_claims -> claims`` and tallies, for
    each requested entity type, how many claims mention each entity name.

    Files that cannot be read or parsed are skipped with a warning on stderr;
    claim entries that are not mappings, or that have a non-string or empty
    ``entity`` name, are ignored individually so that one malformed entry
    does not discard the remaining claims of the same file.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
        entity_types: Entity type codes to collect; claims of any other type
            are ignored.
        min_count: Minimum number of claims required for a name to be kept.

    Returns:
        A dict mapping each requested entity type to a Counter of
        ``entity name -> claim count``, restricted to names counted at least
        *min_count* times.
    """
    entity_by_type: dict[str, Counter] = {t: Counter() for t in entity_types}

    # Sorted so that Counter insertion order (which decides most_common()
    # tie-breaking) does not depend on filesystem enumeration order.
    for path in sorted(custodian_dir.glob('NL-*.yaml')):
        try:
            with open(path, encoding='utf-8') as fh:
                data = yaml.safe_load(fh)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Best-effort scan: report the bad file and carry on.
            print(f"Warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            continue
        claims = data.get('validated_entity_claims')
        if not isinstance(claims, dict):
            continue
        entries = claims.get('claims')
        if not isinstance(entries, list):
            continue

        for entry in entries:
            if not isinstance(entry, dict):
                continue
            etype = entry.get('entity_type', '')
            ename = entry.get('entity', '')
            if not isinstance(etype, str) or etype not in entity_by_type:
                continue
            # An empty name cannot be linked to anything, so do not count it.
            if not isinstance(ename, str) or not ename:
                continue
            entity_by_type[etype][ename] += 1

    # Keep only the names that reach the minimum count.
    return {
        etype: Counter({name: n for name, n in counts.items() if n >= min_count})
        for etype, counts in entity_by_type.items()
    }
def main():
    """Collect frequent entities and save them as Wikidata link candidates.

    Reads custodian files from ``data/custodian``, takes up to the 50 most
    frequent entities (seen at least twice) for each configured entity type,
    and writes them, with empty Wikidata fields and a ``pending`` link
    status, to ``reports/wikidata_link_candidates.yaml``. A summary is
    printed to stdout.

    Returns:
        The list of candidate dicts that was written to the report.
    """
    custodian_dir = Path('data/custodian')

    # Entity types to link
    entity_types = [
        'GRP.HER',  # Heritage institutions -> Q-items
        'TOP.SET',  # Settlements -> Q-items
        'AGT.PER',  # Persons -> Q-items
        'GRP.ASS',  # Associations -> Q-items
        'GRP.GOV',  # Government orgs -> Q-items
    ]

    print("Collecting entities from custodian files...")
    entities = collect_entities(custodian_dir, entity_types, min_count=2)

    # Create candidates for linking
    candidates = []
    for etype, counter in entities.items():
        for name, count in counter.most_common(50):  # Top 50 per type
            candidates.append({
                'entity_name': name,
                'entity_type': etype,
                'occurrence_count': count,
                'wikidata_id': None,
                'wikidata_label': None,
                'link_status': 'pending'
            })

    print(f"\nFound {len(candidates)} candidate entities for Wikidata linking")
    print("\nBreakdown by type:")
    # Tally once instead of rescanning the candidate list for every type;
    # a Counter yields 0 for types that produced no candidates.
    per_type = Counter(c['entity_type'] for c in candidates)
    for etype in entity_types:
        print(f" {etype}: {per_type[etype]}")

    # Save candidates
    output_file = Path('reports/wikidata_link_candidates.yaml')
    # The reports directory may not exist yet on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them on every platform.
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump({
            'generated': datetime.now().isoformat(),
            'total_candidates': len(candidates),
            'candidates': candidates
        }, f, allow_unicode=True, default_flow_style=False)

    print(f"\nSaved candidates to {output_file}")
    print("\nTo link entities, use the Wikidata MCP server:")
    print(" wikidata-authenticated_search_entity(query='Amsterdam')")
    return candidates
# Run the candidate collection only when executed as a script, not on import.
if __name__ == "__main__":
    main()