94 lines
3.2 KiB
Python
Executable file
94 lines
3.2 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Link extracted entities to Wikidata.
|
|
|
|
This script finds high-frequency entities and attempts to link them to Wikidata
|
|
using the Wikidata MCP server's search functionality.
|
|
"""
|
|
|
|
import yaml
|
|
import json
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
import sys
|
|
|
|
def collect_entities(custodian_dir: Path, entity_types: list[str], min_count: int = 2) -> dict[str, Counter]:
    """Count entity-name occurrences per entity type across custodian files.

    Every ``NL-*.yaml`` file directly inside *custodian_dir* is parsed. For
    each entry under ``validated_entity_claims -> claims`` whose
    ``entity_type`` is one of *entity_types*, the ``entity`` string is counted.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
        entity_types: Entity type codes to collect; other types are ignored.
        min_count: Names seen fewer than this many times are dropped.

    Returns:
        A dict mapping each requested entity type to a ``Counter`` of
        entity name -> occurrence count, restricted to names that occur at
        least *min_count* times.

    Files that cannot be read or parsed are reported on stderr and skipped;
    malformed individual claims are skipped without discarding the rest of
    the file.
    """
    entity_by_type: dict[str, Counter] = {t: Counter() for t in entity_types}

    # Sorted so the insertion order of names (which decides ties in
    # Counter.most_common) does not depend on filesystem listing order.
    for path in sorted(custodian_dir.glob('NL-*.yaml')):
        try:
            with open(path, encoding='utf-8') as fh:
                data = yaml.safe_load(fh)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Best effort: one bad file must not stop the whole run, but it
            # should not disappear silently either.
            print(f"Warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            continue
        section = data.get('validated_entity_claims')
        if not isinstance(section, dict):
            continue
        claims = section.get('claims')
        if not isinstance(claims, list):
            continue

        for claim in claims:
            if not isinstance(claim, dict):
                continue
            etype = claim.get('entity_type')
            ename = claim.get('entity')
            if not isinstance(etype, str) or etype not in entity_by_type:
                continue
            # A missing or empty name is not an entity; do not count it.
            if not isinstance(ename, str) or not ename:
                continue
            entity_by_type[etype][ename] += 1

    # Filter by minimum count
    return {
        etype: Counter({
            name: count for name, count in counter.items()
            if count >= min_count
        })
        for etype, counter in entity_by_type.items()
    }
|
|
|
|
def main():
    """Build the list of Wikidata link candidates and save it as YAML.

    Collects entities of the selected types from ``data/custodian``, keeps
    the 50 most frequent names per type (each occurring at least twice),
    prints a per-type breakdown, and writes the result to
    ``reports/wikidata_link_candidates.yaml``.

    Returns:
        The list of candidate dicts that was written. Each has the keys
        ``entity_name``, ``entity_type``, ``occurrence_count``,
        ``wikidata_id``, ``wikidata_label`` and ``link_status``; the
        Wikidata fields start out empty with status ``'pending'``.
    """
    custodian_dir = Path('data/custodian')

    # Entity types to link
    entity_types = [
        'GRP.HER',  # Heritage institutions -> Q-items
        'TOP.SET',  # Settlements -> Q-items
        'AGT.PER',  # Persons -> Q-items
        'GRP.ASS',  # Associations -> Q-items
        'GRP.GOV',  # Government orgs -> Q-items
    ]

    print("Collecting entities from custodian files...")
    entities = collect_entities(custodian_dir, entity_types, min_count=2)

    # Create candidates for linking
    candidates = [
        {
            'entity_name': name,
            'entity_type': etype,
            'occurrence_count': count,
            'wikidata_id': None,
            'wikidata_label': None,
            'link_status': 'pending'
        }
        for etype, counter in entities.items()
        for name, count in counter.most_common(50)  # Top 50 per type
    ]

    print(f"\nFound {len(candidates)} candidate entities for Wikidata linking")
    print("\nBreakdown by type:")
    # Tally once instead of rescanning every candidate for each type.
    per_type = Counter(c['entity_type'] for c in candidates)
    for etype in entity_types:
        print(f" {etype}: {per_type[etype]}")

    # Save candidates
    output_file = Path('reports/wikidata_link_candidates.yaml')
    # The reports directory may not exist on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 rather than with the platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump({
            'generated': datetime.now().isoformat(),
            'total_candidates': len(candidates),
            'candidates': candidates
        }, f, allow_unicode=True, default_flow_style=False)

    print(f"\nSaved candidates to {output_file}")
    print("\nTo link entities, use the Wikidata MCP server:")
    print(" wikidata-authenticated_search_entity(query='Amsterdam')")

    return candidates
|
|
|
|
# Script entry point: build the candidate file only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|