#!/usr/bin/env python3
"""
Prepare extracted entities for linking to Wikidata.

This script finds high-frequency entities in the custodian YAML files and
writes them to a candidates report with a ``pending`` link status. The
actual lookup is done separately with the Wikidata MCP server's search
functionality.
"""

import json
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path

import yaml

# An entity must occur at least this many times to become a candidate.
MIN_OCCURRENCES = 2
# Only the most frequent entities of each type are kept as candidates.
TOP_N_PER_TYPE = 50


def collect_entities(custodian_dir: Path, entity_types: list[str], min_count: int = 2) -> dict[str, Counter]:
    """Collect entities by type from custodian files.

    Every ``NL-*.yaml`` file directly inside ``custodian_dir`` is parsed and
    the items under ``validated_entity_claims -> claims`` are tallied by
    their ``entity_type`` / ``entity`` fields.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
        entity_types: Entity type codes to tally; other types are ignored.
        min_count: Minimum number of occurrences an entity needs to be kept.

    Returns:
        A mapping from each requested entity type to a ``Counter`` of entity
        names that occur at least ``min_count`` times.
    """
    entity_by_type = {t: Counter() for t in entity_types}

    # Sort the paths: glob order depends on the filesystem, and Counter
    # tie-breaking in most_common() follows insertion order.
    for path in sorted(custodian_dir.glob('NL-*.yaml')):
        try:
            with open(path, encoding='utf-8') as fh:
                data = yaml.safe_load(fh)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Best effort: one unreadable file must not abort the whole run,
            # but it should not disappear silently either.
            print(f"Warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        # Files that do not have the expected shape simply contribute nothing.
        if not isinstance(data, dict):
            continue
        section = data.get('validated_entity_claims')
        if not isinstance(section, dict):
            continue
        claims = section.get('claims')
        if not isinstance(claims, list):
            continue

        for claim in claims:
            if not isinstance(claim, dict):
                continue
            etype = claim.get('entity_type', '')
            ename = claim.get('entity', '')
            # Both must be strings: etype is used as a dict key and ename as
            # a Counter key that is later serialised to YAML.
            if not isinstance(etype, str) or not isinstance(ename, str):
                continue
            if etype in entity_by_type:
                entity_by_type[etype][ename] += 1

    # Filter by minimum count
    for etype in entity_by_type:
        entity_by_type[etype] = Counter({
            k: v for k, v in entity_by_type[etype].items()
            if v >= min_count
        })

    return entity_by_type


def main() -> list[dict]:
    """Build the Wikidata link candidates report.

    Collects frequent entities from ``data/custodian``, keeps the top
    entries per entity type, prints a summary and writes the candidates to
    ``reports/wikidata_link_candidates.yaml``.

    Returns:
        The list of candidate dicts that was written to the report.
    """
    custodian_dir = Path('data/custodian')

    # Entity types to link
    entity_types = [
        'GRP.HER',  # Heritage institutions -> Q-items
        'TOP.SET',  # Settlements -> Q-items
        'AGT.PER',  # Persons -> Q-items
        'GRP.ASS',  # Associations -> Q-items
        'GRP.GOV',  # Government orgs -> Q-items
    ]

    print("Collecting entities from custodian files...")
    entities = collect_entities(custodian_dir, entity_types, min_count=MIN_OCCURRENCES)

    # Create candidates for linking
    candidates = [
        {
            'entity_name': name,
            'entity_type': etype,
            'occurrence_count': count,
            'wikidata_id': None,
            'wikidata_label': None,
            'link_status': 'pending',
        }
        for etype, counter in entities.items()
        for name, count in counter.most_common(TOP_N_PER_TYPE)
    ]

    print(f"\nFound {len(candidates)} candidate entities for Wikidata linking")
    print("\nBreakdown by type:")
    per_type = Counter(c['entity_type'] for c in candidates)
    for etype in entity_types:
        print(f" {etype}: {per_type[etype]}")

    # Save candidates
    output_file = Path('reports/wikidata_link_candidates.yaml')
    # The reports directory may not exist yet on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True emits raw non-ASCII characters, so the file must be
    # written as UTF-8 regardless of the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump({
            'generated': datetime.now().isoformat(),
            'total_candidates': len(candidates),
            'candidates': candidates,
        }, f, allow_unicode=True, default_flow_style=False)

    print(f"\nSaved candidates to {output_file}")
    print("\nTo link entities, use the Wikidata MCP server:")
    print(" wikidata-authenticated_search_entity(query='Amsterdam')")

    return candidates


if __name__ == '__main__':
    main()