glam/scripts/quick_wikidata_search_batch14.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

134 lines
4.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean Batch 14: Quick Wikidata Check for Rodulfo Philippi Museum
Focus on the most promising candidate: museum named after famous scientist
"""
import json
import requests
import time
def search_wikidata_simple(search_term: str, language='es', session=None):
    """Search Wikidata items by free text via the ``wbsearchentities`` API.

    Args:
        search_term: Text to look up. Blank or whitespace-only terms are
            answered locally with an empty list, without a network call.
        language: Language code sent to the API (default ``'es'``).
        session: Optional object exposing a ``requests``-style ``get`` method
            (e.g. a ``requests.Session``) to reuse connections. When given,
            its own headers are used unchanged; when omitted, the module-level
            ``requests`` API is used with an identifying User-Agent.

    Returns:
        The list stored under the ``search`` key of the API response
        (requested with ``limit=10``), or an empty list when the term is
        blank, the request fails, or the API answers with an error payload.
        Failures are printed rather than raised.
    """
    # The API rejects an empty `search` value, so skip the round trip.
    if not search_term or not search_term.strip():
        return []

    url = 'https://www.wikidata.org/w/api.php'
    request_kwargs = {
        'params': {
            'action': 'wbsearchentities',
            'format': 'json',
            'language': language,
            'type': 'item',
            'search': search_term,
            'limit': 10,
        },
        'timeout': 10,
    }
    if session is None:
        http = requests
        # Wikimedia's User-Agent policy asks clients to identify themselves;
        # generic library agents may be throttled or refused.
        # TODO(maintainer): append contact details to this string.
        request_kwargs['headers'] = {
            'User-Agent': 'glam-quick-wikidata-search/1.0 (batch14 script)',
        }
    else:
        http = session

    try:
        response = http.get(url, **request_kwargs)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older requests releases.
        print(f"Error: {e}")
        return []

    if not isinstance(data, dict):
        print(f"Error: unexpected response payload: {data!r}")
        return []

    # API-level failures arrive as HTTP 200 with an `error` object; report
    # them so they are not mistaken for "no matches".
    api_error = data.get('error')
    if api_error:
        print(f"Error: {api_error}")
        return []

    return data.get('search', [])
def _localized_value(entity, field, fallback):
    """Return the Spanish, else English, text of *field*, else *fallback*."""
    # `or {}` also tolerates an empty container serialised as a list.
    values = entity.get(field) or {}
    for lang in ('es', 'en'):
        text = (values.get(lang) or {}).get('value')
        if text:
            return text
    return fallback


def _first_item_id(entity, prop):
    """Return the item id of the first *prop* claim carrying one, else 'Unknown'."""
    claims = entity.get('claims') or {}
    for claim in claims.get(prop, []):
        # "novalue"/"somevalue" snaks have no `datavalue`; skip them instead
        # of failing the whole entity.
        datavalue = (claim.get('mainsnak') or {}).get('datavalue') or {}
        value = datavalue.get('value')
        if isinstance(value, dict) and 'id' in value:
            return value['id']
    return 'Unknown'


def get_entity_details(qid: str, session=None):
    """Fetch one Wikidata entity and summarise the fields this script uses.

    Args:
        qid: Entity id (e.g. ``'Q42'``) interpolated into the
            ``Special:EntityData`` URL.
        session: Optional object exposing a ``requests``-style ``get`` method
            (e.g. a ``requests.Session``). When given, its own headers are
            used unchanged; when omitted, the module-level ``requests`` API
            is used with an identifying User-Agent.

    Returns:
        A dict with keys ``qid`` (the id that was requested), ``label`` and
        ``description`` (Spanish, falling back to English, then to
        ``'No label'`` / ``'No description'``), ``location_qid`` (first P131
        item id) and ``instance_of_qid`` (first P31 item id), the last two
        defaulting to ``'Unknown'``. Returns ``None`` after printing a
        message when the request fails or the response holds no entity.
    """
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{qid}.json'
    request_kwargs = {'timeout': 10}
    if session is None:
        http = requests
        # Wikimedia's User-Agent policy asks clients to identify themselves;
        # generic library agents may be throttled or refused.
        # TODO(maintainer): append contact details to this string.
        request_kwargs['headers'] = {
            'User-Agent': 'glam-quick-wikidata-search/1.0 (batch14 script)',
        }
    else:
        http = session

    try:
        response = http.get(url, **request_kwargs)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older requests releases.
        print(f"Error getting entity {qid}: {e}")
        return None

    entities = data.get('entities') if isinstance(data, dict) else None
    if not isinstance(entities, dict) or not entities:
        print(f"Error getting entity {qid}: no entity data in response")
        return None

    entity = entities.get(qid)
    if entity is None and len(entities) == 1:
        # NOTE(review): a redirected id appears to be served under its
        # target's id; accept the lone entity in that case -- confirm.
        entity = next(iter(entities.values()))
    if not isinstance(entity, dict):
        print(f"Error getting entity {qid}: entity missing from response")
        return None

    return {
        'qid': qid,
        'label': _localized_value(entity, 'labels', 'No label'),
        'description': _localized_value(entity, 'descriptions', 'No description'),
        'location_qid': _first_item_id(entity, 'P131'),
        'instance_of_qid': _first_item_id(entity, 'P31'),
    }
def main():
    """Run the Batch 14 searches and save the enriched hits as JSON.

    For every hard-coded search term this prints up to five Wikidata
    matches, fetches extra details for each match, and collects the
    successful detail dicts per term. The mapping of search term -> list of
    detail dicts is written to ``scripts/batch14_quick_search_results.json``
    (resolved against the current working directory).
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    search_terms = [
        "Museo Rodolfo Philippi Chile",
        "Museo Rudolph Philippi Chile",
        "Museo Philippi Chañaral",
        "Museo Philippi Valdivia",
        "Instituto Alemán Puerto Montt"
    ]
    all_results = {}
    # The terms overlap heavily, so the same QID tends to come back several
    # times; remember successful lookups to avoid refetching them.
    details_by_qid = {}

    print("=" * 80)
    print("Chilean Batch 14: Quick Wikidata Search")
    print("=" * 80)
    print()

    for term in search_terms:
        print(f"Searching: {term}")
        results = search_wikidata_simple(term)
        if results:
            print(f" Found {len(results)} results:")
            term_results = []
            for result in results[:5]:
                qid = result['id']
                label = result.get('label', 'No label')
                description = result.get('description', 'No description')
                print(f" {qid}: {label}")
                print(f" {description}")

                # Get more details (only failed lookups are retried later)
                details = details_by_qid.get(qid)
                if details is None:
                    details = get_entity_details(qid)
                    time.sleep(0.5)  # Be nice to Wikidata
                    if details:
                        details_by_qid[qid] = details
                if details:
                    term_results.append(details)
            all_results[term] = term_results
        else:
            print(" No results found")
            all_results[term] = []
        print()
        time.sleep(1)  # Rate limiting

    # Save results
    output_file = 'scripts/batch14_quick_search_results.json'
    # Create the target directory up front so a missing `scripts/` folder
    # cannot throw away everything that was just fetched.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"Results saved to: {output_file}")
    print()


if __name__ == '__main__':
    main()