- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
134 lines
4.2 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Batch 14: Quick Wikidata Check for Rodulfo Philippi Museum
|
|
Focus on the most promising candidate: museum named after famous scientist
|
|
"""
|
|
|
|
import json
|
|
import requests
|
|
import time
|
|
|
|
def search_wikidata_simple(search_term: str, language='es'):
    """Query the Wikidata ``wbsearchentities`` endpoint.

    Returns the raw list of search hits (up to 10), or an empty list
    when the request fails for any reason.
    """
    api_endpoint = 'https://www.wikidata.org/w/api.php'
    query = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': language,
        'type': 'item',
        'search': search_term,
        'limit': 10,
    }

    try:
        resp = requests.get(api_endpoint, params=query, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except Exception as e:
        print(f"Error: {e}")
        return []
    return payload.get('search', [])
|
|
|
|
def get_entity_details(qid: str):
    """Fetch a Wikidata entity and condense it into a small summary dict.

    Parameters
    ----------
    qid : str
        Wikidata item identifier, e.g. ``'Q12345'``.

    Returns
    -------
    dict | None
        Keys: ``qid``, ``label``, ``description``, ``location_qid``,
        ``instance_of_qid``. Spanish values are preferred, falling back
        to English. Returns ``None`` when the HTTP request or JSON
        decoding fails.
    """
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{qid}.json'

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        entity = data['entities'][qid]

        return {
            'qid': qid,
            'label': _preferred_text(entity, 'labels', 'No label'),
            'description': _preferred_text(entity, 'descriptions', 'No description'),
            # P131: located in the administrative territorial entity
            'location_qid': _first_claim_qid(entity, 'P131'),
            # P31: instance of
            'instance_of_qid': _first_claim_qid(entity, 'P31'),
        }
    except Exception as e:
        print(f"Error getting entity {qid}: {e}")
        return None


def _preferred_text(entity: dict, field: str, default: str) -> str:
    """Return the Spanish value of *field* ('labels'/'descriptions'),
    falling back to English, then to *default*."""
    values = entity.get(field, {})
    for lang in ('es', 'en'):
        text = values.get(lang, {}).get('value')
        if text:
            return text
    return default


def _first_claim_qid(entity: dict, prop: str) -> str:
    """Return the QID referenced by the first claim of *prop*, or 'Unknown'.

    Guards against novalue/somevalue snaks, which have no 'datavalue':
    previously a bare ['datavalue'] lookup raised KeyError, tripping the
    caller's broad except and discarding the whole entity summary.
    """
    claims = entity.get('claims', {})
    if prop in claims and claims[prop]:
        snak = claims[prop][0].get('mainsnak', {})
        value = snak.get('datavalue', {}).get('value')
        if isinstance(value, dict) and 'id' in value:
            return value['id']
    return 'Unknown'
|
|
|
|
def main():
    """Search Wikidata for Philippi-museum candidates and save the results."""

    search_terms = [
        "Museo Rodolfo Philippi Chile",
        "Museo Rudolph Philippi Chile",
        "Museo Philippi Chañaral",
        "Museo Philippi Valdivia",
        "Instituto Alemán Puerto Montt"
    ]

    all_results = {}
    banner = "=" * 80

    print(banner)
    print("Chilean Batch 14: Quick Wikidata Search")
    print(banner)
    print()

    for term in search_terms:
        print(f"Searching: {term}")
        hits = search_wikidata_simple(term)

        if not hits:
            print(" No results found")
            all_results[term] = []
        else:
            print(f" Found {len(hits)} results:")
            detailed = []

            # Inspect only the top 5 hits; fetch full entity data for each.
            for hit in hits[:5]:
                qid = hit['id']
                print(f" {qid}: {hit.get('label', 'No label')}")
                print(f" {hit.get('description', 'No description')}")

                info = get_entity_details(qid)
                if info:
                    detailed.append(info)

                time.sleep(0.5)  # Be nice to Wikidata

            all_results[term] = detailed

        print()
        time.sleep(1)  # Rate limiting

    # Persist everything for later review.
    out_path = 'scripts/batch14_quick_search_results.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"Results saved to: {out_path}")
    print()


if __name__ == '__main__':
    main()
|