glam/scripts/query_wikidata_chilean_batch11.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

278 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""
Query Wikidata for Chilean Museums - Batch 11 (Final Museum Push)
=================================================================
Target: 13 remaining museums needing Wikidata enrichment
Strategy: Direct SPARQL queries with fuzzy matching
Goal: Add 5-8 museums → reach 60-63/90 (67-70% overall coverage)
"""
import requests
import json
from time import sleep
from rapidfuzz import fuzz
# Museums that the earlier analysis found without a Wikidata identifier.
# Each record carries:
#   name         - institution name as held in the local dataset
#   city/region  - compared against the Wikidata location label when matching
#   search_terms - name variants that are fuzzy-matched against Wikidata labels
TARGET_MUSEUMS = [
    {
        "name": "Museo de Tocopilla",
        "city": "María Elena",
        "region": "Tocopilla",
        "search_terms": ["Museo de Tocopilla", "Museo Histórico Tocopilla"],
    },
    {
        "name": "Museo Rodulfo Philippi",
        "city": "Chañaral",
        "region": "Chañaral",
        "search_terms": ["Museo Rodulfo Philippi", "Museo Rudolf Philippi"],
    },
    {
        "name": "Museo del Libro del Mar",
        "city": "San Antonio",
        "region": "San Antonio",
        "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar"],
    },
    {
        "name": "Museo de Historia Local Los Perales",
        "city": "Quilpué",
        "region": "Marga Marga",
        "search_terms": ["Museo Historia Local Los Perales", "Museo Quilpué"],
    },
    {
        "name": "Museo Histórico-Arqueológico",
        "city": "Quillota",
        "region": "Quillota",
        "search_terms": ["Museo Histórico Arqueológico Quillota", "Museo Quillota"],
    },
    {
        "name": "Museo Histórico y Cultural",
        "city": "Cauquenes",
        "region": "Cauquenes",
        "search_terms": ["Museo Histórico Cultural Cauquenes", "Museo Cauquenes"],
    },
    {
        "name": "Museo Mapuche de Purén",
        "city": "Capitán Pastene",
        "region": "Malleco",
        "search_terms": ["Museo Mapuche Purén", "Museo Capitán Pastene"],
    },
    {
        "name": "Museo Rudolph Philippi",
        "city": "Valdivia",
        "region": "Valdivia",
        "search_terms": ["Museo Rudolph Philippi", "Museo Rudolf Philippi Valdivia"],
    },
    {
        "name": "Museo de las Iglesias",
        "city": "Castro",
        "region": "Chiloé",
        "search_terms": ["Museo Iglesias Chiloé", "Museo Castro"],
    },
    {
        "name": "Museo Pleistocénico",
        "city": "Osorno",
        "region": "Osorno",
        "search_terms": ["Museo Pleistocénico", "Museo Pleistocene Osorno"],
    },
    {
        "name": "Red de Museos Aysén",
        "city": "Coyhaique",
        "region": "Aisén",
        "search_terms": ["Red Museos Aysén", "Museo Regional Aysén"],
    },
    {
        "name": "Museo Territorial Yagan Usi",
        "city": "Cabo de Hornos",
        "region": "Antártica Chilena",
        "search_terms": ["Museo Yagan", "Museo Territorial Yagan"],
    },
    {
        "name": "Museo Histórico Municipal",
        "city": "Provincia de Última Esperanza",
        "region": "Última Esperanza",
        "search_terms": ["Museo Histórico Puerto Natales", "Museo Última Esperanza"],
    },
]
def query_chilean_museums():
    """
    Fetch Chilean museums from the Wikidata SPARQL endpoint in one request.

    The query selects items whose "instance of" (P31, following P279
    subclasses) is one of the listed museum types and whose country (P17)
    is Chile (Q298). Location (P131), inception (P571) and coordinates
    (P625) are optional; labels are requested in Spanish, then English.

    Returns:
        The list of SPARQL result bindings (``results.bindings`` of the JSON
        response), or an empty list when the request fails, the server
        answers with a non-200 status, or the payload is not the expected
        JSON shape. Failures are reported on stdout, never raised.
    """
    endpoint = "https://query.wikidata.org/sparql"
    # The query text is kept flush-left so the string sent to the endpoint
    # is unchanged by the indentation of this function.
    query = """
SELECT DISTINCT ?museum ?museumLabel ?museumDescription ?location ?locationLabel ?founded ?coordinates
WHERE {
# Museum types
VALUES ?museumType {
wd:Q33506 wd:Q207694 wd:Q4737021 wd:Q812979 wd:Q10283556
wd:Q641152 wd:Q1124131 wd:Q17431399 wd:Q2772772 wd:Q2001305
}
?museum wdt:P31/wdt:P279* ?museumType .
?museum wdt:P17 wd:Q298 . # Country: Chile
OPTIONAL { ?museum wdt:P131 ?location . }
OPTIONAL { ?museum wdt:P571 ?founded . }
OPTIONAL { ?museum wdt:P625 ?coordinates . }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
}
ORDER BY ?founded
"""
    headers = {
        "User-Agent": "GLAM-Extractor/1.0 (Chilean Heritage Enrichment)",
        "Accept": "application/sparql-results+json",
    }

    print("🔍 Querying Wikidata for all Chilean museums...")

    # Transport-level problems (DNS, timeout, connection reset, ...) only.
    try:
        response = requests.get(
            endpoint,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60,
        )
    except requests.RequestException as e:
        print(f"❌ Query error: {e}")
        return []

    if response.status_code != 200:
        print(f"❌ Query failed with status {response.status_code}")
        return []

    # A 200 response may still carry a body that is not valid JSON
    # (ValueError) or lacks the results/bindings structure
    # (KeyError/TypeError).
    try:
        bindings = response.json()['results']['bindings']
    except (ValueError, KeyError, TypeError) as e:
        print(f"❌ Query error: {e}")
        return []

    print(f"✅ Retrieved {len(bindings)} museums from Wikidata")
    return bindings
def fuzzy_match_museum(target_museum, wikidata_museums):
    """
    Find the best Wikidata match for one target museum.

    Each search term of the target is compared, case-insensitively, with the
    label of every Wikidata candidate using three rapidfuzz scorers
    (``ratio``, ``partial_ratio``, ``token_sort_ratio``); the highest of the
    three is the name score. A candidate whose location label contains the
    target's city or region receives a bonus of 0.1, so the reported
    ``match_score`` can exceed 1.0. Only scores of at least 0.80 (after the
    bonus) qualify, and the first candidate reaching the highest score wins.

    Args:
        target_museum: dict with 'city', 'region' and 'search_terms' keys.
        wikidata_museums: iterable of SPARQL binding dicts; each value is
            read as ``binding[<variable>]['value']``.

    Returns:
        A dict with 'q_number', 'wikidata_name', 'location', 'description',
        'match_score', 'founded' and 'coordinates', or None when no
        candidate reaches the threshold.
    """
    # Scores stay on rapidfuzz's 0-100 scale until the result is built.
    # Adding the bonus after dividing by 100 is subject to float rounding
    # (0.7 + 0.1 == 0.7999999999999999 < 0.80), which would reject a
    # boosted candidate that lands exactly on the threshold.
    threshold = 80
    location_bonus = 10

    # Loop-invariant: the target's location never changes between candidates.
    city = target_museum['city'].lower()
    region = target_museum['region'].lower()

    best_match = None
    best_score = 0.0
    for wd_museum in wikidata_museums:
        wd_name = wd_museum.get('museumLabel', {}).get('value', '')
        wd_location = wd_museum.get('locationLabel', {}).get('value', '')
        name_lower = wd_name.lower()
        location_lower = wd_location.lower()
        location_matches = city in location_lower or region in location_lower

        # Try matching against all search terms
        for search_term in target_museum['search_terms']:
            term_lower = search_term.lower()
            score = max(
                fuzz.ratio(term_lower, name_lower),
                fuzz.partial_ratio(term_lower, name_lower),
                fuzz.token_sort_ratio(term_lower, name_lower),
            )
            if location_matches:
                score += location_bonus

            # Keep only strict improvements that also clear the threshold.
            if score < threshold or score <= best_score:
                continue

            best_score = score
            best_match = {
                # Entity URI ends with the Q-identifier.
                'q_number': wd_museum['museum']['value'].split('/')[-1],
                'wikidata_name': wd_name,
                'location': wd_location,
                'description': wd_museum.get('museumDescription', {}).get('value', ''),
                'match_score': score / 100,
                'founded': wd_museum.get('founded', {}).get('value', 'Unknown'),
                'coordinates': wd_museum.get('coordinates', {}).get('value', 'Unknown'),
            }
    return best_match
def main():
    """
    Run the batch: query Wikidata, match every target museum, report, save.

    Steps:
      1. Fetch all Chilean museums via query_chilean_museums(); abort with a
         message if nothing comes back.
      2. Run fuzzy_match_museum() for each entry of TARGET_MUSEUMS and print
         the outcome.
      3. Print a summary of matched and unmatched museums.
      4. Write the results to 'scripts/batch11_query_results.json' (path is
         relative to the current working directory) and print the projected
         coverage.
    """
    from datetime import date  # only needed here, to stamp the results file

    # Coverage bookkeeping: the module docstring's goal ("add 5-8 museums ->
    # reach 60-63/90") implies 55 institutions already enriched out of 90.
    already_enriched = 55
    total_institutions = 90

    banner = "=" * 80
    rule = "-" * 80
    target_count = len(TARGET_MUSEUMS)

    print(banner)
    print("CHILEAN MUSEUMS WIKIDATA QUERY - BATCH 11")
    print(banner)
    print(f"Target: {target_count} museums")
    print("Strategy: Single SPARQL query + fuzzy matching (threshold: 0.80)")
    print(banner)

    # Get all Chilean museums from Wikidata
    wikidata_museums = query_chilean_museums()
    if not wikidata_museums:
        print("❌ Failed to retrieve museums from Wikidata")
        return

    print("\n" + banner)
    print("MATCHING MUSEUMS")
    print(banner)

    matches = []
    no_matches = []
    for target in TARGET_MUSEUMS:
        print(f"\n🔍 Searching: {target['name']} ({target['city']}, {target['region']})")
        match = fuzzy_match_museum(target, wikidata_museums)
        if match:
            print(f" ✅ MATCH: {match['wikidata_name']} ({match['q_number']})")
            print(f" 📍 Location: {match['location']}")
            print(f" 📊 Score: {match['match_score']:.2f}")
            matches.append({
                'museum': target,
                'match': match,
            })
        else:
            print(" ❌ No match found (threshold: 0.80)")
            no_matches.append(target)

    # Summary
    print("\n" + banner)
    print("BATCH 11 QUERY SUMMARY")
    print(banner)
    print(f"✅ Matches found: {len(matches)}/{target_count}")
    print(f"❌ No matches: {len(no_matches)}/{target_count}")

    if matches:
        print("\n" + rule)
        print("MATCHED MUSEUMS:")
        print(rule)
        for item in matches:
            print(f"{item['museum']['name']}")
            print(f"{item['match']['wikidata_name']} ({item['match']['q_number']})")
            print(f" Score: {item['match']['match_score']:.2f}")

    if no_matches:
        print("\n" + rule)
        print("MUSEUMS WITHOUT WIKIDATA MATCHES:")
        print(rule)
        for museum in no_matches:
            print(f"{museum['name']} ({museum['city']}, {museum['region']})")

    # Projected coverage is computed once and reused for the file and stdout.
    projected = already_enriched + len(matches)
    projected_percent = f"{(projected / total_institutions * 100):.1f}%"

    # Save results
    output = {
        'batch': 11,
        # Stamp the actual run date so reruns do not record a stale one.
        'query_date': date.today().isoformat(),
        'matches': matches,
        'no_matches': no_matches,
        'match_rate': f"{len(matches)}/{target_count}",
        'expected_coverage': f"{projected}/{total_institutions}",
        'expected_coverage_percent': projected_percent,
    }
    with open('scripts/batch11_query_results.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("\n💾 Results saved to: scripts/batch11_query_results.json")
    print(f"\n📊 Projected coverage: {projected}/{total_institutions} = {projected_percent}")

    # 8 new matches bring the total to 63/90 = 70%.
    if len(matches) >= 8:
        print("🎯 SUCCESS! Reached 70% coverage target!")
    elif len(matches) >= 5:
        print("✅ Good progress toward 70% coverage goal")
    else:
        print("⚠️ May need additional enrichment strategies")
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()