- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
278 lines
9.3 KiB
Python
#!/usr/bin/env python3
"""
Query Wikidata for Chilean Museums - Batch 11 (Final Museum Push)
=================================================================

Target: 13 remaining museums needing Wikidata enrichment
Strategy: Direct SPARQL queries with fuzzy matching
Goal: Add 5-8 museums → reach 60-63/90 (67-70% overall coverage)
"""

import json
from datetime import date
from pathlib import Path
from time import sleep  # noqa: F401 -- kept: present in the original import block

import requests
from rapidfuzz import fuzz

# Museums without Wikidata from analysis.
# Each entry: canonical name, city/region (used to boost fuzzy-match scores
# when the Wikidata location agrees), and alternative search terms (spelling
# variants, shortened forms) tried against Wikidata labels.
TARGET_MUSEUMS = [
    {
        "name": "Museo de Tocopilla",
        "city": "María Elena",
        "region": "Tocopilla",
        "search_terms": ["Museo de Tocopilla", "Museo Histórico Tocopilla"],
    },
    {
        "name": "Museo Rodulfo Philippi",
        "city": "Chañaral",
        "region": "Chañaral",
        "search_terms": ["Museo Rodulfo Philippi", "Museo Rudolf Philippi"],
    },
    {
        "name": "Museo del Libro del Mar",
        "city": "San Antonio",
        "region": "San Antonio",
        "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar"],
    },
    {
        "name": "Museo de Historia Local Los Perales",
        "city": "Quilpué",
        "region": "Marga Marga",
        "search_terms": ["Museo Historia Local Los Perales", "Museo Quilpué"],
    },
    {
        "name": "Museo Histórico-Arqueológico",
        "city": "Quillota",
        "region": "Quillota",
        "search_terms": ["Museo Histórico Arqueológico Quillota", "Museo Quillota"],
    },
    {
        "name": "Museo Histórico y Cultural",
        "city": "Cauquenes",
        "region": "Cauquenes",
        "search_terms": ["Museo Histórico Cultural Cauquenes", "Museo Cauquenes"],
    },
    {
        "name": "Museo Mapuche de Purén",
        "city": "Capitán Pastene",
        "region": "Malleco",
        "search_terms": ["Museo Mapuche Purén", "Museo Capitán Pastene"],
    },
    {
        "name": "Museo Rudolph Philippi",
        "city": "Valdivia",
        "region": "Valdivia",
        "search_terms": ["Museo Rudolph Philippi", "Museo Rudolf Philippi Valdivia"],
    },
    {
        "name": "Museo de las Iglesias",
        "city": "Castro",
        "region": "Chiloé",
        "search_terms": ["Museo Iglesias Chiloé", "Museo Castro"],
    },
    {
        "name": "Museo Pleistocénico",
        "city": "Osorno",
        "region": "Osorno",
        "search_terms": ["Museo Pleistocénico", "Museo Pleistocene Osorno"],
    },
    {
        "name": "Red de Museos Aysén",
        "city": "Coyhaique",
        "region": "Aisén",
        "search_terms": ["Red Museos Aysén", "Museo Regional Aysén"],
    },
    {
        "name": "Museo Territorial Yagan Usi",
        "city": "Cabo de Hornos",
        "region": "Antártica Chilena",
        "search_terms": ["Museo Yagan", "Museo Territorial Yagan"],
    },
    {
        "name": "Museo Histórico Municipal",
        "city": "Provincia de Última Esperanza",
        "region": "Última Esperanza",
        "search_terms": ["Museo Histórico Puerto Natales", "Museo Última Esperanza"],
    },
]
def query_chilean_museums():
    """Query all Chilean museums from Wikidata in one efficient SPARQL query.

    Returns:
        list: SPARQL result bindings (dicts keyed by ?museum, ?museumLabel,
        ?museumDescription, ?location(Label), ?founded, ?coordinates), or an
        empty list on HTTP failure or any network/parse error.
    """
    endpoint = "https://query.wikidata.org/sparql"

    # One broad query (all museum subclasses, country = Chile) is far cheaper
    # and friendlier to WDQS rate limits than one query per target museum.
    query = """
    SELECT DISTINCT ?museum ?museumLabel ?museumDescription ?location ?locationLabel ?founded ?coordinates
    WHERE {
      # Museum types
      VALUES ?museumType {
        wd:Q33506 wd:Q207694 wd:Q4737021 wd:Q812979 wd:Q10283556
        wd:Q641152 wd:Q1124131 wd:Q17431399 wd:Q2772772 wd:Q2001305
      }

      ?museum wdt:P31/wdt:P279* ?museumType .
      ?museum wdt:P17 wd:Q298 .  # Country: Chile

      OPTIONAL { ?museum wdt:P131 ?location . }
      OPTIONAL { ?museum wdt:P571 ?founded . }
      OPTIONAL { ?museum wdt:P625 ?coordinates . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en" . }
    }
    ORDER BY ?founded
    """

    # WDQS requires a descriptive User-Agent; ask for JSON results explicitly.
    headers = {
        "User-Agent": "GLAM-Extractor/1.0 (Chilean Heritage Enrichment)",
        "Accept": "application/sparql-results+json",
    }

    try:
        print("🔍 Querying Wikidata for all Chilean museums...")
        response = requests.get(
            endpoint,
            params={"query": query, "format": "json"},
            headers=headers,
            timeout=60,
        )

        if response.status_code == 200:
            bindings = response.json()['results']['bindings']
            print(f"✅ Retrieved {len(bindings)} museums from Wikidata")
            return bindings

        print(f"❌ Query failed with status {response.status_code}")
        return []

    except Exception as e:
        # Best-effort script: report and return [] so main() can bail out
        # instead of crashing on transient network errors.
        print(f"❌ Query error: {e}")
        return []
def fuzzy_match_museum(target_museum, wikidata_museums):
    """Find the best match for a target museum in the Wikidata results.

    Args:
        target_museum: dict with 'name', 'city', 'region', 'search_terms'
            (one entry of TARGET_MUSEUMS).
        wikidata_museums: SPARQL bindings as returned by
            query_chilean_museums().

    Returns:
        dict with q_number / wikidata_name / location / description /
        match_score / founded / coordinates for the highest-scoring
        candidate at or above the 0.80 threshold, or None if nothing
        clears it.
    """
    best_match = None
    best_score = 0.0

    for wd_museum in wikidata_museums:
        wd_name = wd_museum.get('museumLabel', {}).get('value', '')
        wd_location = wd_museum.get('locationLabel', {}).get('value', '')

        # Try matching against all search terms
        for search_term in target_museum['search_terms']:
            # Take the most generous of three fuzzy metrics so both
            # reorderings and partial/contained names can match.
            ratio = fuzz.ratio(search_term.lower(), wd_name.lower()) / 100
            partial = fuzz.partial_ratio(search_term.lower(), wd_name.lower()) / 100
            token_sort = fuzz.token_sort_ratio(search_term.lower(), wd_name.lower()) / 100

            score = max(ratio, partial, token_sort)

            # Boost score if location matches.
            # NOTE(review): the boost can push score above 1.0; kept as-is
            # since scores are only compared, not normalized.
            if target_museum['city'].lower() in wd_location.lower() or \
               target_museum['region'].lower() in wd_location.lower():
                score += 0.1

            # Keep the best candidate, but only if it clears the threshold.
            if score > best_score and score >= 0.80:
                best_score = score
                # Q-number is the last path segment of the entity URI.
                q_number = wd_museum['museum']['value'].split('/')[-1]
                best_match = {
                    'q_number': q_number,
                    'wikidata_name': wd_name,
                    'location': wd_location,
                    'description': wd_museum.get('museumDescription', {}).get('value', ''),
                    'match_score': score,
                    'founded': wd_museum.get('founded', {}).get('value', 'Unknown'),
                    'coordinates': wd_museum.get('coordinates', {}).get('value', 'Unknown'),
                }

    return best_match
def _report(matches, no_matches):
    """Print the batch summary: counts, matched pairs, and leftovers."""
    print("\n" + "="*80)
    print("BATCH 11 QUERY SUMMARY")
    print("="*80)
    print(f"✅ Matches found: {len(matches)}/{len(TARGET_MUSEUMS)}")
    print(f"❌ No matches: {len(no_matches)}/{len(TARGET_MUSEUMS)}")

    if matches:
        print("\n" + "-"*80)
        print("MATCHED MUSEUMS:")
        print("-"*80)
        for item in matches:
            print(f"{item['museum']['name']}")
            print(f" → {item['match']['wikidata_name']} ({item['match']['q_number']})")
            print(f" Score: {item['match']['match_score']:.2f}")

    if no_matches:
        print("\n" + "-"*80)
        print("MUSEUMS WITHOUT WIKIDATA MATCHES:")
        print("-"*80)
        for museum in no_matches:
            print(f" • {museum['name']} ({museum['city']}, {museum['region']})")


def _save_results(matches, no_matches):
    """Write the batch results to scripts/batch11_query_results.json."""
    enriched = 55 + len(matches)  # 55 museums already had Wikidata IDs before this batch
    output = {
        'batch': 11,
        # Was hard-coded to '2025-11-09'; record the actual run date instead.
        'query_date': date.today().isoformat(),
        'matches': matches,
        'no_matches': no_matches,
        'match_rate': f"{len(matches)}/{len(TARGET_MUSEUMS)}",
        'expected_coverage': f"{enriched}/90",
        'expected_coverage_percent': f"{(enriched/90*100):.1f}%",
    }

    out_path = Path('scripts') / 'batch11_query_results.json'
    # Create the directory if missing so the script also works from a fresh checkout.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("\n💾 Results saved to: scripts/batch11_query_results.json")


def main():
    """Run batch 11: fetch Chilean museums, fuzzy-match targets, report, save."""
    print("="*80)
    print("CHILEAN MUSEUMS WIKIDATA QUERY - BATCH 11")
    print("="*80)
    print(f"Target: {len(TARGET_MUSEUMS)} museums")
    print("Strategy: Single SPARQL query + fuzzy matching (threshold: 0.80)")
    print("="*80)

    # Get all Chilean museums from Wikidata
    wikidata_museums = query_chilean_museums()
    if not wikidata_museums:
        print("❌ Failed to retrieve museums from Wikidata")
        return

    print("\n" + "="*80)
    print("MATCHING MUSEUMS")
    print("="*80)

    matches = []
    no_matches = []

    for target in TARGET_MUSEUMS:
        print(f"\n🔍 Searching: {target['name']} ({target['city']}, {target['region']})")

        match = fuzzy_match_museum(target, wikidata_museums)

        if match:
            print(f" ✅ MATCH: {match['wikidata_name']} ({match['q_number']})")
            print(f" 📍 Location: {match['location']}")
            print(f" 📊 Score: {match['match_score']:.2f}")
            matches.append({'museum': target, 'match': match})
        else:
            print(" ❌ No match found (threshold: 0.80)")
            no_matches.append(target)

    _report(matches, no_matches)
    _save_results(matches, no_matches)

    projected = 55 + len(matches)  # 55 enriched before this batch
    print(f"\n📊 Projected coverage: {projected}/90 = {(projected/90*100):.1f}%")

    # 8 matches reaches the 70% overall-coverage goal (63/90).
    if len(matches) >= 8:
        print("🎯 SUCCESS! Reached 70% coverage target!")
    elif len(matches) >= 5:
        print("✅ Good progress toward 70% coverage goal")
    else:
        print("⚠️ May need additional enrichment strategies")
if __name__ == "__main__":
|
|
main()
|