glam/scripts/query_wikidata_chilean_museums.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

240 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Query Wikidata for Chilean Museums using SPARQL
Uses Wikidata Query Service to find museums in Chile with their Q-numbers
"""
import json
from pathlib import Path
from typing import List, Dict

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON
def query_chilean_museums() -> List[Dict]:
    """Query Wikidata for all museums in Chile.

    Sends a single SPARQL query to the public Wikidata Query Service that
    selects every item which is an instance of museum (Q33506), or of any
    subclass of it, and whose country (P17) is Chile (Q298).

    Returns:
        One dict per result row with the keys ``q_number``, ``name``,
        ``city``, ``founded`` (first four characters of the founding date,
        or ``""`` when absent) and ``wikidata_url``. City, coordinates and
        founding date are OPTIONAL in the query, so missing values come back
        as empty strings; a museum with several values for one of them may
        appear in more than one row. Coordinates are selected by the query
        but are not copied into the returned dicts.

        An empty list is returned when the query or the parsing of its
        response fails; the error is printed rather than raised.
    """
    endpoint = "https://query.wikidata.org/sparql"

    # Wikimedia's User-Agent policy asks clients to identify themselves;
    # requests carrying only a generic library agent are more likely to be
    # throttled or rejected by the query service.
    # TODO: append a contact URL or e-mail address for the project.
    sparql = SPARQLWrapper(
        endpoint,
        agent="glam-wikidata-enrichment/1.0 (query_wikidata_chilean_museums.py)",
    )

    # SPARQL query for museums in Chile
    # P31 = instance of, P17 = country, Q298 = Chile
    # Q33506 = museum, Q207694 = art museum, Q1459900 = archaeological museum, etc.
    query = """
    SELECT DISTINCT ?museum ?museumLabel ?cityLabel ?coords ?founded WHERE {
      # Museum types (including subclasses)
      ?museum wdt:P31/wdt:P279* wd:Q33506 .
      # Located in Chile
      ?museum wdt:P17 wd:Q298 .
      # Get city/location
      OPTIONAL { ?museum wdt:P131 ?city . }
      # Get coordinates
      OPTIONAL { ?museum wdt:P625 ?coords . }
      # Get founding date
      OPTIONAL { ?museum wdt:P571 ?founded . }
      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
      }
    }
    ORDER BY ?museumLabel
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for Chilean museums...")
    print(f" Endpoint: {endpoint}")
    print()

    # The whole fetch-and-parse step is deliberately best-effort: any failure
    # (network, HTTP, unexpected response shape) is reported and turned into
    # an empty result so the caller can bail out cleanly.
    try:
        results = sparql.query().convert()  # type: ignore
        museums = []
        for row in results["results"]["bindings"]:  # type: ignore
            # The entity URI ends in the Q-number, e.g. ".../entity/Q123".
            q_number = row["museum"]["value"].split("/")[-1]  # type: ignore
            museums.append({
                "q_number": q_number,
                "name": row.get("museumLabel", {}).get("value", ""),  # type: ignore
                "city": row.get("cityLabel", {}).get("value", ""),  # type: ignore
                # Keep only the leading four characters (the year) of the
                # timestamp; a missing binding yields "".
                "founded": row.get("founded", {}).get("value", "")[:4],  # type: ignore
                "wikidata_url": f"https://www.wikidata.org/wiki/{q_number}",
            })
        return museums
    except Exception as e:
        print(f"❌ Error querying Wikidata: {e}")
        return []
def load_chilean_institutions(file_path: Path) -> List[Dict]:
    """Load Chilean institutions from YAML file.

    Args:
        file_path: Path of the UTF-8 encoded YAML file to read.

    Returns:
        The parsed top-level YAML value (callers treat it as a list of
        institution dicts), or an empty list when the file contains no
        document at all.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document; hand back an empty list
    # so callers can call len() on and iterate over the result.
    return [] if data is None else data
def normalize_name(name: str) -> str:
    """Normalize institution name for matching.

    Lower-cases the name, drops ASCII apostrophes, trims leading/trailing
    whitespace and collapses every internal run of whitespace into a single
    space, so that names differing only in case or spacing compare equal.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name; ``""`` for an empty or whitespace-only input.
    """
    without_apostrophes = name.lower().replace("'", "")
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, which both trims the ends and collapses repeated spaces.
    return " ".join(without_apostrophes.split())
# Words ignored when counting shared "significant" words between two names.
_MATCH_STOPWORDS = frozenset({'de', 'del', 'la', 'el', 'museo', 'museum'})


def _match_first_city(inst: Dict) -> str:
    """Return the lower-cased city of the first location of *inst*, or ''."""
    # `locations` may be missing, None or an empty list; all mean "no city".
    locations = inst.get('locations') or [{}]
    first = locations[0] or {}
    return (first.get('city') or '').lower()


def _match_names(inst_name: str, wd_name: str) -> bool:
    """Tell whether two normalized, non-empty names refer to the same museum."""
    # Strategy 1: Exact match
    if inst_name == wd_name:
        return True
    # Strategy 2: Partial match (one name contains the other)
    if inst_name in wd_name or wd_name in inst_name:
        return True
    # Strategy 3: Key words match (museo + significant words)
    if 'museo' in inst_name and 'museo' in wd_name:
        shared = set(inst_name.split()) & set(wd_name.split())
        # Must share at least 2 significant words beyond "museo"
        return len(shared - _MATCH_STOPWORDS) >= 2
    return False


def find_matches(institutions: List[Dict], wikidata_museums: List[Dict]) -> List[Dict]:
    """Find matches between our institutions and Wikidata museums.

    Only institutions whose ``institution_type`` is ``'MUSEUM'`` and which
    carry no identifier with scheme ``'Wikidata'`` are considered. Each one
    is compared against the Wikidata entries in order, and the first entry
    whose name matches and whose city is compatible wins.

    A city is compatible when both sides have one and either contains the
    other, or when the Wikidata entry has no city at all.

    Entries or institutions with an empty name are skipped: an empty string
    is a substring of every name and would otherwise match everything.

    Args:
        institutions: Institution dicts with at least ``name``; optionally
            ``institution_type``, ``identifiers`` and ``locations``.
        wikidata_museums: Dicts with ``name`` and ``city`` keys (plus
            whatever else should be carried into the result).

    Returns:
        One dict per matched institution with the keys ``institution``,
        ``wikidata``, ``name_confidence`` (``'exact'`` or ``'partial'``) and
        ``city_match`` (bool).
    """
    matches = []

    # Filter institutions without Wikidata
    museums_without_wd = [
        inst for inst in institutions
        if inst.get('institution_type') == 'MUSEUM'
        and not any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in (inst.get('identifiers') or [])
        )
    ]

    print(f"📊 Matching {len(museums_without_wd)} institutions against {len(wikidata_museums)} Wikidata entries...")
    print()

    # Normalize every Wikidata entry once instead of once per institution.
    candidates = [
        (wd_museum, normalize_name(wd_museum['name']), wd_museum['city'].lower())
        for wd_museum in wikidata_museums
    ]

    for inst in museums_without_wd:
        inst_name = normalize_name(inst.get('name') or '')
        if not inst_name:
            continue
        inst_city = _match_first_city(inst)

        for wd_museum, wd_name, wd_city in candidates:
            if not wd_name or not _match_names(inst_name, wd_name):
                continue

            # City match (flexible - allows partial matches)
            city_match = bool(inst_city and wd_city) and (
                inst_city in wd_city or wd_city in inst_city
            )

            # Accept match if name matches and either city matches or
            # Wikidata has no city info
            if city_match or not wd_city:
                matches.append({
                    'institution': inst,
                    'wikidata': wd_museum,
                    'name_confidence': 'exact' if inst_name == wd_name else 'partial',
                    'city_match': city_match,
                })
                break  # Only take first match per institution

    return matches
def main():
    """Run the Wikidata enrichment lookup for Chilean museums.

    Steps: query Wikidata, print a sample of the results, load the local
    institutions YAML, report how many museums still lack a Wikidata
    identifier, match those against the Wikidata results, print every match
    and write the matches to a JSON file for later batch processing.

    Returns early, after printing a message, when Wikidata yields no
    results. Input and output paths are relative to the current working
    directory.
    """

    def city_of(inst, default):
        # `locations` may be missing, None or an empty list.
        locations = inst.get('locations') or [{}]
        return (locations[0] or {}).get('city', default)

    def has_wikidata(inst):
        return any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in (inst.get('identifiers') or [])
        )

    banner = "=" * 80

    print(banner)
    print("WIKIDATA SPARQL QUERY - CHILEAN MUSEUMS")
    print(banner)
    print()

    # Query Wikidata
    wikidata_museums = query_chilean_museums()
    if not wikidata_museums:
        print("❌ No results from Wikidata")
        return

    print(f"✅ Found {len(wikidata_museums)} museums in Wikidata")
    print()

    # Show sample
    print("Sample results (first 10):")
    for i, museum in enumerate(wikidata_museums[:10], 1):
        print(f" {i}. {museum['name']} ({museum['city']}) → {museum['q_number']}")
    print()

    # Load our institutions (an empty YAML file parses to None)
    input_file = Path('data/instances/chile/chilean_institutions_batch6_enriched.yaml')
    institutions = load_chilean_institutions(input_file) or []
    print(f"📖 Loaded {len(institutions)} Chilean institutions")

    museums = [inst for inst in institutions if inst.get('institution_type') == 'MUSEUM']
    museums_count = len(museums)
    print(f" {museums_count} are museums")

    with_wikidata = sum(1 for inst in museums if has_wikidata(inst))
    print(f" {with_wikidata} already have Wikidata")
    print(f" {museums_count - with_wikidata} need enrichment")
    print()

    # Find matches
    matches = find_matches(institutions, wikidata_museums)

    print(banner)
    print(f"MATCHING RESULTS: {len(matches)} potential matches found")
    print(banner)
    print()

    # Display matches
    for i, match in enumerate(matches, 1):
        inst = match['institution']
        wd = match['wikidata']
        print(f"{i}. {inst['name']}")
        print(f" Our city: {city_of(inst, 'Unknown')}")
        print(f" ↓ MATCH ({match['name_confidence']} name, city: {match['city_match']})")
        print(f" Wikidata: {wd['name']} ({wd['city']})")
        print(f" Q-number: {wd['q_number']}")
        if wd['founded']:
            print(f" Founded: {wd['founded']}")
        print()

    # Export matches to JSON for batch processing
    output_file = Path('data/instances/chile/wikidata_matches_batch7.json')
    match_data = [
        {
            'institution_name': match['institution']['name'],
            'institution_city': city_of(match['institution'], ''),
            'q_number': match['wikidata']['q_number'],
            'wikidata_name': match['wikidata']['name'],
            'wikidata_city': match['wikidata']['city'],
            'founded': match['wikidata']['founded'],
            'confidence': match['name_confidence'],
            'city_match': match['city_match'],
        }
        for match in matches
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented Spanish names readable in the file.
        json.dump(match_data, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(matches)} matches to: {output_file}")
    print()
    print("🎯 Next step: Review matches and create Batch 7 enrichment script")
# Run the workflow only when this file is executed as a script, so that
# importing the module (e.g. to reuse the matching helpers) has no side effects.
if __name__ == "__main__":
    main()