glam/scripts/query_wikidata_chilean_batch10.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

315 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Query Wikidata for Chilean GLAM Institutions - Batch 10 (Priority Targets)
Focus: Official institutions, research centers, and mixed/cultural centers
Target institutions:
- Servicio Nacional del Patrimonio Cultural (official)
- Fundación Buen Pastor (research)
- Fundación Iglesias Patrimoniales (research)
- Instituto Alemán Puerto Montt (mixed)
- Centro Cultural Sofia Hott (mixed)
- Centro de Interpretación Histórica (mixed)
"""
import json
import requests
import time
from pathlib import Path
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAMDataExtractor/1.0 (https://github.com/yourusername/glam; your@email.com)"
def query_wikidata(sparql_query: str) -> list:
    """Execute a SPARQL query against the Wikidata Query Service.

    Args:
        sparql_query: Full SPARQL query text.

    Returns:
        The ``results.bindings`` list from the SPARQL JSON response
        (one dict per result row); empty if the query matched nothing.

    Raises:
        requests.HTTPError: On a non-2xx response (via raise_for_status).
        requests.Timeout: If the endpoint does not answer within 60 s.
    """
    headers = {
        'User-Agent': USER_AGENT,  # WDQS policy requires an identifying UA
        'Accept': 'application/sparql-results+json'
    }
    params = {
        'query': sparql_query,
        'format': 'json'
    }
    # Fix: pass a timeout so a stalled/unreachable endpoint cannot hang
    # the whole batch script indefinitely (requests has no default timeout).
    response = requests.get(ENDPOINT, params=params, headers=headers, timeout=60)
    response.raise_for_status()
    data = response.json()
    return data['results']['bindings']
def extract_qid(uri: str) -> str:
    """Return the Q-number at the end of a Wikidata entity URI.

    E.g. ``http://www.wikidata.org/entity/Q298`` -> ``Q298``. A string
    with no slash is returned unchanged.
    """
    # Everything after the last '/' is the entity identifier; rpartition
    # yields the whole input as the tail when no separator is present.
    _, _, qid = uri.rpartition('/')
    return qid
def query_chilean_official_institutions():
    """Query Wikidata for Chilean government cultural/heritage agencies.

    Selects organizations whose country (P17) is Chile (Q298) and whose
    instance-of (P31) is one of four government-body classes, then keeps
    those whose mission statement (P2578) or Spanish label contains any
    of the keywords "cultura", "patrimonio", "museo", "archivo".
    Website (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?typeLabel ?websiteLabel ?viafID WHERE {
# Chilean government organizations related to culture/heritage
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Types: government agency, ministry, public service
VALUES ?type {
wd:Q327333 # government agency
wd:Q192350 # government organization
wd:Q2659904 # government institution
wd:Q294414 # public service
}
# Related to culture/heritage/museums/archives
{
?org wdt:P2578 ?mission .
FILTER(CONTAINS(LCASE(?mission), "cultura") ||
CONTAINS(LCASE(?mission), "patrimonio") ||
CONTAINS(LCASE(?mission), "museo") ||
CONTAINS(LCASE(?mission), "archivo"))
} UNION {
?org rdfs:label ?label .
FILTER(LANG(?label) = "es")
FILTER(CONTAINS(LCASE(?label), "cultura") ||
CONTAINS(LCASE(?label), "patrimonio") ||
CONTAINS(LCASE(?label), "museo") ||
CONTAINS(LCASE(?label), "archivo"))
}
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
}
LIMIT 20
"""
    return query_wikidata(query)
def query_chilean_foundations():
    """Query Wikidata for Chilean cultural foundations.

    Selects organizations that are a foundation (P31 / subclass-of*
    Q157031) in Chile (P17 = Q298), keeping those whose Spanish label
    contains "pastor" / "iglesia" / "patrimonial" / "cultura" (targeting
    Fundación Buen Pastor and Fundación Iglesias Patrimoniales) or whose
    mission (P2578) mentions "cultura" / "patrimonio".  Location (P131),
    website (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
# Chilean foundations
?org wdt:P31/wdt:P279* wd:Q157031 . # foundation
?org wdt:P17 wd:Q298 . # Country: Chile
# Related to culture/heritage
{
?org rdfs:label ?label .
FILTER(LANG(?label) = "es")
FILTER(CONTAINS(LCASE(?label), "pastor") ||
CONTAINS(LCASE(?label), "iglesia") ||
CONTAINS(LCASE(?label), "patrimonial") ||
CONTAINS(LCASE(?label), "cultura"))
} UNION {
?org wdt:P2578 ?mission .
FILTER(CONTAINS(LCASE(?mission), "cultura") ||
CONTAINS(LCASE(?mission), "patrimonio"))
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
}
LIMIT 20
"""
    return query_wikidata(query)
def query_chilean_cultural_centers():
    """Query Wikidata for Chilean cultural centers and similar institutions.

    Selects organizations in Chile (P17 = Q298) whose instance-of (P31)
    is cultural center (Q2334061) or educational institution (Q2095, to
    cover Instituto Alemán).  No keyword filter is applied here — the
    fuzzy matcher narrows results afterwards.  Location (P131), website
    (P856) and VIAF id (P214) are attached when present.

    Returns:
        Up to 30 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Types: cultural center, interpretation center
VALUES ?type {
wd:Q2334061 # cultural center
wd:Q2095 # educational institution (covers Instituto Alemán)
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
}
LIMIT 30
"""
    return query_wikidata(query)
def query_german_institutes_chile():
    """Query Wikidata specifically for German institutes in Chile.

    Selects educational/research organizations (P31 in Q2095, Q2385804,
    Q31855) in Chile (P17 = Q298) whose label contains a German marker
    ("alemán", "aleman", "german", "deutsch").  Location (P131), website
    (P856) and VIAF id (P214) are attached when present.

    NOTE(review): the label FILTER has no LANG restriction, so labels in
    any language are matched — presumably intentional to catch German
    names; confirm.

    Returns:
        Up to 20 raw SPARQL result bindings (see query_wikidata).
    """
    query = """
SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
?org wdt:P31 ?type .
?org wdt:P17 wd:Q298 . # Country: Chile
# Educational or cultural institution
VALUES ?type {
wd:Q2095 # educational institution
wd:Q2385804 # educational organization
wd:Q31855 # research institute
}
# German connection
{
?org rdfs:label ?label .
FILTER(CONTAINS(LCASE(?label), "alemán") ||
CONTAINS(LCASE(?label), "aleman") ||
CONTAINS(LCASE(?label), "german") ||
CONTAINS(LCASE(?label), "deutsch"))
}
OPTIONAL { ?org wdt:P131 ?location }
OPTIONAL { ?org wdt:P856 ?website }
OPTIONAL { ?org wdt:P214 ?viafID }
SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
}
LIMIT 20
"""
    return query_wikidata(query)
def fuzzy_match_institutions(wikidata_results: list, target_names: list,
                             threshold: int = 70) -> list:
    """Fuzzy match Wikidata results against target institution names.

    Each binding's label is compared to every target name with three
    rapidfuzz scorers (ratio, partial_ratio, token_sort_ratio); the best
    of the three becomes the match score.  Pairs scoring at or above
    ``threshold`` are kept.

    Args:
        wikidata_results: SPARQL result bindings as returned by
            query_wikidata (dicts with 'org', 'orgLabel', optional
            'locationLabel', 'websiteLabel', 'viafID').
        target_names: Institution names to look for.
        threshold: Minimum best score (0-100) to keep a pair.  Defaults
            to 70 — deliberately low, for discovery (previously a
            hard-coded constant; parameterized for reuse).

    Returns:
        Match dicts sorted by 'match_score' descending.
    """
    matches = []
    for result in wikidata_results:
        wd_name = result.get('orgLabel', {}).get('value', '')
        wd_qid = extract_qid(result['org']['value'])
        wd_location = result.get('locationLabel', {}).get('value', 'Unknown')
        wd_website = result.get('websiteLabel', {}).get('value', None)
        wd_viaf = result.get('viafID', {}).get('value', None)
        # Hoist the loop-invariant lowercasing out of the inner loop.
        wd_name_lower = wd_name.lower()
        # Match against targets
        for target in target_names:
            target_lower = target.lower()
            score = fuzz.ratio(target_lower, wd_name_lower)
            partial_score = fuzz.partial_ratio(target_lower, wd_name_lower)
            token_score = fuzz.token_sort_ratio(target_lower, wd_name_lower)
            max_score = max(score, partial_score, token_score)
            if max_score >= threshold:
                matches.append({
                    'target_name': target,
                    'wikidata_name': wd_name,
                    'q_number': wd_qid,
                    'location': wd_location,
                    'website': wd_website,
                    'viaf': wd_viaf,
                    'match_score': max_score,
                    'match_type': 'fuzzy',
                    'scores': {
                        'ratio': score,
                        'partial': partial_score,
                        'token': token_score
                    }
                })
    # Sort by score, best match first
    matches.sort(key=lambda x: x['match_score'], reverse=True)
    return matches
def main():
    """Run the four batch-10 Wikidata queries, match, save, and report.

    Queries official institutions, foundations, cultural centers and
    German institutes in Chile; fuzzy-matches the bindings against the
    batch-10 target names; writes combined matches to
    ``data/instances/chile/wikidata_matches_batch10_priority.json``;
    prints a summary of the top matches.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 WIKIDATA QUERY")
    print("Target: Official institutions, research centers, mixed institutions")
    print("=" * 80)
    print()
    # Target institutions from our dataset
    targets = {
        'official': ['Servicio Nacional del Patrimonio Cultural'],
        'research': ['Fundación Buen Pastor', 'Fundación Iglesias Patrimoniales'],
        'mixed': [
            'Instituto Alemán Puerto Montt',
            'Centro Cultural Sofia Hott',
            'Centro de Interpretación Histórica'
        ]
    }
    all_results = []
    # (progress label, query function, target names, query_type tag).
    # Running the four queries through one loop replaces four copy-pasted
    # try/except stanzas; output strings are unchanged.
    query_plan = [
        ('official institutions', query_chilean_official_institutions,
         targets['official'], 'official'),
        ('foundations', query_chilean_foundations,
         targets['research'], 'foundation'),
        ('cultural centers', query_chilean_cultural_centers,
         targets['mixed'], 'cultural_center'),
        ('German institutes', query_german_institutes_chile,
         ['Instituto Alemán Puerto Montt'], 'german_institute'),
    ]
    for label, query_fn, target_list, tag in query_plan:
        print(f"🔍 Querying {label}...")
        try:
            results = query_fn()
            print(f" Found {len(results)} {label}")
            matches = fuzzy_match_institutions(results, target_list)
            all_results.extend([{**m, 'query_type': tag} for m in matches])
            time.sleep(2)  # Rate limiting: be polite to the WDQS endpoint
        except Exception as e:
            # Best-effort: report the failure and continue with the
            # remaining queries rather than aborting the whole batch.
            print(f" ❌ Error: {e}")
        print()
    # Save results
    output_file = Path('data/instances/chile/wikidata_matches_batch10_priority.json')
    # Fix: create the output directory if missing, so the script does not
    # crash with FileNotFoundError on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving results to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print()
    # Summary
    print("=" * 80)
    print("QUERY SUMMARY")
    print("=" * 80)
    print()
    print(f"Total potential matches: {len(all_results)}")
    if all_results:
        print()
        print("Top matches by score:")
        for i, match in enumerate(all_results[:10], 1):
            print(f"{i:2d}. {match['target_name']}")
            print(f"{match['wikidata_name']} ({match['q_number']})")
            print(f" Score: {match['match_score']:.1f}% | Type: {match['query_type']}")
            print(f" Location: {match['location']}")
            if match.get('website'):
                print(f" Website: {match['website']}")
            print()
    else:
        print("⚠️ No matches found above threshold (70%)")
    print("🎯 Next step: Review matches and create enrich_chilean_batch10.py")
# Run the batch query pipeline only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()