- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
315 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Query Wikidata for Chilean GLAM Institutions - Batch 10 (Priority Targets)
|
|
Focus: Official institutions, research centers, and mixed/cultural centers
|
|
|
|
Target institutions:
|
|
- Servicio Nacional del Patrimonio Cultural (official)
|
|
- Fundación Buen Pastor (research)
|
|
- Fundación Iglesias Patrimoniales (research)
|
|
- Instituto Alemán Puerto Montt (mixed)
|
|
- Centro Cultural Sofia Hott (mixed)
|
|
- Centro de Interpretación Histórica (mixed)
|
|
"""
|
|
|
|
import json
|
|
import requests
|
|
import time
|
|
from pathlib import Path
|
|
from rapidfuzz import fuzz
|
|
|
|
# Wikidata SPARQL endpoint (WDQS)
ENDPOINT = "https://query.wikidata.org/sparql"
# WDQS policy requires a descriptive User-Agent with contact details.
# NOTE(review): this is a placeholder — replace with the real repository URL
# and maintainer e-mail before running against the live endpoint.
USER_AGENT = "GLAMDataExtractor/1.0 (https://github.com/yourusername/glam; your@email.com)"
|
|
|
|
def query_wikidata(sparql_query: str, timeout: float = 60.0) -> list:
    """Execute a SPARQL query against the Wikidata endpoint.

    Args:
        sparql_query: SPARQL query text to send to WDQS.
        timeout: Seconds to wait for the HTTP response. Without an explicit
            timeout, ``requests`` would block forever on a stalled connection.

    Returns:
        list: The ``results.bindings`` list from the SPARQL JSON response.

    Raises:
        requests.HTTPError: If the endpoint returns a non-2xx status.
        requests.Timeout: If the endpoint does not respond within *timeout*.
    """
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'application/sparql-results+json'
    }

    params = {
        'query': sparql_query,
        'format': 'json'
    }

    # Fix: original call had no timeout, which can hang the whole batch run
    # indefinitely if WDQS stalls.
    response = requests.get(ENDPOINT, params=params, headers=headers,
                            timeout=timeout)
    response.raise_for_status()

    data = response.json()
    return data['results']['bindings']
|
|
|
|
def extract_qid(uri: str) -> str:
    """Return the Q-number, i.e. the final path segment, of a Wikidata URI."""
    # rpartition never raises: when no '/' is present, the whole input
    # lands in the third element, matching split('/')[-1] semantics.
    return uri.rpartition('/')[2]
|
|
|
|
def query_chilean_official_institutions():
    """Query Wikidata for Chilean government cultural/heritage agencies.

    Selects organizations in Chile (P17 = Q298) whose instance-of (P31) is
    one of several government-organization classes, and whose mission
    statement (P2578) or Spanish label contains culture/heritage keywords
    ("cultura", "patrimonio", "museo", "archivo").

    Returns:
        list: Raw SPARQL result bindings (see query_wikidata), capped at 20.
    """
    # The string below is sent verbatim to WDQS; SPARQL comments inside it
    # document the QIDs and must stay with the query.
    query = """
    SELECT DISTINCT ?org ?orgLabel ?typeLabel ?websiteLabel ?viafID WHERE {
      # Chilean government organizations related to culture/heritage
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Types: government agency, ministry, public service
      VALUES ?type {
        wd:Q327333   # government agency
        wd:Q192350   # government organization
        wd:Q2659904  # government institution
        wd:Q294414   # public service
      }

      # Related to culture/heritage/museums/archives
      {
        ?org wdt:P2578 ?mission .
        FILTER(CONTAINS(LCASE(?mission), "cultura") ||
               CONTAINS(LCASE(?mission), "patrimonio") ||
               CONTAINS(LCASE(?mission), "museo") ||
               CONTAINS(LCASE(?mission), "archivo"))
      } UNION {
        ?org rdfs:label ?label .
        FILTER(LANG(?label) = "es")
        FILTER(CONTAINS(LCASE(?label), "cultura") ||
               CONTAINS(LCASE(?label), "patrimonio") ||
               CONTAINS(LCASE(?label), "museo") ||
               CONTAINS(LCASE(?label), "archivo"))
      }

      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
    }
    LIMIT 20
    """
    return query_wikidata(query)
|
|
|
|
def query_chilean_foundations():
    """Query Wikidata for Chilean cultural foundations.

    Selects foundations (P31/P279* = Q157031) in Chile whose Spanish label
    matches keywords targeting the batch-10 institutions ("pastor",
    "iglesia", "patrimonial", "cultura") or whose mission statement (P2578)
    mentions culture/heritage.

    Returns:
        list: Raw SPARQL result bindings (see query_wikidata), capped at 20.
    """
    # Sent verbatim to WDQS; keep the inline SPARQL comments.
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      # Chilean foundations
      ?org wdt:P31/wdt:P279* wd:Q157031 .  # foundation
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Related to culture/heritage
      {
        ?org rdfs:label ?label .
        FILTER(LANG(?label) = "es")
        FILTER(CONTAINS(LCASE(?label), "pastor") ||
               CONTAINS(LCASE(?label), "iglesia") ||
               CONTAINS(LCASE(?label), "patrimonial") ||
               CONTAINS(LCASE(?label), "cultura"))
      } UNION {
        ?org wdt:P2578 ?mission .
        FILTER(CONTAINS(LCASE(?mission), "cultura") ||
               CONTAINS(LCASE(?mission), "patrimonio"))
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en". }
    }
    LIMIT 20
    """
    return query_wikidata(query)
|
|
|
|
def query_chilean_cultural_centers():
    """Query Wikidata for Chilean cultural centers and interpretation centers.

    Selects organizations in Chile whose instance-of (P31) is cultural
    center (Q2334061) or educational institution (Q2095 — included so
    Instituto Alemán Puerto Montt can surface here too). No keyword filter
    is applied, so the fuzzy matcher downstream does the narrowing.

    Returns:
        list: Raw SPARQL result bindings (see query_wikidata), capped at 30.
    """
    # Sent verbatim to WDQS; keep the inline SPARQL comments.
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Types: cultural center, interpretation center
      VALUES ?type {
        wd:Q2334061  # cultural center
        wd:Q2095     # educational institution (covers Instituto Alemán)
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
    }
    LIMIT 30
    """
    return query_wikidata(query)
|
|
|
|
def query_german_institutes_chile():
    """Query Wikidata specifically for German institutes in Chile.

    Selects educational/research organizations in Chile whose label (any
    language) contains a German-connection keyword ("alemán", "aleman",
    "german", "deutsch"). Complements query_chilean_cultural_centers for
    the Instituto Alemán Puerto Montt target.

    Returns:
        list: Raw SPARQL result bindings (see query_wikidata), capped at 20.
    """
    # Sent verbatim to WDQS; keep the inline SPARQL comments.
    query = """
    SELECT DISTINCT ?org ?orgLabel ?locationLabel ?websiteLabel ?viafID WHERE {
      ?org wdt:P31 ?type .
      ?org wdt:P17 wd:Q298 .  # Country: Chile

      # Educational or cultural institution
      VALUES ?type {
        wd:Q2095     # educational institution
        wd:Q2385804  # educational organization
        wd:Q31855    # research institute
      }

      # German connection
      {
        ?org rdfs:label ?label .
        FILTER(CONTAINS(LCASE(?label), "alemán") ||
               CONTAINS(LCASE(?label), "aleman") ||
               CONTAINS(LCASE(?label), "german") ||
               CONTAINS(LCASE(?label), "deutsch"))
      }

      OPTIONAL { ?org wdt:P131 ?location }
      OPTIONAL { ?org wdt:P856 ?website }
      OPTIONAL { ?org wdt:P214 ?viafID }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,de". }
    }
    LIMIT 20
    """
    return query_wikidata(query)
|
|
|
|
def fuzzy_match_institutions(wikidata_results: list, target_names: list,
                             threshold: int = 70) -> list:
    """Fuzzy match Wikidata results against target institution names.

    Every result row is compared to every target with three rapidfuzz
    scorers (plain ratio, partial ratio, token-sort ratio); the best of
    the three decides whether the pair is reported.

    Args:
        wikidata_results: SPARQL bindings as returned by query_wikidata().
        target_names: Institution names from our dataset to match against.
        threshold: Minimum best-of-three score (0-100) for a pair to be
            kept. Defaults to 70 — deliberately low, for discovery.
            (Generalized from a previously hard-coded constant.)

    Returns:
        list: Match dicts sorted by descending 'match_score'.
    """
    matches = []

    for result in wikidata_results:
        wd_name = result.get('orgLabel', {}).get('value', '')
        wd_qid = extract_qid(result['org']['value'])
        wd_location = result.get('locationLabel', {}).get('value', 'Unknown')
        wd_website = result.get('websiteLabel', {}).get('value', None)
        wd_viaf = result.get('viafID', {}).get('value', None)

        # Lowercase the Wikidata label once per row instead of once per
        # (row, target) comparison.
        wd_name_lower = wd_name.lower()

        # Match against targets
        for target in target_names:
            target_lower = target.lower()
            score = fuzz.ratio(target_lower, wd_name_lower)
            partial_score = fuzz.partial_ratio(target_lower, wd_name_lower)
            token_score = fuzz.token_sort_ratio(target_lower, wd_name_lower)

            max_score = max(score, partial_score, token_score)

            if max_score >= threshold:
                matches.append({
                    'target_name': target,
                    'wikidata_name': wd_name,
                    'q_number': wd_qid,
                    'location': wd_location,
                    'website': wd_website,
                    'viaf': wd_viaf,
                    'match_score': max_score,
                    'match_type': 'fuzzy',
                    'scores': {
                        'ratio': score,
                        'partial': partial_score,
                        'token': token_score
                    }
                })

    # Best matches first — main() prints the top 10 in this order.
    matches.sort(key=lambda x: x['match_score'], reverse=True)
    return matches
|
|
|
|
def _run_query_batch(description: str, query_fn, targets: list,
                     query_type: str, all_results: list) -> None:
    """Run one Wikidata query, fuzzy-match its rows, and collect matches.

    Prints progress and errors to stdout; appends matches (tagged with
    'query_type') to all_results in place. Errors are reported but do not
    abort the batch — the remaining queries still run. Sleeps 2 s after a
    successful query for WDQS rate limiting.
    """
    print(f"🔍 Querying {description}...")
    try:
        results = query_fn()
        print(f"   Found {len(results)} {description}")
        matches = fuzzy_match_institutions(results, targets)
        all_results.extend([{**m, 'query_type': query_type} for m in matches])
        time.sleep(2)  # Rate limiting
    except Exception as e:
        print(f"   ❌ Error: {e}")
    print()


def main():
    """Run all batch-10 queries, match against targets, and save JSON results."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 10 WIKIDATA QUERY")
    print("Target: Official institutions, research centers, mixed institutions")
    print("=" * 80)
    print()

    # Target institutions from our dataset
    targets = {
        'official': ['Servicio Nacional del Patrimonio Cultural'],
        'research': ['Fundación Buen Pastor', 'Fundación Iglesias Patrimoniales'],
        'mixed': [
            'Instituto Alemán Puerto Montt',
            'Centro Cultural Sofia Hott',
            'Centro de Interpretación Histórica'
        ]
    }

    all_results = []

    # The four query stanzas were copy-pasted; factored into one helper.
    _run_query_batch('official institutions',
                     query_chilean_official_institutions,
                     targets['official'], 'official', all_results)
    _run_query_batch('foundations',
                     query_chilean_foundations,
                     targets['research'], 'foundation', all_results)
    _run_query_batch('cultural centers',
                     query_chilean_cultural_centers,
                     targets['mixed'], 'cultural_center', all_results)
    _run_query_batch('German institutes',
                     query_german_institutes_chile,
                     ['Instituto Alemán Puerto Montt'], 'german_institute',
                     all_results)

    # Save results. Fix: create the parent directory first — open() does not
    # create intermediate directories, so a fresh checkout would crash here.
    output_file = Path('data/instances/chile/wikidata_matches_batch10_priority.json')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving results to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print()

    # Summary
    print("=" * 80)
    print("QUERY SUMMARY")
    print("=" * 80)
    print()
    print(f"Total potential matches: {len(all_results)}")

    if all_results:
        print()
        print("Top matches by score:")
        for i, match in enumerate(all_results[:10], 1):
            print(f"{i:2d}. {match['target_name']}")
            print(f"    → {match['wikidata_name']} ({match['q_number']})")
            print(f"    Score: {match['match_score']:.1f}% | Type: {match['query_type']}")
            print(f"    Location: {match['location']}")
            if match.get('website'):
                print(f"    Website: {match['website']}")
            print()
    else:
        print("⚠️ No matches found above threshold (70%)")

    print("🎯 Next step: Review matches and create enrich_chilean_batch10.py")
|
|
|
|
# Script entry point: run the batch only when executed directly, so the
# query helpers stay importable without side effects.
if __name__ == '__main__':
    main()
|