glam/scripts/manual_wikidata_search_batch13.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

192 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Chilean Batch 13: Manual Wikidata Search for High-Priority Institutions
Target: 3 specific institutions to reach 70% coverage (63/90)
Focus institutions:
1. Museo de las Iglesias (Castro, Chiloé) - UNESCO connection
2. Museo del Libro del Mar (San Antonio) - Unique maritime museum
3. Archivo General de Asuntos Indígenas (CONADI, Temuco) - Government archive
Strategy: Exact name matching, no fuzzy matching, manual verification required.
"""
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

from SPARQLWrapper import SPARQLWrapper, JSON
def query_wikidata_exact(institution_name: str, location_city: str, region: str) -> Optional[List[Dict]]:
    """
    Query Wikidata for an institution using several name-matching strategies.

    Strategies (all are tried; bindings are accumulated across them):
      1. Exact Spanish rdfs:label match.
      2. Case-insensitive substring match on the label, restricted to Chile.
      3. Museum/archive type scan (P31/P279*) restricted to Chile, filtered
         by the institution name.

    Args:
        institution_name: Name to search for. Embedded backslashes and double
            quotes are escaped before interpolation into the SPARQL queries.
        location_city: City of the institution (informational only; not yet
            used in the queries, kept for interface stability).
        region: Region of the institution (informational only; not yet used).

    Returns:
        List of SPARQL result bindings (possibly empty). Despite the Optional
        annotation, "no match" is signalled by an empty list, never by None,
        so callers can rely on truthiness checks.
    """
    endpoint = "https://query.wikidata.org/sparql"
    # The Wikidata Query Service asks clients to send a descriptive
    # User-Agent; anonymous clients may be throttled or blocked.
    sparql = SPARQLWrapper(endpoint, agent="glam-batch13-manual-search/1.0")
    sparql.setReturnFormat(JSON)

    # Basic injection hardening: escape characters that would otherwise
    # terminate the SPARQL string literal the name is embedded in.
    safe_name = institution_name.replace("\\", "\\\\").replace('"', '\\"')

    queries = []

    # Strategy 1: exact Spanish name.
    queries.append(f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label "{safe_name}"@es .
      ?item wdt:P31 ?instanceOf .
      OPTIONAL {{ ?item wdt:P131 ?location }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """)

    # Strategy 2: case-insensitive substring match, restricted to Chile.
    queries.append(f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}")))
      ?item wdt:P31 ?instanceOf .
      ?item wdt:P17 wd:Q298 .  # Country: Chile
      OPTIONAL {{ ?item wdt:P131 ?location }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """)

    # Strategy 3: type-restricted scan (museum or archive subclasses).
    if "Museo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q33506 ."  # Museum
    elif "Archivo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q166118 ."  # Archive
    else:
        instance_filter = ""
    if instance_filter:
        # Bug fix: this scan previously carried no name filter at all and
        # simply returned up to 50 arbitrary Chilean museums/archives.
        queries.append(f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      {instance_filter}
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}")))
      ?item wdt:P17 wd:Q298 .  # Country: Chile
      OPTIONAL {{ ?item wdt:P131 ?location }}
      ?item wdt:P31 ?instanceOf .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 50
    """)

    all_results: List[Dict] = []
    for i, query in enumerate(queries, 1):
        print(f" Trying query strategy {i}...")
        sparql.setQuery(query)
        try:
            results: Any = sparql.query().convert()
        except Exception as e:
            # Best-effort search: log the failure and move on to the next
            # strategy instead of aborting the whole lookup.
            print(f" Query failed: {e}")
            continue
        if isinstance(results, dict):
            bindings = results.get('results', {}).get('bindings', [])
            if bindings:
                print(f" Found {len(bindings)} results")
                all_results.extend(bindings)
    return all_results
def main():
    """Search Wikidata for three high-priority Chilean institutions and dump
    the raw candidate matches to JSON for manual review.

    Side effects:
        * Prints progress and candidate matches to stdout.
        * Writes scripts/batch13_manual_search_results.json, creating the
          output directory if it does not exist.
    """
    target_institutions = [
        {
            "name": "Museo de las Iglesias",
            "full_name": "Museo de las Iglesias de Chiloé",
            "city": "Castro",
            "region": "Chiloé",
            "rationale": "Connected to UNESCO World Heritage Site (Churches of Chiloé)",
            "search_terms": ["Museo de las Iglesias", "Museo Iglesias Chiloé", "Chiloé Churches Museum"]
        },
        {
            "name": "Museo del Libro del Mar",
            "full_name": "Museo del Libro del Mar",
            "city": "San Antonio",
            "region": "San Antonio",
            "rationale": "Unique maritime book museum, specific subject focus",
            "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar San Antonio"]
        },
        {
            "name": "Archivo General de Asuntos Indígenas (CONADI)",
            "full_name": "Archivo General de Asuntos Indígenas",
            "city": "Temuco",
            "region": "Cautín",
            "rationale": "National government archive for indigenous affairs",
            "search_terms": ["Archivo General de Asuntos Indígenas", "CONADI", "Corporación Nacional de Desarrollo Indígena"]
        }
    ]

    all_search_results = {}
    print("=" * 80)
    print("Chilean Batch 13: Manual Wikidata Search")
    print("Target: 3 institutions to reach 70% coverage (63/90)")
    print("=" * 80)
    print()

    for institution in target_institutions:
        print(f"\n{'=' * 80}")
        print(f"Institution: {institution['name']}")
        print(f"Location: {institution['city']}, {institution['region']}")
        print(f"Rationale: {institution['rationale']}")
        print(f"{'=' * 80}")
        institution_results = {
            "metadata": institution,
            "wikidata_results": []
        }
        # Bug fix: the search strategies overlap heavily, so the same entity
        # is often returned for several search terms — record each Q-number
        # only once per institution.
        seen_q_numbers = set()

        # Try each search term
        for search_term in institution['search_terms']:
            print(f"\nSearching for: '{search_term}'")
            results = query_wikidata_exact(search_term, institution['city'], institution['region'])
            if results:
                print(f"Found {len(results)} potential matches")
                for result in results[:5]:  # Show top 5
                    item_id = result['item']['value'].split('/')[-1]
                    item_label = result.get('itemLabel', {}).get('value', 'No label')
                    item_desc = result.get('itemDescription', {}).get('value', 'No description')
                    location = result.get('locationLabel', {}).get('value', 'No location')
                    instance = result.get('instanceOfLabel', {}).get('value', 'No type')
                    print(f" {item_id}: {item_label}")
                    print(f" Description: {item_desc}")
                    print(f" Location: {location}")
                    print(f" Type: {instance}")
                    print()
                    if item_id in seen_q_numbers:
                        continue  # already recorded under an earlier term
                    seen_q_numbers.add(item_id)
                    institution_results['wikidata_results'].append({
                        'q_number': item_id,
                        'label': item_label,
                        'description': item_desc,
                        'location': location,
                        'instance_of': instance,
                        'search_term_used': search_term
                    })
            else:
                print(" No results found")
        all_search_results[institution['name']] = institution_results

    # Save results to JSON; create the output directory first so the script
    # also works from a fresh checkout (previously raised FileNotFoundError
    # when scripts/ was missing).
    output_file = Path('scripts/batch13_manual_search_results.json')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open('w', encoding='utf-8') as f:
        json.dump(all_search_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'=' * 80}")
    print(f"Results saved to: {output_file}")
    print("=" * 80)
    print()
    print("NEXT STEPS:")
    print("1. Review search results manually")
    print("2. Verify Q-numbers correspond to correct institutions")
    print("3. Create batch13 enrichment script for validated matches")
    print("4. Apply enrichment to reach 70% coverage target")
    print()


if __name__ == '__main__':
    main()