- Introduced `test_nlp_extractor.py` with unit tests for the `InstitutionExtractor`, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
192 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Batch 13: Manual Wikidata Search for High-Priority Institutions
|
|
Target: 3 specific institutions to reach 70% coverage (63/90)
|
|
|
|
Focus institutions:
|
|
1. Museo de las Iglesias (Castro, Chiloé) - UNESCO connection
|
|
2. Museo del Libro del Mar (San Antonio) - Unique maritime museum
|
|
3. Archivo General de Asuntos Indígenas (CONADI, Temuco) - Government archive
|
|
|
|
Strategy: Exact name matching, no fuzzy matching, manual verification required.
|
|
"""
|
|
|
|
import json
from pathlib import Path
from typing import Dict, List, Optional, Any

from SPARQLWrapper import SPARQLWrapper, JSON
|
|
|
|
def query_wikidata_exact(institution_name: str, location_city: str, region: str) -> List[Dict]:
    """
    Query Wikidata for an institution using several name-based strategies.

    Strategies, tried in order (all results are accumulated):
      1. Exact Spanish-label match on ``institution_name``.
      2. Case-insensitive substring label match, restricted to Chile (wd:Q298).
      3. Broad scan of all Chilean museums/archives (type inferred from the
         name), intended for manual narrowing afterwards.

    Args:
        institution_name: Name used for exact and substring label matching.
        location_city: City of the institution. Currently unused; kept for
            interface compatibility and manual-review context.
        region: Region of the institution. Currently unused, as above.

    Returns:
        A list of SPARQL result bindings — possibly empty when nothing matched
        or every query failed. Results from the different strategies are
        concatenated and may contain duplicates.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # Escape backslashes and double quotes so a name containing '"' cannot
    # break out of the SPARQL string literal (query injection / syntax error).
    safe_name = institution_name.replace('\\', '\\\\').replace('"', '\\"')

    # Try multiple query strategies
    queries = []

    # Strategy 1: Exact Spanish name
    queries.append(f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label "{safe_name}"@es .
      ?item wdt:P31 ?instanceOf .
      OPTIONAL {{ ?item wdt:P131 ?location }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """)

    # Strategy 2: Contains name search with location filter
    queries.append(f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}")))
      ?item wdt:P31 ?instanceOf .
      ?item wdt:P17 wd:Q298 . # Country: Chile
      OPTIONAL {{ ?item wdt:P131 ?location }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """)

    # Strategy 3: Search by location and type.  Note this query does NOT use
    # the institution name — it lists Chilean institutions of the inferred
    # type for manual review.
    if "Museo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q33506 ." # Museum
    elif "Archivo" in institution_name:
        instance_filter = "?item wdt:P31/wdt:P279* wd:Q166118 ." # Archive
    else:
        instance_filter = ""

    if instance_filter:
        queries.append(f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
          {instance_filter}
          ?item rdfs:label ?label .
          ?item wdt:P17 wd:Q298 . # Country: Chile
          OPTIONAL {{ ?item wdt:P131 ?location }}
          ?item wdt:P31 ?instanceOf .
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
        }}
        LIMIT 50
        """)

    all_results = []

    for i, query in enumerate(queries, 1):
        print(f" Trying query strategy {i}...")
        sparql.setQuery(query)
        try:
            results: Any = sparql.query().convert()
            if isinstance(results, dict):
                bindings = results.get('results', {}).get('bindings', [])
                if bindings:
                    print(f" Found {len(bindings)} results")
                    all_results.extend(bindings)
        except Exception as e:
            # Best effort: one failing strategy must not abort the others.
            print(f" Query failed: {e}")

    return all_results
|
|
|
|
def main():
    """Search Wikidata for 3 high-priority Chilean institutions.

    Prints a progress report for each institution/search term and writes all
    candidate matches to ``scripts/batch13_manual_search_results.json`` for
    manual verification (the output directory is created if missing).
    """

    # Hand-curated targets: name variants to try, plus context for the
    # human reviewer who validates the candidate Q-numbers afterwards.
    target_institutions = [
        {
            "name": "Museo de las Iglesias",
            "full_name": "Museo de las Iglesias de Chiloé",
            "city": "Castro",
            "region": "Chiloé",
            "rationale": "Connected to UNESCO World Heritage Site (Churches of Chiloé)",
            "search_terms": ["Museo de las Iglesias", "Museo Iglesias Chiloé", "Chiloé Churches Museum"]
        },
        {
            "name": "Museo del Libro del Mar",
            "full_name": "Museo del Libro del Mar",
            "city": "San Antonio",
            "region": "San Antonio",
            "rationale": "Unique maritime book museum, specific subject focus",
            "search_terms": ["Museo del Libro del Mar", "Museo Libro Mar San Antonio"]
        },
        {
            "name": "Archivo General de Asuntos Indígenas (CONADI)",
            "full_name": "Archivo General de Asuntos Indígenas",
            "city": "Temuco",
            "region": "Cautín",
            "rationale": "National government archive for indigenous affairs",
            "search_terms": ["Archivo General de Asuntos Indígenas", "CONADI", "Corporación Nacional de Desarrollo Indígena"]
        }
    ]

    all_search_results = {}

    print("=" * 80)
    print("Chilean Batch 13: Manual Wikidata Search")
    print("Target: 3 institutions to reach 70% coverage (63/90)")
    print("=" * 80)
    print()

    for institution in target_institutions:
        print(f"\n{'=' * 80}")
        print(f"Institution: {institution['name']}")
        print(f"Location: {institution['city']}, {institution['region']}")
        print(f"Rationale: {institution['rationale']}")
        print(f"{'=' * 80}")

        institution_results = {
            "metadata": institution,
            "wikidata_results": []
        }

        # Try each search term
        for search_term in institution['search_terms']:
            print(f"\nSearching for: '{search_term}'")
            results = query_wikidata_exact(search_term, institution['city'], institution['region'])

            if results:
                print(f"Found {len(results)} potential matches")
                for result in results[:5]:  # Show top 5
                    # Q-number is the last path segment of the entity URI.
                    item_id = result['item']['value'].split('/')[-1]
                    item_label = result.get('itemLabel', {}).get('value', 'No label')
                    item_desc = result.get('itemDescription', {}).get('value', 'No description')
                    location = result.get('locationLabel', {}).get('value', 'No location')
                    instance = result.get('instanceOfLabel', {}).get('value', 'No type')

                    print(f" {item_id}: {item_label}")
                    print(f" Description: {item_desc}")
                    print(f" Location: {location}")
                    print(f" Type: {instance}")
                    print()

                    institution_results['wikidata_results'].append({
                        'q_number': item_id,
                        'label': item_label,
                        'description': item_desc,
                        'location': location,
                        'instance_of': instance,
                        'search_term_used': search_term
                    })
            else:
                print(" No results found")

        all_search_results[institution['name']] = institution_results

    # Save results to JSON.  Create the output directory first so the script
    # also works from a fresh checkout where scripts/ does not yet exist.
    output_file = 'scripts/batch13_manual_search_results.json'
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_search_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'=' * 80}")
    print(f"Results saved to: {output_file}")
    print("=" * 80)
    print()
    print("NEXT STEPS:")
    print("1. Review search results manually")
    print("2. Verify Q-numbers correspond to correct institutions")
    print("3. Create batch13 enrichment script for validated matches")
    print("4. Apply enrichment to reach 70% coverage target")
    print()
|
|
|
|
# Script entry point: run the manual Wikidata search when executed directly.
if __name__ == '__main__':
    main()
|