- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
211 lines
7.8 KiB
Python
Executable file
211 lines
7.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Batch 14: Targeted Wikidata Search for Final 2 Matches
|
|
Strategy: Focus on institutions with distinctive names or known entities
|
|
|
|
Target institutions (prioritized by likelihood of Wikidata presence):
|
|
1. Museo Rodulfo Philippi / Rudolph Philippi - Named after famous German scientist
|
|
2. Fundación Iglesias Patrimoniales - Heritage foundation for Chiloé churches (UNESCO)
|
|
3. Instituto Alemán Puerto Montt - German school with heritage collections
|
|
4. Centro Cultural Sofia Hott - Named after specific person
|
|
"""
|
|
|
|
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

from SPARQLWrapper import SPARQLWrapper, JSON
|
|
|
|
def query_wikidata_person_institution(person_name: str, location: str) -> Optional[List[Dict]]:
    """
    Query Wikidata for Chilean museums/cultural institutions named after a person.

    Args:
        person_name: Name (or name fragment) matched case-insensitively as a
            substring of the institution's rdfs:label.
        location: Expected city/region. NOTE(review): currently NOT used in
            the SPARQL query; kept for interface compatibility with callers.
            Results must be filtered by location downstream.

    Returns:
        A non-empty list of SPARQL result bindings, or None when nothing was
        found or the query failed.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # Escape backslashes and double quotes so the interpolated name cannot
    # break out of the quoted SPARQL string literal (e.g. names like
    # "O'Higgins" are fine, but a stray `"` previously produced a malformed
    # query).
    safe_name = person_name.replace('\\', '\\\\').replace('"', '\\"')

    # Search for museums/institutions named after the person.
    # wd:Q298 = Chile; P31/P279* matches items that are instances of
    # (subclasses of) the two institution classes below.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      {{
        # Museums in Chile
        ?item wdt:P31/wdt:P279* wd:Q33506 .
        ?item wdt:P17 wd:Q298 .
      }} UNION {{
        # Cultural institutions in Chile
        ?item wdt:P31/wdt:P279* wd:Q7075 .
        ?item wdt:P17 wd:Q298 .
      }}

      # Name contains person's name
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}")))

      OPTIONAL {{ ?item wdt:P131 ?location }}
      ?item wdt:P31 ?instanceOf .

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en,de" }}
    }}
    LIMIT 20
    """

    print(f"  Searching for institutions named after '{person_name}'...")
    sparql.setQuery(query)

    try:
        results: Any = sparql.query().convert()
        if isinstance(results, dict):
            bindings = results.get('results', {}).get('bindings', [])
            return bindings if bindings else None
    except Exception as e:
        # Best-effort: report and fall through to None so the caller can
        # continue with its remaining search terms.
        print(f"  Query failed: {e}")

    return None
|
|
|
|
def query_wikidata_exact_name(institution_name: str, city: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query Wikidata for a Chilean institution by exact Spanish label.

    Args:
        institution_name: Exact rdfs:label to match (language tag @es).
        city: Optional city name; when given, results are additionally
            filtered by a case-insensitive substring match on the
            administrative-location label.

    Returns:
        A non-empty list of SPARQL result bindings, or None when nothing was
        found or the query failed.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # Escape backslashes and double quotes so interpolated values cannot break
    # out of the quoted SPARQL string literals.
    safe_name = institution_name.replace('\\', '\\\\').replace('"', '\\"')

    location_filter = ""
    if city:
        safe_city = city.replace('\\', '\\\\').replace('"', '\\"')
        # NOTE(review): this filters on ?locationLabel, a variable bound by
        # the label service — verify WDQS resolves it before the FILTER runs.
        location_filter = f'FILTER(CONTAINS(LCASE(?locationLabel), LCASE("{safe_city}")))'

    # wd:Q298 = Chile.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label "{safe_name}"@es .
      ?item wdt:P31 ?instanceOf .
      ?item wdt:P17 wd:Q298 .
      OPTIONAL {{ ?item wdt:P131 ?location }}
      {location_filter}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """

    print(f"  Searching for exact name: '{institution_name}'...")
    sparql.setQuery(query)

    try:
        results: Any = sparql.query().convert()
        if isinstance(results, dict):
            bindings = results.get('results', {}).get('bindings', [])
            return bindings if bindings else None
    except Exception as e:
        # Best-effort: report and fall through to None so the caller can
        # continue with its remaining search terms.
        print(f"  Query failed: {e}")

    return None
|
|
|
|
def _search_one(strategy: Dict) -> Dict:
    """Run every search term for one institution and collect Wikidata hits."""
    print(f"\n{'=' * 80}")
    print(f"Institution: {strategy['name']}")
    print(f"Location: {strategy['city']}")
    print(f"Strategy: {strategy['search_type']}")
    print(f"Rationale: {strategy['rationale']}")
    print(f"{'=' * 80}")

    institution_results = {
        "metadata": strategy,
        "wikidata_results": []
    }

    for search_term in strategy['search_terms']:
        if strategy['search_type'] == 'person_named':
            results = query_wikidata_person_institution(search_term, strategy['city'])
        else:
            results = query_wikidata_exact_name(search_term, strategy['city'])

        if not results:
            print("  No results found")
            continue

        print(f"  Found {len(results)} results")
        # Only the top 5 hits per search term are printed and recorded, to
        # keep the review list manageable.
        for result in results[:5]:
            item_id = result['item']['value'].split('/')[-1]
            item_label = result.get('itemLabel', {}).get('value', 'No label')
            item_desc = result.get('itemDescription', {}).get('value', 'No description')
            location = result.get('locationLabel', {}).get('value', 'No location')
            instance = result.get('instanceOfLabel', {}).get('value', 'No type')

            print(f"  {item_id}: {item_label}")
            print(f"    Desc: {item_desc}")
            print(f"    Location: {location}")
            print(f"    Type: {instance}")

            institution_results['wikidata_results'].append({
                'q_number': item_id,
                'label': item_label,
                'description': item_desc,
                'location': location,
                'instance_of': instance,
                'search_term_used': search_term
            })

    return institution_results


def main():
    """Search for final 2 matches to reach 70% coverage."""
    # Target institutions, prioritized by likelihood of Wikidata presence.
    search_strategies = [
        {
            "name": "Museo Rodulfo Philippi",
            "city": "Chañaral",
            "search_type": "person_named",
            "search_terms": ["Philippi", "Rodulfo Philippi", "Rudolf Philippi"],
            "rationale": "Named after Rodolfo Amando Philippi, famous German-Chilean naturalist"
        },
        {
            "name": "Museo Rudolph Philippi",
            "city": "Valdivia",
            "search_type": "person_named",
            "search_terms": ["Philippi", "Rudolph Philippi", "Rudolf Philippi"],
            "rationale": "Another museum named after same scientist (alternate spelling)"
        },
        {
            "name": "Instituto Alemán Puerto Montt",
            "city": "Puerto Montt",
            "search_type": "exact_name",
            "search_terms": ["Instituto Alemán Puerto Montt", "Deutsche Schule Puerto Montt"],
            "rationale": "German school, may have Wikidata entry"
        },
        {
            "name": "Fundación Iglesias Patrimoniales",
            "city": "Chiloé",
            "search_type": "exact_name",
            "search_terms": ["Fundación Iglesias Patrimoniales", "Fundación Iglesias Patrimoniales de Chiloé"],
            "rationale": "Foundation for UNESCO World Heritage churches"
        },
        {
            "name": "Centro Cultural Sofia Hott",
            "city": "Osorno",
            "search_type": "person_named",
            "search_terms": ["Sofia Hott", "Sofía Hott"],
            "rationale": "Cultural center named after specific person"
        }
    ]

    print("=" * 80)
    print("Chilean Batch 14: Targeted Wikidata Search")
    print("Target: Find 2 more matches to reach 70% coverage (63/90)")
    print("=" * 80)
    print()

    all_search_results = {
        strategy['name']: _search_one(strategy)
        for strategy in search_strategies
    }

    # Save results. Ensure the output directory exists so the script also
    # works when launched from outside the repository root.
    output_file = Path('scripts/batch14_targeted_search_results.json')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open('w', encoding='utf-8') as f:
        json.dump(all_search_results, f, indent=2, ensure_ascii=False)

    print(f"\n{'=' * 80}")
    print(f"Results saved to: {output_file}")
    print("=" * 80)
    print()
    print("NEXT STEPS:")
    print("1. Review results for valid matches")
    print("2. Verify Q-numbers match correct institutions")
    print("3. Apply enrichment to reach 70% target")
    print()


if __name__ == '__main__':
    main()
|