glam/scripts/manual_wikidata_search_batch14.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

211 lines
7.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean Batch 14: Targeted Wikidata Search for Final 2 Matches
Strategy: Focus on institutions with distinctive names or known entities
Target institutions (prioritized by likelihood of Wikidata presence):
1. Museo Rodulfo Philippi / Rudolph Philippi - Named after famous German scientist
2. Fundación Iglesias Patrimoniales - Heritage foundation for Chiloé churches (UNESCO)
3. Instituto Alemán Puerto Montt - German school with heritage collections
4. Centro Cultural Sofia Hott - Named after specific person
"""
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from typing import Dict, List, Optional, Any
def query_wikidata_person_institution(person_name: str, location: str) -> Optional[List[Dict]]:
    """
    Query Wikidata for institutions in Chile whose label contains a person's name.

    The query selects items with country (P17) = Q298 that are an instance of
    (a subclass of) Q33506 or Q7075, and keeps those having any label that
    contains ``person_name`` (case-insensitive).

    Args:
        person_name: Name fragment to look for inside item labels.
        location: Accepted for signature compatibility with the exact-name
            search; it is not used to build the query.

    Returns:
        The list of SPARQL result bindings, or ``None`` when there are no
        bindings, the response is not a dict, or the request fails.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # Escape characters that would otherwise terminate or corrupt the SPARQL
    # string literal (backslash first, so later escapes are not doubled).
    safe_name = (
        person_name.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # NOTE(review): the second UNION branch is commented as "cultural
    # institutions", but Q7075 looks like Wikidata's "library" class - confirm
    # the intended class before relying on this branch.
    # Search for museums/institutions named after the person
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      {{
        # Museums in Chile
        ?item wdt:P31/wdt:P279* wd:Q33506 .
        ?item wdt:P17 wd:Q298 .
      }} UNION {{
        # Cultural institutions in Chile
        ?item wdt:P31/wdt:P279* wd:Q7075 .
        ?item wdt:P17 wd:Q298 .
      }}
      # Name contains person's name
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}")))
      OPTIONAL {{ ?item wdt:P131 ?location }}
      ?item wdt:P31 ?instanceOf .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en,de" }}
    }}
    LIMIT 20
    """
    print(f" Searching for institutions named after '{person_name}'...")
    sparql.setQuery(query)
    try:
        results: Any = sparql.query().convert()
        if isinstance(results, dict):
            bindings = results.get('results', {}).get('bindings', [])
            return bindings if bindings else None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller move on.
        print(f" Query failed: {e}")
    # Reached on failure or when the response is not the expected JSON dict.
    return None
def query_wikidata_exact_name(institution_name: str, city: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query Wikidata for an item in Chile whose Spanish label equals a given name.

    Args:
        institution_name: Exact Spanish (``@es``) label to match.
        city: Optional name fragment; when given, only items whose
            administrative location (P131) has a label containing it
            (case-insensitive) are returned.

    Returns:
        The list of SPARQL result bindings, or ``None`` when there are no
        bindings, the response is not a dict, or the request fails.
    """
    endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    def _escape(text: str) -> str:
        """Escape *text* for use inside a double-quoted SPARQL string literal."""
        return (
            text.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    location_filter = ""
    if city:
        # The wikibase:label service fills in ?locationLabel only after the
        # rest of the query has been evaluated, so a FILTER on that variable
        # sees it unbound and rejects every row. Match against the location's
        # own rdfs:label instead, which is available while filtering.
        location_filter = (
            "?item wdt:P131 ?location .\n"
            "      ?location rdfs:label ?cityName .\n"
            f'      FILTER(CONTAINS(LCASE(?cityName), LCASE("{_escape(city)}")))'
        )

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?location ?locationLabel ?instanceOf ?instanceOfLabel WHERE {{
      ?item rdfs:label "{_escape(institution_name)}"@es .
      ?item wdt:P31 ?instanceOf .
      ?item wdt:P17 wd:Q298 .
      OPTIONAL {{ ?item wdt:P131 ?location }}
      {location_filter}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 10
    """
    print(f" Searching for exact name: '{institution_name}'...")
    sparql.setQuery(query)
    try:
        results: Any = sparql.query().convert()
        if isinstance(results, dict):
            bindings = results.get('results', {}).get('bindings', [])
            return bindings if bindings else None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller move on.
        print(f" Query failed: {e}")
    # Reached on failure or when the response is not the expected JSON dict.
    return None
def main():
    """Run every configured Wikidata search and write the hits to a JSON file.

    For each target institution, each of its search terms is sent to the
    query function selected by ``search_type``. Up to five bindings per term
    are printed and recorded, and the collected results are dumped to
    ``scripts/batch14_targeted_search_results.json``.
    """
    targets = [
        {
            "name": "Museo Rodulfo Philippi",
            "city": "Chañaral",
            "search_type": "person_named",
            "search_terms": ["Philippi", "Rodulfo Philippi", "Rudolf Philippi"],
            "rationale": "Named after Rodolfo Amando Philippi, famous German-Chilean naturalist"
        },
        {
            "name": "Museo Rudolph Philippi",
            "city": "Valdivia",
            "search_type": "person_named",
            "search_terms": ["Philippi", "Rudolph Philippi", "Rudolf Philippi"],
            "rationale": "Another museum named after same scientist (alternate spelling)"
        },
        {
            "name": "Instituto Alemán Puerto Montt",
            "city": "Puerto Montt",
            "search_type": "exact_name",
            "search_terms": ["Instituto Alemán Puerto Montt", "Deutsche Schule Puerto Montt"],
            "rationale": "German school, may have Wikidata entry"
        },
        {
            "name": "Fundación Iglesias Patrimoniales",
            "city": "Chiloé",
            "search_type": "exact_name",
            "search_terms": ["Fundación Iglesias Patrimoniales", "Fundación Iglesias Patrimoniales de Chiloé"],
            "rationale": "Foundation for UNESCO World Heritage churches"
        },
        {
            "name": "Centro Cultural Sofia Hott",
            "city": "Osorno",
            "search_type": "person_named",
            "search_terms": ["Sofia Hott", "Sofía Hott"],
            "rationale": "Cultural center named after specific person"
        }
    ]

    def _field(binding, key, fallback):
        """Return the ``value`` entry of one binding field, or *fallback*."""
        return binding.get(key, {}).get('value', fallback)

    banner = "=" * 80
    collected = {}

    print(banner)
    print("Chilean Batch 14: Targeted Wikidata Search")
    print("Target: Find 2 more matches to reach 70% coverage (63/90)")
    print(banner)
    print()

    for target in targets:
        print(f"\n{banner}")
        print(f"Institution: {target['name']}")
        print(f"Location: {target['city']}")
        print(f"Strategy: {target['search_type']}")
        print(f"Rationale: {target['rationale']}")
        print(banner)

        entry = {
            "metadata": target,
            "wikidata_results": []
        }

        # The search type is fixed per target, so choose the query function once.
        if target['search_type'] == 'person_named':
            lookup = query_wikidata_person_institution
        else:
            lookup = query_wikidata_exact_name

        for term in target['search_terms']:
            bindings = lookup(term, target['city'])
            if not bindings:
                print(" No results found")
                continue

            print(f" Found {len(bindings)} results")
            # Only the first five bindings per term are shown and stored.
            for binding in bindings[:5]:
                # The Q-number is the last path segment of the entity URI.
                q_number = binding['item']['value'].split('/')[-1]
                label = _field(binding, 'itemLabel', 'No label')
                description = _field(binding, 'itemDescription', 'No description')
                place = _field(binding, 'locationLabel', 'No location')
                kind = _field(binding, 'instanceOfLabel', 'No type')

                print(f" {q_number}: {label}")
                print(f" Desc: {description}")
                print(f" Location: {place}")
                print(f" Type: {kind}")

                entry['wikidata_results'].append({
                    'q_number': q_number,
                    'label': label,
                    'description': description,
                    'location': place,
                    'instance_of': kind,
                    'search_term_used': term
                })

        collected[target['name']] = entry

    # Save results
    output_file = 'scripts/batch14_targeted_search_results.json'
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, indent=2, ensure_ascii=False)

    print(f"\n{banner}")
    print(f"Results saved to: {output_file}")
    print(banner)
    print()
    print("NEXT STEPS:")
    print("1. Review results for valid matches")
    print("2. Verify Q-numbers match correct institutions")
    print("3. Apply enrichment to reach 70% target")
    print()


# Run the searches only when executed as a script, not on import.
if __name__ == '__main__':
    main()