glam/scripts/manual_wikidata_search_batch6.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

138 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
Interactive script to search Wikidata for Brazilian institutions.
Helps build enrichment mappings by querying Wikidata SPARQL endpoint.
"""
import requests
import time
from typing import List, Dict, Optional
# Public SPARQL endpoint of the Wikidata Query Service (WDQS).
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
def search_wikidata_sparql(search_terms: str, limit: int = 10) -> List[Dict]:
    """Search Wikidata for Brazilian heritage institutions matching a term.

    Runs a SPARQL query against the public Wikidata Query Service that
    matches ``search_terms`` case-insensitively against item labels and
    aliases, restricted to items whose country (P17) is Brazil (Q155).

    Args:
        search_terms: Free-text term matched against labels/aliases.
            Backslashes and double quotes are escaped so the term cannot
            break out of the generated SPARQL string literal.
        limit: Maximum number of result rows to request.

    Returns:
        A list of dicts with keys ``qid``, ``label``, ``description`` and
        ``website``. Returns an empty list on any request or parse
        failure (the error is printed, not raised).
    """
    # Escape backslashes first, then quotes, so user input cannot
    # terminate the SPARQL string literal (lightweight injection guard).
    escaped_terms = search_terms.replace('\\', '\\\\').replace('"', '\\"')
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?coordinate ?website WHERE {{
      # Search in labels and aliases
      ?item rdfs:label|skos:altLabel ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{escaped_terms}")))
      # Brazilian institutions
      ?item wdt:P17 wd:Q155 . # country = Brazil
      # Optional data
      OPTIONAL {{ ?item wdt:P625 ?coordinate }}
      OPTIONAL {{ ?item wdt:P856 ?website }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en" }}
    }}
    LIMIT {int(limit)}
    """
    headers = {
        # WDQS usage policy requires a descriptive User-Agent.
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage institutions research)',
        'Accept': 'application/sparql-results+json'
    }
    try:
        response = requests.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # Narrow catch: network/HTTP failures and malformed JSON only,
        # so genuine programming errors are not silently swallowed.
        print(f"❌ Error querying Wikidata: {e}")
        return []
    matches = []
    for binding in results.get('results', {}).get('bindings', []):
        # Item URI looks like http://www.wikidata.org/entity/Q123 — keep the QID.
        qid = binding['item']['value'].split('/')[-1]
        label = binding.get('itemLabel', {}).get('value', 'No label')
        description = binding.get('itemDescription', {}).get('value', '')
        website = binding.get('website', {}).get('value', '')
        matches.append({
            'qid': qid,
            'label': label,
            'description': description,
            'website': website
        })
    return matches
def main():
    """Search Wikidata for each target institution and print a summary.

    For every (institution, search terms) pair, queries Wikidata, prints
    candidate matches, keeps the first match per institution, and finally
    emits a ready-to-paste ``enrichment_mappings`` dict for the batch
    enrichment script.
    """
    # Priority targets from our dataset: (institution name, search terms).
    targets = [
        ("Museu Sacaca", "museum indigenous macapá"),
        ("Dom Bosco", "museum campo grande"),
        ("Homem Sergipano", "museum aracaju anthropology"),
        ("Geopark Araripe", "geopark crato ceará"),
        ("Goiás", "unesco heritage city"),
        ("São Luís", "unesco heritage historic center"),
        ("Arquivo Público", "brasília archive federal district"),
        ("Memorial Rio Grande", "pelotas memorial museum"),
        ("Museu Povos Acreanos", "acre rio branco museum"),
        ("MARCO", "campo grande contemporary art"),
    ]
    print("🔍 Searching Wikidata for Brazilian Institutions")
    print("=" * 80)
    all_matches = {}
    for inst_name, search_terms in targets:
        print(f"\n📍 Searching: {inst_name}")
        print(f" Terms: {search_terms}")
        print("-" * 80)
        matches = search_wikidata_sparql(search_terms)
        if matches:
            for i, match in enumerate(matches, 1):
                print(f" {i}. {match['qid']} - {match['label']}")
                if match['description']:
                    print(f" {match['description']}")
                if match['website']:
                    print(f" 🌐 {match['website']}")
            # Store best match (first result — WDQS order, not ranked).
            all_matches[inst_name] = matches[0]
        else:
            print(" ❌ No matches found")
        # Rate limiting: be polite to the public WDQS endpoint.
        time.sleep(2)
    # Print summary
    print("\n" + "=" * 80)
    print("📊 ENRICHMENT MAPPING SUMMARY")
    print("=" * 80)
    if all_matches:
        print("\nFound Wikidata matches:")
        for inst_name, match in all_matches.items():
            # Separator restored: name and QID were previously printed
            # concatenated with nothing between them.
            print(f"{inst_name} → {match['qid']} ({match['label']})")
    else:
        print("\n❌ No matches found")
    # Generate enrichment mapping code
    if all_matches:
        print("\n" + "=" * 80)
        print("📝 ENRICHMENT MAPPING CODE (for batch script):")
        print("=" * 80)
        print("\nenrichment_mappings = {")
        for inst_name, match in all_matches.items():
            print(f" '{inst_name}': '{match['qid']}', # {match['label']}")
        print("}")
# Script entry point: run the interactive search when executed directly.
if __name__ == "__main__":
    main()