- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
138 lines
4.5 KiB
Python
#!/usr/bin/env python3
"""
Interactive script to search Wikidata for Brazilian institutions.

Helps build enrichment mappings by querying Wikidata SPARQL endpoint.
"""

import time
from typing import Dict, List, Optional

import requests

# Public Wikidata SPARQL endpoint. It is rate-limited; requests should carry
# a descriptive User-Agent (see search_wikidata_sparql) and be spaced out.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
def search_wikidata_sparql(search_terms: str, limit: int = 10) -> List[Dict]:
    """Search Wikidata using SPARQL for heritage institutions.

    Args:
        search_terms: Free-text phrase matched case-insensitively against
            item labels and aliases.
        limit: Maximum number of rows requested from the endpoint.

    Returns:
        A list of dicts with keys ``qid``, ``label``, ``description``,
        ``website`` and ``coordinate``; an empty list on any request or
        parse failure (the error is printed, not raised).
    """
    # Escape backslashes and double quotes so the user-supplied term cannot
    # break out of the SPARQL string literal below.
    safe_terms = search_terms.replace("\\", "\\\\").replace('"', '\\"')

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?coordinate ?website WHERE {{
      # Search in labels and aliases
      ?item rdfs:label|skos:altLabel ?label .
      FILTER(CONTAINS(LCASE(?label), LCASE("{safe_terms}")))

      # Brazilian institutions
      ?item wdt:P17 wd:Q155 .  # country = Brazil

      # Optional data
      OPTIONAL {{ ?item wdt:P625 ?coordinate }}
      OPTIONAL {{ ?item wdt:P856 ?website }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en" }}
    }}
    LIMIT {limit}
    """

    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage institutions research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        results = response.json()

        matches = []
        for binding in results.get('results', {}).get('bindings', []):
            # Item URIs look like http://www.wikidata.org/entity/Q123 — the
            # QID is the last path segment.
            qid = binding['item']['value'].split('/')[-1]
            label = binding.get('itemLabel', {}).get('value', 'No label')
            description = binding.get('itemDescription', {}).get('value', '')
            website = binding.get('website', {}).get('value', '')
            # P625 coordinate was always selected by the query; surface it
            # (empty string when absent) instead of discarding it.
            coordinate = binding.get('coordinate', {}).get('value', '')

            matches.append({
                'qid': qid,
                'label': label,
                'description': description,
                'website': website,
                'coordinate': coordinate
            })

        return matches

    # RequestException covers connection/timeout/HTTP errors from requests;
    # ValueError covers a non-JSON response body from .json().
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error querying Wikidata: {e}")
        return []
def main():
    """Search for Brazilian institutions in Wikidata."""

    # Priority targets from our dataset: (institution name, search terms).
    targets = [
        ("Museu Sacaca", "museum indigenous macapá"),
        ("Dom Bosco", "museum campo grande"),
        ("Homem Sergipano", "museum aracaju anthropology"),
        ("Geopark Araripe", "geopark crato ceará"),
        ("Goiás", "unesco heritage city"),
        ("São Luís", "unesco heritage historic center"),
        ("Arquivo Público", "brasília archive federal district"),
        ("Memorial Rio Grande", "pelotas memorial museum"),
        ("Museu Povos Acreanos", "acre rio branco museum"),
        ("MARCO", "campo grande contemporary art"),
    ]

    print("🔍 Searching Wikidata for Brazilian Institutions")
    print("=" * 80)

    all_matches = {}

    for inst_name, search_terms in targets:
        print(f"\n📍 Searching: {inst_name}")
        print(f" Terms: {search_terms}")
        print("-" * 80)

        matches = search_wikidata_sparql(search_terms)

        if matches:
            for i, match in enumerate(matches, 1):
                print(f" {i}. {match['qid']} - {match['label']}")
                if match['description']:
                    print(f" {match['description']}")
                if match['website']:
                    print(f" 🌐 {match['website']}")
            # Store best match: first result is treated as the best candidate.
            all_matches[inst_name] = matches[0]
        else:
            print(" ❌ No matches found")

        # Rate limiting: be polite to the shared public SPARQL endpoint.
        time.sleep(2)

    # Print summary
    print("\n" + "=" * 80)
    print("📊 ENRICHMENT MAPPING SUMMARY")
    print("=" * 80)

    if all_matches:
        print("\nFound Wikidata matches:")
        for inst_name, match in all_matches.items():
            print(f" ✅ {inst_name} → {match['qid']} ({match['label']})")
    else:
        print("\n❌ No matches found")

    # Generate enrichment mapping code, ready to paste into the batch script.
    if all_matches:
        print("\n" + "=" * 80)
        print("📝 ENRICHMENT MAPPING CODE (for batch script):")
        print("=" * 80)
        print("\nenrichment_mappings = {")
        for inst_name, match in all_matches.items():
            print(f" '{inst_name}': '{match['qid']}', # {match['label']}")
        print("}")


if __name__ == "__main__":
    main()