glam/scripts/query_wikidata_mexican_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

156 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""
Query Wikidata for Mexican heritage institutions using SPARQL.
Saves results to JSON for fuzzy matching in Batch 2 enrichment.
"""
from SPARQLWrapper import SPARQLWrapper, JSON
import json
from pathlib import Path
def query_wikidata_mexican_glam():
    """Query Wikidata for Mexican GLAM institutions via SPARQL.

    Returns:
        list: raw SPARQL JSON result bindings, or an empty list when the
        request fails.
    """
    # Wikidata's query-service policy requires a descriptive User-Agent;
    # the default python-urllib agent is throttled/blocked.
    endpoint = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="query_wikidata_mexican_institutions.py (GLAM enrichment script)",
    )
    # POST avoids URL-length limits for larger queries.
    endpoint.setRequestMethod('POST')

    # Query for museums, libraries, archives, galleries in Mexico.
    # FIX: the type list previously also contained wd:Q22698 (park), which
    # is not a GLAM institution and polluted the results; removed.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?viaf ?isil ?coords ?typeLabel ?location ?locationLabel WHERE {
      # Instance of museum, library, archive, or gallery
      VALUES ?type {
        wd:Q33506    # museum
        wd:Q7075     # library
        wd:Q166118   # archive
        wd:Q1007870  # art museum
        wd:Q207694   # art gallery
      }
      ?item wdt:P31/wdt:P279* ?type .
      # Located in Mexico
      ?item wdt:P17 wd:Q96 .
      # Optional: external identifiers
      OPTIONAL { ?item wdt:P214 ?viaf }   # VIAF ID
      OPTIONAL { ?item wdt:P791 ?isil }   # ISIL code
      OPTIONAL { ?item wdt:P625 ?coords } # Coordinates
      # Optional: location (city/state)
      OPTIONAL {
        ?item wdt:P131 ?location .
        ?location rdfs:label ?locationLabel .
        FILTER(LANG(?locationLabel) = "es" || LANG(?locationLabel) = "en")
      }
      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
        ?item rdfs:label ?itemLabel .
        ?type rdfs:label ?typeLabel .
      }
      # Optional: alternative names
      OPTIONAL {
        ?item skos:altLabel ?itemAltLabel .
        FILTER(LANG(?itemAltLabel) = "es" || LANG(?itemAltLabel) = "en")
      }
    }
    LIMIT 500
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    print("Querying Wikidata for Mexican heritage institutions...")
    print("This may take 30-60 seconds...")
    try:
        results = endpoint.query().convert()
        return results['results']['bindings']
    except Exception as e:
        # Best-effort: report the failure and return an empty list so the
        # caller can bail out gracefully instead of crashing the script.
        print(f"Error querying Wikidata: {e}")
        return []
def parse_wikidata_results(results):
    """Parse SPARQL result bindings into one structured record per institution.

    Args:
        results: list of binding dicts as returned by the Wikidata SPARQL
            endpoint, i.e. ``{'var': {'value': ...}, ...}`` per row.

    Returns:
        list[dict]: one record per unique Q-number with name, alternative
        names, VIAF/ISIL identifiers, coordinates, type and location.
    """
    institutions = {}
    for result in results:
        # Extract the Q-number from the entity URI.
        item_uri = result['item']['value']
        q_number = item_uri.split('/')[-1]

        name = result.get('itemLabel', {}).get('value', '')
        # The label service falls back to the Q-id itself when no label
        # exists in the requested languages; skip those generic labels.
        # FIX: compare against the exact Q-number instead of
        # startswith('Q'), which wrongly dropped legitimate names that
        # begin with "Q" (e.g. "Quinta Gameros").
        if not name or name == q_number:
            continue

        # First occurrence wins for the scalar fields; later bindings for
        # the same item only contribute identifiers/alt-names below.
        record = institutions.setdefault(q_number, {
            'q_number': q_number,
            'name': name,
            'alternative_names': [],
            'viaf': None,
            'isil': None,
            'coordinates': None,
            'type': result.get('typeLabel', {}).get('value', ''),
            'location': result.get('locationLabel', {}).get('value', ''),
        })

        # Collect distinct alternative names across all bindings.
        if 'itemAltLabel' in result:
            alt_name = result['itemAltLabel']['value']
            if alt_name not in record['alternative_names']:
                record['alternative_names'].append(alt_name)

        # Keep the first non-empty value seen for each identifier.
        if 'viaf' in result and not record['viaf']:
            record['viaf'] = result['viaf']['value']
        if 'isil' in result and not record['isil']:
            record['isil'] = result['isil']['value']
        if 'coords' in result and not record['coordinates']:
            record['coordinates'] = result['coords']['value']

    return list(institutions.values())
def main():
    """Run the pipeline: query Wikidata, parse results, save JSON, show a sample."""
    bindings = query_wikidata_mexican_glam()
    if not bindings:
        print("No results found or query failed.")
        return
    print(f"Received {len(bindings)} results from Wikidata")

    institutions = parse_wikidata_results(bindings)
    print(f"Parsed {len(institutions)} unique Mexican institutions")

    # Persist under the repo's data tree for the Batch 2 enrichment step.
    out_file = Path(__file__).parent.parent / 'data' / 'wikidata' / 'mexican_institutions_wikidata.json'
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, 'w', encoding='utf-8') as fh:
        json.dump(institutions, fh, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(institutions)} institutions to {out_file}")

    # Print the first few records as a quick sanity check.
    print("\nSample results:")
    for record in institutions[:10]:
        print(f" - {record['name']} ({record['q_number']})")
        if record['viaf']:
            print(f" VIAF: {record['viaf']}")
        if record['isil']:
            print(f" ISIL: {record['isil']}")


if __name__ == '__main__':
    main()