- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
156 lines
5.1 KiB
Python
156 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Query Wikidata for Mexican heritage institutions using SPARQL.
|
|
Saves results to JSON for fuzzy matching in Batch 2 enrichment.
|
|
"""
|
|
|
|
import json
from pathlib import Path

from SPARQLWrapper import JSON, POST, SPARQLWrapper
|
|
|
|
def query_wikidata_mexican_glam():
    """Query Wikidata for Mexican GLAM institutions.

    Runs a single SPARQL query against the public Wikidata endpoint for
    museums, libraries, archives, and galleries located in Mexico (wd:Q96),
    with optional VIAF/ISIL identifiers, coordinates, and es/en labels.

    Returns:
        list: raw SPARQL result bindings (one dict per row), or an empty
        list if the request fails.
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Use HTTP POST so the long query is not subject to URL-length limits.
    # FIX: the original called setRequestMethod('POST'), but that API
    # selects the POST *encoding* ('urlencoded'/'postdirectly') and
    # silently rejects the value 'POST'; the HTTP verb is set via
    # setMethod(POST).
    endpoint.setMethod(POST)
    # Wikidata's usage policy asks clients to send an identifying
    # User-Agent; the SPARQLWrapper default may be throttled or blocked.
    endpoint.setAgent(
        'MexicanGLAMHarvester/1.0 (heritage institution enrichment script)'
    )

    # Query for museums, libraries, archives, galleries in Mexico
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?viaf ?isil ?coords ?typeLabel ?location ?locationLabel WHERE {
      # Instance of museum, library, archive, or gallery
      VALUES ?type {
        wd:Q33506    # museum
        wd:Q7075     # library
        wd:Q166118   # archive
        wd:Q1007870  # art museum
        wd:Q22698    # park
        wd:Q207694   # art gallery
      }
      ?item wdt:P31/wdt:P279* ?type .

      # Located in Mexico
      ?item wdt:P17 wd:Q96 .

      # Optional: external identifiers
      OPTIONAL { ?item wdt:P214 ?viaf }    # VIAF ID
      OPTIONAL { ?item wdt:P791 ?isil }    # ISIL code
      OPTIONAL { ?item wdt:P625 ?coords }  # Coordinates

      # Optional: location (city/state)
      OPTIONAL {
        ?item wdt:P131 ?location .
        ?location rdfs:label ?locationLabel .
        FILTER(LANG(?locationLabel) = "es" || LANG(?locationLabel) = "en")
      }

      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
        ?item rdfs:label ?itemLabel .
        ?type rdfs:label ?typeLabel .
      }

      # Optional: alternative names
      OPTIONAL {
        ?item skos:altLabel ?itemAltLabel .
        FILTER(LANG(?itemAltLabel) = "es" || LANG(?itemAltLabel) = "en")
      }
    }
    LIMIT 500
    """

    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    print("Querying Wikidata for Mexican heritage institutions...")
    print("This may take 30-60 seconds...")

    try:
        results = endpoint.query().convert()
        return results['results']['bindings']
    except Exception as e:
        # Best-effort: report and return an empty list so main() can bail
        # out gracefully instead of crashing on a network hiccup.
        print(f"Error querying Wikidata: {e}")
        return []
|
|
|
|
def parse_wikidata_results(results):
    """Parse SPARQL JSON bindings into one record per institution.

    A single institution may appear in several result rows (one per
    alternative label / identifier combination); rows are merged by the
    entity's Q-number.

    Args:
        results: list of SPARQL result bindings as returned by
            ``query_wikidata_mexican_glam`` — each a dict mapping variable
            names to ``{'value': ...}`` dicts.

    Returns:
        list[dict]: deduplicated institution records with keys
        ``q_number``, ``name``, ``alternative_names``, ``viaf``, ``isil``,
        ``coordinates``, ``type``, and ``location``.
    """
    institutions = {}

    for result in results:
        # Extract Q-number from the entity URI (".../entity/Q12345").
        item_uri = result['item']['value']
        q_number = item_uri.split('/')[-1]

        name = result.get('itemLabel', {}).get('value', '')

        # Skip rows with no usable label. Wikidata's label service falls
        # back to the Q-identifier itself when no es/en label exists, so
        # compare against the item's own Q-number.
        # FIX: the original used name.startswith('Q'), which also
        # discarded genuine names beginning with "Q" (e.g. "Quinta Gameros").
        if not name or name == q_number:
            continue

        # Initialize the record on first sight of this Q-number.
        rec = institutions.setdefault(q_number, {
            'q_number': q_number,
            'name': name,
            'alternative_names': [],
            'viaf': None,
            'isil': None,
            'coordinates': None,
            'type': result.get('typeLabel', {}).get('value', ''),
            'location': result.get('locationLabel', {}).get('value', '')
        })

        # Accumulate alternative names across rows, without duplicates.
        if 'itemAltLabel' in result:
            alt_name = result['itemAltLabel']['value']
            if alt_name not in rec['alternative_names']:
                rec['alternative_names'].append(alt_name)

        # Single-valued identifiers: first non-empty value wins.
        if 'viaf' in result and not rec['viaf']:
            rec['viaf'] = result['viaf']['value']

        if 'isil' in result and not rec['isil']:
            rec['isil'] = result['isil']['value']

        if 'coords' in result and not rec['coordinates']:
            rec['coordinates'] = result['coords']['value']

        # Backfill type/location if the first row for this item lacked them.
        if not rec['type'] and 'typeLabel' in result:
            rec['type'] = result['typeLabel']['value']
        if not rec['location'] and 'locationLabel' in result:
            rec['location'] = result['locationLabel']['value']

    return list(institutions.values())
|
|
|
|
def main():
    """Fetch Mexican GLAM institutions from Wikidata and save them as JSON."""
    # Step 1: run the SPARQL query; bail out early on failure/empty result.
    raw_bindings = query_wikidata_mexican_glam()
    if not raw_bindings:
        print("No results found or query failed.")
        return

    print(f"Received {len(raw_bindings)} results from Wikidata")

    # Step 2: collapse the row-per-binding results into unique records.
    records = parse_wikidata_results(raw_bindings)
    print(f"Parsed {len(records)} unique Mexican institutions")

    # Step 3: persist to data/wikidata/ next to the project root.
    out_file = (
        Path(__file__).parent.parent
        / 'data' / 'wikidata' / 'mexican_institutions_wikidata.json'
    )
    out_file.parent.mkdir(parents=True, exist_ok=True)

    with open(out_file, 'w', encoding='utf-8') as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(records)} institutions to {out_file}")

    # Step 4: show a short preview of what was harvested.
    print("\nSample results:")
    for record in records[:10]:
        print(f" - {record['name']} ({record['q_number']})")
        if record['viaf']:
            print(f" VIAF: {record['viaf']}")
        if record['isil']:
            print(f" ISIL: {record['isil']}")


if __name__ == '__main__':
    main()
|