# glam/scripts/execute_archive_query_corrected.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 178 lines, 6.1 KiB, Python
#!/usr/bin/env python3
"""
Execute corrected Archive SPARQL query with:
1. DISTINCT to prevent duplicate outputs
2. All 318 Q-numbers from hyponyms_curated.yaml excluded
3. POST method to handle large query
"""
import json
import time
from collections import Counter
from pathlib import Path

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON, POST
# Paths
CURATED_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/A/hyponyms_corrected_new.json")


def _is_qid(value) -> bool:
    """Return True when *value* is a Wikidata Q-identifier (e.g. 'Q166118').

    The previous check was only ``startswith('Q')``, which also accepted
    plain-text labels such as 'Quarterly report'; such a value would later
    be spliced into the SPARQL query as ``wd:Quarterly report`` and break
    the query syntax.  Non-string values (odd/missing YAML entries) return
    False instead of raising.
    """
    return isinstance(value, str) and len(value) > 1 and value[0] == 'Q' and value[1:].isdigit()


# Extract all Q-numbers from the curated file.  The YAML has three list
# sections of interest; each item is a mapping whose Q-ID sits under
# 'label' (or, via a known typo in the data, under 'labels' — the original
# only tolerated that typo in the 'entity' section; accepting it everywhere
# is strictly backward-compatible).
print("Loading curated Q-numbers...")
with open(CURATED_FILE, 'r') as f:
    curated = yaml.safe_load(f)

q_numbers = set()
for section in ('hyponym', 'entity', 'exclude'):
    # .get() tolerates a missing section, like the original 'in' checks did.
    for item in curated.get(section, []):
        for key in ('label', 'labels'):  # 'labels' covers the data typo
            if _is_qid(item.get(key)):
                q_numbers.add(item[key])

print(f"✓ Loaded {len(q_numbers)} Q-numbers to exclude")
# ---------------------------------------------------------------------------
# Build the exclusion FILTERs dynamically.  A single giant NOT IN list can
# exceed SPARQL parser limits, so the Q-numbers are emitted in chunks of 50,
# one FILTER clause per chunk.
# ---------------------------------------------------------------------------
CHUNK_SIZE = 50
sorted_qids = sorted(q_numbers)
filter_clauses = [
    "  FILTER(?class NOT IN ({}))".format(
        ", ".join("wd:" + qid for qid in sorted_qids[start:start + CHUNK_SIZE])
    )
    for start in range(0, len(sorted_qids), CHUNK_SIZE)
]
filter_clause_full = "\n".join(filter_clauses)
# Build SPARQL query
# NOTE(review): the query groups by both ?class and ?classLabel while labels
# are fetched in ~40 languages, so one Q-ID can legitimately come back on
# several rows (one per surviving label language) — the duplicate check
# later in this script reports exactly that.  If a single row per class is
# wanted, group by ?class alone and SAMPLE(?classLabel); confirm intent
# with the data owner before changing.
sparql_query = f"""# GLAMORCUBEPSXHF Vocabulary Extraction - CORRECTED
# Class: A (ARCHIVE)
# Excludes: ALL {len(q_numbers)} Q-numbers from hyponyms_curated.yaml
# Date: 2025-11-13
# Changes: Added DISTINCT, excluded all curated Q-numbers
SELECT DISTINCT ?class ?classLabel
(GROUP_CONCAT(DISTINCT ?altLabel; separator=" | ") as ?altLabels)
WHERE {{
{{
# Archive (general)
?class wdt:P279+ wd:Q166118 .
}}
# Get labels in priority languages (40+ languages for archive vocabulary)
OPTIONAL {{
?class rdfs:label ?classLabel .
FILTER(LANG(?classLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
"hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
"uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
"el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
}}
# Get alternative labels (aliases)
OPTIONAL {{
?class skos:altLabel ?altLabel .
FILTER(LANG(?altLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
"hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
"uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
"el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
}}
# ============================================
# EXCLUDE ALL {len(q_numbers)} Q-NUMBERS FROM hyponyms_curated.yaml
# ============================================
{filter_clause_full}
# Automatic label service (fallback)
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr". }}
}}
GROUP BY ?class ?classLabel
ORDER BY ?classLabel
LIMIT 5000
"""
# ---------------------------------------------------------------------------
# Report the built query, then configure the Wikidata endpoint client.
# ---------------------------------------------------------------------------
print(f"\n✓ Built SPARQL query ({len(sparql_query)} characters)")
print(f"✓ Filter clauses: {len(filter_clauses)} chunks")
print("\nExecuting SPARQL query via POST method...")

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)
sparql.setReturnFormat(JSON)
# POST sidesteps GET URL-length limits — the exclusion FILTERs make this
# query far too large for a query string.
sparql.setMethod(POST)
sparql.setQuery(sparql_query)
start_time = time.time()
try:
    # Run the query and convert the HTTP response into a plain dict.
    results = sparql.query().convert()
    elapsed = time.time() - start_time
    print(f"✓ Query completed in {elapsed:.2f} seconds")

    # Persist the raw JSON response verbatim.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Single pass over the bindings: pull each row's Q-ID once, then derive
    # both the unique-class count and the per-ID row counts from one
    # Counter.  (The original walked the bindings twice with duplicated
    # URI-parsing code, and let rows with an empty 'class' URI leak into
    # the duplicate counts under the key ''.)
    bindings = results.get('results', {}).get('bindings', [])
    row_qids = [b.get('class', {}).get('value', '').rsplit('/', 1)[-1] for b in bindings]
    class_counts = Counter(qid for qid in row_qids if qid)

    print(f"\n✓ Results saved to: {OUTPUT_FILE}")
    print(f"✓ File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"✓ Total bindings (with language variants): {len(bindings)}")
    print(f"✓ Unique archive classes: {len(class_counts)}")

    # A Q-ID on several rows usually means several ?classLabel languages
    # survived the GROUP BY — worth flagging, not necessarily a bug.
    duplicates = {qid: count for qid, count in class_counts.items() if count > 1}
    if duplicates:
        print(f"\n⚠️ WARNING: Found {len(duplicates)} Q-IDs with multiple rows:")
        for qid, count in sorted(duplicates.items())[:10]:
            print(f"  {qid}: {count} rows")
    else:
        print("\n✓ No duplicates found (all Q-IDs appear exactly once)")

    # Eyeball sample of the first few rows.
    print("\nFirst 10 results:")
    for i, binding in enumerate(bindings[:10], 1):
        q_id = binding.get('class', {}).get('value', '').rsplit('/', 1)[-1]
        class_label = binding.get('classLabel', {}).get('value', '')
        print(f"  {i}. {q_id}: {class_label}")
except Exception as e:
    # Log, then re-raise so a shell caller still sees a non-zero exit status.
    print(f"\n❌ ERROR: {e}")
    raise
print("\n✅ Query execution complete!")