# glam/scripts/execute_archive_query_corrected.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 178 lines, 6.1 KiB, Python
#!/usr/bin/env python3
"""
Execute corrected Archive SPARQL query with:
1. DISTINCT to prevent duplicate outputs
2. All 318 Q-numbers from hyponyms_curated.yaml excluded
3. POST method to handle large query
"""
import json
import time
from collections import Counter
from pathlib import Path

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON, POST
# Paths
CURATED_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/A/hyponyms_corrected_new.json")


def _is_qid(value) -> bool:
    """Return True when *value* is a Wikidata Q-identifier (e.g. 'Q166118').

    The previous check was only ``startswith('Q')``, which also accepted
    plain-text labels such as 'Quarterly report'; such a value would later
    be spliced into the SPARQL query as ``wd:Quarterly report`` and break
    the query syntax.  Non-string values (odd/missing YAML entries) return
    False instead of raising.
    """
    return isinstance(value, str) and len(value) > 1 and value[0] == 'Q' and value[1:].isdigit()


# Extract all Q-numbers from the curated file.  The YAML has three list
# sections of interest; each item is a mapping whose Q-ID sits under
# 'label' (or, via a known typo in the data, under 'labels' — the original
# only tolerated that typo in the 'entity' section; accepting it everywhere
# is strictly backward-compatible).
print("Loading curated Q-numbers...")
with open(CURATED_FILE, 'r') as f:
    curated = yaml.safe_load(f)

q_numbers = set()
for section in ('hyponym', 'entity', 'exclude'):
    # .get() tolerates a missing section, like the original 'in' checks did.
    for item in curated.get(section, []):
        for key in ('label', 'labels'):  # 'labels' covers the data typo
            if _is_qid(item.get(key)):
                q_numbers.add(item[key])

print(f"✓ Loaded {len(q_numbers)} Q-numbers to exclude")
# ---------------------------------------------------------------------------
# Build the exclusion FILTERs dynamically.  A single giant NOT IN list can
# exceed SPARQL parser limits, so the Q-numbers are emitted in chunks of 50,
# one FILTER clause per chunk.
# ---------------------------------------------------------------------------
CHUNK_SIZE = 50
sorted_qids = sorted(q_numbers)
filter_clauses = [
    "  FILTER(?class NOT IN ({}))".format(
        ", ".join("wd:" + qid for qid in sorted_qids[start:start + CHUNK_SIZE])
    )
    for start in range(0, len(sorted_qids), CHUNK_SIZE)
]
filter_clause_full = "\n".join(filter_clauses)
# Build SPARQL query
# NOTE(review): the query groups by both ?class and ?classLabel while labels
# are fetched in ~40 languages, so one Q-ID can legitimately come back on
# several rows (one per surviving label language) — the duplicate check
# later in this script reports exactly that.  If a single row per class is
# wanted, group by ?class alone and SAMPLE(?classLabel); confirm intent
# with the data owner before changing.
sparql_query = f"""# GLAMORCUBEPSXHF Vocabulary Extraction - CORRECTED
# Class: A (ARCHIVE)
# Excludes: ALL {len(q_numbers)} Q-numbers from hyponyms_curated.yaml
# Date: 2025-11-13
# Changes: Added DISTINCT, excluded all curated Q-numbers
SELECT DISTINCT ?class ?classLabel
(GROUP_CONCAT(DISTINCT ?altLabel; separator=" | ") as ?altLabels)
WHERE {{
{{
# Archive (general)
?class wdt:P279+ wd:Q166118 .
}}
# Get labels in priority languages (40+ languages for archive vocabulary)
OPTIONAL {{
?class rdfs:label ?classLabel .
FILTER(LANG(?classLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
"hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
"uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
"el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
}}
# Get alternative labels (aliases)
OPTIONAL {{
?class skos:altLabel ?altLabel .
FILTER(LANG(?altLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
"hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
"uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
"el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
}}
# ============================================
# EXCLUDE ALL {len(q_numbers)} Q-NUMBERS FROM hyponyms_curated.yaml
# ============================================
{filter_clause_full}
# Automatic label service (fallback)
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr". }}
}}
GROUP BY ?class ?classLabel
ORDER BY ?classLabel
LIMIT 5000
"""
# ---------------------------------------------------------------------------
# Report the built query, then configure the Wikidata endpoint client.
# ---------------------------------------------------------------------------
print(f"\n✓ Built SPARQL query ({len(sparql_query)} characters)")
print(f"✓ Filter clauses: {len(filter_clauses)} chunks")
print("\nExecuting SPARQL query via POST method...")

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)
sparql.setReturnFormat(JSON)
# POST sidesteps GET URL-length limits — the exclusion FILTERs make this
# query far too large for a query string.
sparql.setMethod(POST)
sparql.setQuery(sparql_query)
start_time = time.time()
try:
    # Run the query and convert the HTTP response into a plain dict.
    results = sparql.query().convert()
    elapsed = time.time() - start_time
    print(f"✓ Query completed in {elapsed:.2f} seconds")

    # Persist the raw JSON response verbatim.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Single pass over the bindings: pull each row's Q-ID once, then derive
    # both the unique-class count and the per-ID row counts from one
    # Counter.  (The original walked the bindings twice with duplicated
    # URI-parsing code, and let rows with an empty 'class' URI leak into
    # the duplicate counts under the key ''.)
    bindings = results.get('results', {}).get('bindings', [])
    row_qids = [b.get('class', {}).get('value', '').rsplit('/', 1)[-1] for b in bindings]
    class_counts = Counter(qid for qid in row_qids if qid)

    print(f"\n✓ Results saved to: {OUTPUT_FILE}")
    print(f"✓ File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"✓ Total bindings (with language variants): {len(bindings)}")
    print(f"✓ Unique archive classes: {len(class_counts)}")

    # A Q-ID on several rows usually means several ?classLabel languages
    # survived the GROUP BY — worth flagging, not necessarily a bug.
    duplicates = {qid: count for qid, count in class_counts.items() if count > 1}
    if duplicates:
        print(f"\n⚠️ WARNING: Found {len(duplicates)} Q-IDs with multiple rows:")
        for qid, count in sorted(duplicates.items())[:10]:
            print(f"  {qid}: {count} rows")
    else:
        print("\n✓ No duplicates found (all Q-IDs appear exactly once)")

    # Eyeball sample of the first few rows.
    print("\nFirst 10 results:")
    for i, binding in enumerate(bindings[:10], 1):
        q_id = binding.get('class', {}).get('value', '').rsplit('/', 1)[-1]
        class_label = binding.get('classLabel', {}).get('value', '')
        print(f"  {i}. {q_id}: {class_label}")
except Exception as e:
    # Log, then re-raise so a shell caller still sees a non-zero exit status.
    print(f"\n❌ ERROR: {e}")
    raise
print("\n✅ Query execution complete!")