#!/usr/bin/env python3
"""
Execute the corrected Archive SPARQL query against the Wikidata endpoint.

The query is built with:
1. DISTINCT to prevent duplicate outputs
2. Every Q-number listed in hyponyms_curated.yaml excluded (318 when this
   script was written; the actual count is computed at run time)
3. The POST method, because the generated query is large

The raw JSON response is written to OUTPUT_FILE and summary statistics are
printed to stdout.
"""

import json
import re
import time
from collections import Counter
from pathlib import Path

import yaml
from SPARQLWrapper import SPARQLWrapper, JSON, POST

# Paths
CURATED_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN/A/hyponyms_corrected_new.json")

# Language codes used both in the label FILTERs and in the label service.
# Kept in one place so the three occurrences in the query cannot drift apart.
LANGUAGES = (
    "en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
    "hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
    "uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
    "el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr",
)

# A well-formed Wikidata item identifier: "Q" followed by digits only.
_QID_PATTERN = re.compile(r"Q\d+")


def _collect_q_numbers(curated, section, keys=("label",)):
    """Return the set of Q-numbers found in one section of the curated data.

    Args:
        curated: The object returned by ``yaml.safe_load`` for the curated
            file. Anything other than a dict yields an empty set.
        section: Name of the top-level key to read (e.g. ``'hyponym'``).
        keys: Item keys whose values are inspected for a Q-number.

    Returns:
        A set of strings matching ``Q<digits>``. Missing sections, non-list
        sections, non-dict items and non-string or malformed values are
        skipped instead of raising.
    """
    found = set()
    if not isinstance(curated, dict):
        return found
    entries = curated.get(section)
    if not isinstance(entries, list):
        return found
    for item in entries:
        if not isinstance(item, dict):
            continue
        for key in keys:
            value = item.get(key)
            if not isinstance(value, str):
                continue
            value = value.strip()
            # Only real identifiers may be interpolated into the query as
            # wd:<id>; a label that merely starts with "Q" would corrupt it.
            if _QID_PATTERN.fullmatch(value):
                found.add(value)
    return found


def _build_filter_clauses(q_ids, chunk_size=50):
    """Return one ``FILTER(?class NOT IN (...))`` line per chunk of *q_ids*.

    Args:
        q_ids: Ordered sequence of Q-number strings.
        chunk_size: Maximum number of entities per FILTER clause.

    Returns:
        A list of SPARQL FILTER lines; empty when *q_ids* is empty.
    """
    clauses = []
    for start in range(0, len(q_ids), chunk_size):
        entities = ", ".join(f"wd:{q}" for q in q_ids[start:start + chunk_size])
        clauses.append(f" FILTER(?class NOT IN ({entities}))")
    return clauses


def _q_id_from_binding(binding):
    """Return the last path segment of the binding's ``class`` URI, or ''."""
    return binding.get('class', {}).get('value', '').split('/')[-1]


# Extract all Q-numbers from curated file
print("Loading curated Q-numbers...")
# Explicit encoding so the YAML is decoded the same way on every platform.
with open(CURATED_FILE, 'r', encoding='utf-8') as f:
    curated = yaml.safe_load(f)

q_numbers = set()
q_numbers |= _collect_q_numbers(curated, 'hyponym')
# The entity section also reads 'labels', a typo for 'label' in the data.
q_numbers |= _collect_q_numbers(curated, 'entity', keys=('label', 'labels'))
q_numbers |= _collect_q_numbers(curated, 'exclude')

print(f"✓ Loaded {len(q_numbers)} Q-numbers to exclude")

# Build FILTER clause dynamically
# Split into chunks of 50 to avoid SPARQL syntax limits
q_list = sorted(q_numbers)
chunk_size = 50
filter_clauses = _build_filter_clauses(q_list, chunk_size)
filter_clause_full = "\n".join(filter_clauses)

# Quoted, comma-separated form for FILTER(LANG(...) IN (...)) and the plain
# comma-separated form expected by the label service parameter.
lang_in = ", ".join(f'"{code}"' for code in LANGUAGES)
lang_service = ",".join(LANGUAGES)

# Build SPARQL query
# NOTE(review): rows are grouped by (?class, ?classLabel), so a class with
# labels in several of the listed languages presumably yields several rows;
# the duplicate check further down reports those.
sparql_query = f"""# GLAMORCUBEPSXHF Vocabulary Extraction - CORRECTED
# Class: A (ARCHIVE)
# Excludes: ALL {len(q_numbers)} Q-numbers from hyponyms_curated.yaml
# Date: 2025-11-13
# Changes: Added DISTINCT, excluded all curated Q-numbers

SELECT DISTINCT ?class ?classLabel (GROUP_CONCAT(DISTINCT ?altLabel; separator=" | ") as ?altLabels) WHERE {{
  {{
    # Archive (general)
    ?class wdt:P279+ wd:Q166118 .
  }}

  # Get labels in priority languages (40+ languages for archive vocabulary)
  OPTIONAL {{
    ?class rdfs:label ?classLabel .
    FILTER(LANG(?classLabel) IN ({lang_in}))
  }}

  # Get alternative labels (aliases)
  OPTIONAL {{
    ?class skos:altLabel ?altLabel .
    FILTER(LANG(?altLabel) IN ({lang_in}))
  }}

  # ============================================
  # EXCLUDE ALL {len(q_numbers)} Q-NUMBERS FROM hyponyms_curated.yaml
  # ============================================
{filter_clause_full}

  # Automatic label service (fallback)
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "{lang_service}".
  }}
}}
GROUP BY ?class ?classLabel
ORDER BY ?classLabel
LIMIT 5000
"""

print(f"\n✓ Built SPARQL query ({len(sparql_query)} characters)")
print(f"✓ Filter clauses: {len(filter_clauses)} chunks")

# Execute query using POST method
print("\nExecuting SPARQL query via POST method...")
endpoint = "https://query.wikidata.org/sparql"
# NOTE(review): no custom User-Agent is set here; confirm the library default
# is acceptable for the endpoint's usage policy.
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)  # Use POST to handle large query

start_time = time.time()
try:
    results = sparql.query().convert()
    elapsed = time.time() - start_time
    print(f"✓ Query completed in {elapsed:.2f} seconds")

    # Save results
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Calculate statistics
    bindings = results.get('results', {}).get('bindings', [])
    # Rows per Q-ID; bindings without a class URI are left out so they are
    # neither counted as a class nor reported as a duplicate.
    class_counts = Counter(
        q_id for q_id in map(_q_id_from_binding, bindings) if q_id
    )
    unique_classes = set(class_counts)

    print(f"\n✓ Results saved to: {OUTPUT_FILE}")
    print(f"✓ File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"✓ Total bindings (with language variants): {len(bindings)}")
    print(f"✓ Unique archive classes: {len(unique_classes)}")

    # Check for duplicates in results
    duplicates = {q: count for q, count in class_counts.items() if count > 1}
    if duplicates:
        print(f"\n⚠️ WARNING: Found {len(duplicates)} Q-IDs with multiple rows:")
        for q, count in sorted(duplicates.items())[:10]:
            print(f" {q}: {count} rows")
    else:
        print(f"\n✓ No duplicates found (all Q-IDs appear exactly once)")

    # Sample first 10 results
    print(f"\nFirst 10 results:")
    for i, binding in enumerate(bindings[:10], 1):
        class_label = binding.get('classLabel', {}).get('value', '')
        q_id = _q_id_from_binding(binding)
        print(f" {i}. {q_id}: {class_label}")

except Exception as e:
    # Report the failure for the operator, then re-raise so the process
    # still exits with a traceback and a non-zero status.
    print(f"\n❌ ERROR: {e}")
    raise

print("\n✅ Query execution complete!")