#!/usr/bin/env python3
"""
Execute corrected Archive SPARQL query with:

1. DISTINCT to prevent duplicate outputs
2. All 318 Q-numbers from hyponyms_curated.yaml excluded
3. POST method to handle large query
"""
import json
import time
from collections import Counter
from pathlib import Path

import yaml
from SPARQLWrapper import JSON, POST, SPARQLWrapper
# Paths: both files live under the same GLAM wikidata data directory.
_DATA_DIR = Path("/Users/kempersc/apps/glam/data/wikidata/GLAMORCUBEPSXHFN")
CURATED_FILE = _DATA_DIR / "hyponyms_curated.yaml"
OUTPUT_FILE = _DATA_DIR / "A" / "hyponyms_corrected_new.json"
# Extract all Q-numbers from curated file
print("Loading curated Q-numbers...")
with open(CURATED_FILE, 'r') as f:
    # An empty YAML file parses to None; fall back to an empty mapping so
    # the section lookups below stay safe.
    curated = yaml.safe_load(f) or {}


def _collect_q_ids(items, keys=('label',)):
    """Return the set of Q-identifiers found in a curated section.

    items: list of mappings from one section of hyponyms_curated.yaml.
    keys:  mapping keys to inspect in each item (default: just 'label').

    Non-string values and values not starting with 'Q' are ignored, so a
    malformed entry cannot crash the extraction.
    """
    found = set()
    for item in items:
        for key in keys:
            value = item.get(key, '')
            if isinstance(value, str) and value.startswith('Q'):
                found.add(value)
    return found


q_numbers = set()
# 'hyponym' and 'exclude' entries store the Q-ID under 'label'; the
# 'entity' section sometimes uses the typo key 'labels' as well, so both
# keys are checked there (same behavior as the original per-section loops).
q_numbers |= _collect_q_ids(curated.get('hyponym', []))
q_numbers |= _collect_q_ids(curated.get('entity', []), keys=('label', 'labels'))
q_numbers |= _collect_q_ids(curated.get('exclude', []))

print(f"✓ Loaded {len(q_numbers)} Q-numbers to exclude")
# Build FILTER clause dynamically.
# Split into chunks of 50 Q-IDs per NOT IN list to avoid SPARQL syntax limits.
chunk_size = 50
q_list = sorted(q_numbers)
filter_clauses = [
    "  FILTER(?class NOT IN ({}))".format(
        ", ".join("wd:" + q for q in q_list[start:start + chunk_size])
    )
    for start in range(0, len(q_list), chunk_size)
]
filter_clause_full = "\n".join(filter_clauses)
# Build SPARQL query. The leading '#' lines inside the string are SPARQL
# comments sent to the endpoint (useful in server-side query logs).
# NOTE(review): ?classLabel is both selected and listed in GROUP BY, so a
# class labelled in several languages can still produce one row per
# language — presumably the source of the duplicate rows the script warns
# about further below; confirm whether one-row-per-class is required.
sparql_query = f"""# GLAMORCUBEPSXHF Vocabulary Extraction - CORRECTED
# Class: A (ARCHIVE)
# Excludes: ALL {len(q_numbers)} Q-numbers from hyponyms_curated.yaml
# Date: 2025-11-13
# Changes: Added DISTINCT, excluded all curated Q-numbers

SELECT DISTINCT ?class ?classLabel
  (GROUP_CONCAT(DISTINCT ?altLabel; separator=" | ") as ?altLabels)
WHERE {{
  {{
    # Archive (general)
    ?class wdt:P279+ wd:Q166118 .
  }}

  # Get labels in priority languages (40+ languages for archive vocabulary)
  OPTIONAL {{
    ?class rdfs:label ?classLabel .
    FILTER(LANG(?classLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
                                 "hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
                                 "uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
                                 "el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
  }}

  # Get alternative labels (aliases)
  OPTIONAL {{
    ?class skos:altLabel ?altLabel .
    FILTER(LANG(?altLabel) IN ("en", "es", "fr", "de", "nl", "pt", "ar", "zh", "ja", "ru",
                               "hi", "id", "ms", "th", "vi", "ko", "tr", "fa", "pl", "it",
                               "uk", "sv", "cs", "he", "bn", "mr", "ta", "te", "ur", "pa",
                               "el", "ro", "hu", "da", "no", "fi", "ca", "sr", "bg", "hr"))
  }}

  # ============================================
  # EXCLUDE ALL {len(q_numbers)} Q-NUMBERS FROM hyponyms_curated.yaml
  # ============================================

{filter_clause_full}

  # Automatic label service (fallback)
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr". }}
}}
GROUP BY ?class ?classLabel
ORDER BY ?classLabel
LIMIT 5000
"""

print(f"\n✓ Built SPARQL query ({len(sparql_query)} characters)")
print(f"✓ Filter clauses: {len(filter_clauses)} chunks")
# Execute query using POST method
print("\nExecuting SPARQL query via POST method...")
endpoint = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)  # Use POST to handle large query (too big for a GET URL)
# Wikimedia's User-Agent policy requires automated clients to identify
# themselves; the generic default agent risks throttling or blocking.
sparql.agent = "GLAMVocabularyExtractor/1.0 (hyponyms corrected-query script)"
# Fail with an explicit error instead of hanging forever on a stalled
# endpoint (WDQS enforces a 60s server-side limit; allow network slack).
sparql.setTimeout(120)

start_time = time.time()
# Execute the query and post-process the response. Everything that depends
# on the network result lives in one try block; failures are reported and
# re-raised so the process exits non-zero.
try:
    results = sparql.query().convert()
    elapsed = time.time() - start_time

    print(f"✓ Query completed in {elapsed:.2f} seconds")

    # Save results
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Calculate statistics: count result rows per Q-ID in a single pass;
    # the unique-class count and the duplicate report both derive from it.
    bindings = results.get('results', {}).get('bindings', [])
    class_counts = Counter(
        binding.get('class', {}).get('value', '').split('/')[-1]
        for binding in bindings
    )
    # Bindings with no 'class' value collapse to '' — exclude from uniques.
    unique_classes = {q_id for q_id in class_counts if q_id}

    print(f"\n✓ Results saved to: {OUTPUT_FILE}")
    print(f"✓ File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"✓ Total bindings (with language variants): {len(bindings)}")
    print(f"✓ Unique archive classes: {len(unique_classes)}")

    # Check for duplicates: the same Q-ID appearing in several rows
    # (typically one row per label language — see the query's GROUP BY).
    duplicates = {q: count for q, count in class_counts.items() if count > 1}
    if duplicates:
        print(f"\n⚠️ WARNING: Found {len(duplicates)} Q-IDs with multiple rows:")
        for q, count in sorted(duplicates.items())[:10]:
            print(f"  {q}: {count} rows")
    else:
        print("\n✓ No duplicates found (all Q-IDs appear exactly once)")

    # Sample first 10 results
    print("\nFirst 10 results:")
    for i, binding in enumerate(bindings[:10], 1):
        class_uri = binding.get('class', {}).get('value', '')
        class_label = binding.get('classLabel', {}).get('value', '')
        q_id = class_uri.split('/')[-1]
        print(f"  {i}. {q_id}: {class_label}")

except Exception as e:
    print(f"\n❌ ERROR: {e}")
    raise

print("\n✅ Query execution complete!")