#!/usr/bin/env python3 """ Generate Gallery (G-class) SPARQL query with automated Q-number exclusions. This script: 1. Reads hyponyms_curated.yaml to extract all Q-numbers 2. Generates FILTER chunks (50 Q-numbers per chunk) 3. Creates a complete SPARQL query with all exclusions 4. Saves both the query and metadata YAML Usage: python scripts/generate_gallery_query_with_exclusions.py Input: - data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml Output: - data/wikidata/GLAMORCUBEPSXHFN/G/queries/gallery_query_updated_.sparql - data/wikidata/GLAMORCUBEPSXHFN/G/queries/gallery_query_updated_.yaml """ import re from datetime import datetime, timezone from pathlib import Path from typing import List, Set # Base SPARQL query template with 14 VERIFIED base classes for galleries # Updated 2025-11-16 based on Wikidata MCP + Exa verification SPARQL_TEMPLATE = """# Gallery (G) Class - Missing Hyponym Discovery Query # Generated: {timestamp} # Excludes {q_count} Q-numbers from hyponyms_curated.yaml # Base classes: 14 VERIFIED (art galleries, institutions, specialized types, alternative spaces) # # VERIFICATION: All Q-numbers verified via Wikidata MCP tool on 2025-11-16 # See: data/wikidata/GLAMORCUBEPSXHFN/G/VERIFIED_Q_NUMBERS.md # # IMPORTANT: This query finds hyponyms of the base classes specified below. # The FILTER statements exclude curated Q-numbers from RESULTS, not from traversal. # Results are capped at 10,000 to prevent timeout. SELECT DISTINCT ?hyponym ?hyponymLabel ?hyponymAltLabel WHERE {{ {{ # Core gallery types (Priority 1) # Art gallery (space/building) - Q1007870 ?hyponym wdt:P279+ wd:Q1007870 . }} UNION {{ # Art gallery (institution/organization) - Q98818526 ?hyponym wdt:P279+ wd:Q98818526 . }} UNION {{ # Art museum (institution) - Q3196771 ?hyponym wdt:P279+ wd:Q3196771 . }} UNION {{ # Art museum (building/space) - Q207694 ?hyponym wdt:P279+ wd:Q207694 . }} UNION {{ # Specialized Gallery Types (Priority 2) # Kunsthalle (temporary exhibitions) - Q1475403 ?hyponym wdt:P279+ wd:Q1475403 . }} UNION {{ # Commercial art gallery - Q56856618 ?hyponym wdt:P279+ wd:Q56856618 . }} UNION {{ # Contemporary art gallery - Q16038801 ?hyponym wdt:P279+ wd:Q16038801 . }} UNION {{ # Noncommercial art gallery - Q67165238 ?hyponym wdt:P279+ wd:Q67165238 . }} UNION {{ # Institutional Variants (Priority 3) # National gallery - Q3844310 ?hyponym wdt:P279+ wd:Q3844310 . }} UNION {{ # Artist-run space - Q4034417 ?hyponym wdt:P279+ wd:Q4034417 . }} UNION {{ # Alternative exhibition space - Q16002704 ?hyponym wdt:P279+ wd:Q16002704 . }} UNION {{ # Physical Variants (Priority 4) # Sculpture garden - Q1759852 ?hyponym wdt:P279+ wd:Q1759852 . }} UNION {{ # Contemporary art museum (type) - Q108860927 ?hyponym wdt:P279+ wd:Q108860927 . }} UNION {{ # Online art gallery - Q7094057 ?hyponym wdt:P279+ wd:Q7094057 . }} # FILTER: Exclude ALL curated Q-numbers from results {filter_chunks} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr,sk,sl". }} }} ORDER BY ?hyponymLabel LIMIT 10000 """ METADATA_TEMPLATE = """# Gallery Query Metadata # Generated: {timestamp} # Updated: 2025-11-16 with VERIFIED Q-numbers query_file: gallery_query_updated_{timestamp_file}.sparql generated_at: {timestamp} base_classes_count: 14 excluded_q_numbers_count: {q_count} filter_chunks_count: {chunk_count} source_file: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml verification_report: data/wikidata/GLAMORCUBEPSXHFN/G/VERIFIED_Q_NUMBERS.md base_classes: # Core gallery types (Priority 1) - Q1007870 # Art gallery (space/building) - Q98818526 # Art gallery (institution/organization) - Q3196771 # Art museum (institution) - Q207694 # Art museum (building/space) # Specialized Gallery Types (Priority 2) - Q1475403 # Kunsthalle (temporary exhibitions) - Q56856618 # Commercial art gallery - Q16038801 # Contemporary art gallery - Q67165238 # Noncommercial art gallery # Institutional Variants (Priority 3) - Q3844310 # National gallery - Q4034417 # Artist-run space - Q16002704 # Alternative exhibition space # Physical Variants (Priority 4) - Q1759852 # Sculpture garden - Q108860927 # Contemporary art museum (type) - Q7094057 # Online art gallery verification_notes: | All Q-numbers verified via Wikidata MCP tool on 2025-11-16. Original 7 base classes expanded to 14 after verification discovered: - 5 INCORRECT Q-numbers (removed) - 8 NEW relevant Q-numbers (added) Key discoveries: - Institution vs. building distinction (Q98818526 vs Q1007870) - Artist-run and alternative spaces (Q4034417, Q16002704) - National galleries as distinct type (Q3844310) - Online/digital galleries (Q7094057) extraction_notes: | Q-numbers extracted from hyponyms_curated.yaml. Includes both 'label:' fields (primary Q-numbers) and 'duplicate:' fields (alternative IDs). FILTER statements organized in chunks of 50 Q-numbers for query optimization. """ def extract_q_numbers_from_yaml(yaml_path: Path) -> Set[str]: """ Extract all Q-numbers from hyponyms_curated.yaml using regex. Extracts Q-numbers from two sources: 1. 'label:' fields - primary Q-number for the entity 2. 'duplicate:' fields - alternative Q-numbers that refer to the same entity This approach is more robust than YAML parsing since the file has formatting issues. Args: yaml_path: Path to hyponyms_curated.yaml Returns: Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...}) """ with open(yaml_path, 'r', encoding='utf-8') as f: content = f.read() q_numbers = set() # Pattern 1: Extract from "label: Q" lines label_pattern = r'^\s*-?\s*label:\s+(Q\d+)' for match in re.finditer(label_pattern, content, re.MULTILINE): q_number = match.group(1).strip() q_numbers.add(q_number) # Pattern 2: Extract from "duplicate:" lists duplicate_pattern = r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)' for match in re.finditer(duplicate_pattern, content, re.MULTILINE): duplicate_block = match.group(1) # Extract individual Q-numbers from the duplicate block q_in_block = re.findall(r'Q\d+', duplicate_block) q_numbers.update(q_in_block) return q_numbers def chunk_q_numbers(q_numbers: Set[str], chunk_size: int = 50) -> List[List[str]]: """ Split Q-numbers into chunks for FILTER statements. Args: q_numbers: Set of Q-numbers chunk_size: Number of Q-numbers per chunk (default: 50) Returns: List of Q-number chunks (each chunk is a list) """ sorted_q_numbers = sorted(q_numbers, key=lambda x: int(x[1:])) # Sort by numeric part chunks = [] for i in range(0, len(sorted_q_numbers), chunk_size): chunk = sorted_q_numbers[i:i + chunk_size] chunks.append(chunk) return chunks def generate_filter_chunks(chunks: List[List[str]]) -> str: """ Generate FILTER statements from Q-number chunks. Args: chunks: List of Q-number chunks Returns: FILTER statements as string """ filter_lines = [] for chunk in chunks: q_list = ', '.join([f'wd:{q}' for q in chunk]) filter_line = f' FILTER(?hyponym NOT IN ({q_list}))' filter_lines.append(filter_line) return '\n'.join(filter_lines) def generate_query(yaml_path: Path, output_dir: Path) -> tuple[Path, Path]: """ Generate SPARQL query with exclusions from hyponyms_curated.yaml. Args: yaml_path: Path to hyponyms_curated.yaml output_dir: Output directory for query files Returns: Tuple of (query_file_path, metadata_file_path) """ # Extract Q-numbers print(f"Reading Q-numbers from {yaml_path}") q_numbers = extract_q_numbers_from_yaml(yaml_path) print(f"Extracted {len(q_numbers)} Q-numbers") # Create chunks chunks = chunk_q_numbers(q_numbers, chunk_size=50) print(f"Created {len(chunks)} FILTER chunks") # Generate FILTER statements filter_chunks = generate_filter_chunks(chunks) # Generate timestamp now = datetime.now(timezone.utc) timestamp = now.strftime('%Y-%m-%dT%H:%M:%S+00:00') timestamp_file = now.strftime('%Y%m%dT%H%M%S') # Generate SPARQL query sparql_content = SPARQL_TEMPLATE.format( timestamp=timestamp, q_count=len(q_numbers), filter_chunks=filter_chunks ) # Generate metadata YAML metadata_content = METADATA_TEMPLATE.format( timestamp=timestamp, timestamp_file=timestamp_file, q_count=len(q_numbers), chunk_count=len(chunks) ) # Write files output_dir.mkdir(parents=True, exist_ok=True) query_file = output_dir / f'gallery_query_updated_{timestamp_file}.sparql' metadata_file = output_dir / f'gallery_query_updated_{timestamp_file}.yaml' with open(query_file, 'w', encoding='utf-8') as f: f.write(sparql_content) print(f"✅ Wrote query: {query_file}") with open(metadata_file, 'w', encoding='utf-8') as f: f.write(metadata_content) print(f"✅ Wrote metadata: {metadata_file}") return query_file, metadata_file def main(): """Main entry point.""" # Define paths project_root = Path(__file__).parent.parent yaml_path = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'hyponyms_curated.yaml' output_dir = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'G' / 'queries' # Validate input file exists if not yaml_path.exists(): print(f"❌ Error: {yaml_path} not found") return # Generate query print("=" * 60) print("GALLERY (G-CLASS) QUERY GENERATOR") print("=" * 60) query_file, metadata_file = generate_query(yaml_path, output_dir) print("=" * 60) print("✅ Query generation complete!") print(f"Query file: {query_file}") print(f"Metadata file: {metadata_file}") print("=" * 60) if __name__ == '__main__': main()