#!/usr/bin/env python3 """ Generate Botanical/Zoo (B-class) SPARQL query with automated Q-number exclusions. This script: 1. Reads hyponyms_curated.yaml to extract all Q-numbers 2. Generates FILTER chunks (50 Q-numbers per chunk) 3. Creates a complete SPARQL query with all exclusions 4. Saves both the query and metadata YAML Usage: python scripts/generate_botanical_query_with_exclusions.py Input: - data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml Output: - data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_.sparql - data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_.yaml """ import yaml import re from datetime import datetime, timezone from pathlib import Path from typing import List, Set # Base SPARQL query template with 27 base classes SPARQL_TEMPLATE = """# Botanical Garden & Zoo (B) Class - Missing Hyponym Discovery Query # Generated: {timestamp} # Excludes {q_count} Q-numbers from hyponyms_curated.yaml # Base classes: 27 (botanical gardens, zoos, aquariums, arboreta, etc.) # # IMPORTANT: This query finds hyponyms of the base classes specified below. # The FILTER statements exclude curated Q-numbers from RESULTS, not from traversal. # Results are capped at 10,000 to prevent timeout. SELECT DISTINCT ?hyponym ?hyponymLabel ?hyponymAltLabel WHERE {{ {{ # Botanical gardens - Q167346 ?hyponym wdt:P279+ wd:Q167346 . }} UNION {{ # Zoos - Q43501 ?hyponym wdt:P279+ wd:Q43501 . }} UNION {{ # Aquariums - Q2281788 ?hyponym wdt:P279+ wd:Q2281788 . }} UNION {{ # Arboreta - Q7712619 ?hyponym wdt:P279+ wd:Q7712619 . }} UNION {{ # Herbarium - Q181916 ?hyponym wdt:P279+ wd:Q181916 . }} UNION {{ # Natural history museums - Q1970365 ?hyponym wdt:P279+ wd:Q1970365 . }} UNION {{ # Wildlife reserves - Q20268591 ?hyponym wdt:P279+ wd:Q20268591 . }} UNION {{ # Nature reserves - Q179049 ?hyponym wdt:P279+ wd:Q179049 . }} UNION {{ # National parks - Q46169 ?hyponym wdt:P279+ wd:Q46169 . }} UNION {{ # Protected areas - Q473972 ?hyponym wdt:P279+ wd:Q473972 . }} UNION {{ # Biosphere reserves - Q158454 ?hyponym wdt:P279+ wd:Q158454 . }} UNION {{ # Safari parks - Q21164403 ?hyponym wdt:P279+ wd:Q21164403 . }} UNION {{ # Safari parks - Q9480202 ?hyponym wdt:P279+ wd:Q9480202 . }} UNION {{ # Wildlife sanctuaries - Q8085554 ?hyponym wdt:P279+ wd:Q8085554 . }} UNION {{ # Marine reserve - Q2616170 ?hyponym wdt:P279+ wd:Q2616170 . }} UNION {{ # Conservation areas - Q936257 ?hyponym wdt:P279+ wd:Q936257 . }} UNION {{ # Seed banks - Q1426613 ?hyponym wdt:P279+ wd:Q1426613 . }} UNION {{ # Biorepository - Q4915239 ?hyponym wdt:P279+ wd:Q4915239 . }} UNION {{ # Natural history collection - Q2982911 ?hyponym wdt:P279+ wd:Q2982911 . }} UNION {{ # Gene bank - Q1905347 ?hyponym wdt:P279+ wd:Q1905347 . }} UNION {{ # Biobank - Q864217 ?hyponym wdt:P279+ wd:Q864217 . }} UNION {{ # Soilbank - Q2189151 ?hyponym wdt:P279+ wd:Q2189151 . }} UNION {{ # Herbaria - Q8508664 ?hyponym wdt:P279+ wd:Q8508664 . }} UNION {{ # Culture collections - Q11489453 ?hyponym wdt:P279+ wd:Q11489453 . }} UNION {{ # Natural monuments - Q23790 ?hyponym wdt:P279+ wd:Q23790 . }} UNION {{ # Natural heritage - Q386426 ?hyponym wdt:P279+ wd:Q386426 . }} UNION {{ # Natural heritage - Q526826 ?hyponym wdt:P279+ wd:Q526826 . }} # FILTER: Exclude ALL curated Q-numbers from results {filter_chunks} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr,sk,sl". }} }} ORDER BY ?hyponymLabel LIMIT 10000 """ METADATA_TEMPLATE = """# Botanical Query Metadata # Generated: {timestamp} query_file: botanical_query_updated_{timestamp_file}.sparql generated_at: {timestamp} base_classes_count: 27 excluded_q_numbers_count: {q_count} filter_chunks_count: {chunk_count} source_file: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml base_classes: - Q167346 # Botanical gardens - Q43501 # Zoos - Q2281788 # Aquariums - Q7712619 # Arboreta - Q181916 # Herbarium - Q1970365 # Natural history museums - Q20268591 # Wildlife reserves - Q179049 # Nature reserves - Q46169 # National parks - Q473972 # Protected areas - Q158454 # Biosphere reserves - Q21164403 # Safari parks - Q9480202 # Safari parks (alt) - Q8085554 # Wildlife sanctuaries - Q2616170 # Marine reserve - Q936257 # Conservation areas - Q1426613 # Seed banks - Q4915239 # Biorepository - Q2982911 # Natural history collection - Q1905347 # Gene bank - Q864217 # Biobank - Q2189151 # Soilbank - Q8508664 # Herbaria - Q11489453 # Culture collections - Q23790 # Natural monuments - Q386426 # Natural heritage - Q526826 # Natural heritage (alt) extraction_notes: | Q-numbers extracted from hyponyms_curated.yaml 'hypernym' section. Each Q-number in the 'label' field is excluded from query results. FILTER statements organized in chunks of 50 Q-numbers for query optimization. """ def extract_q_numbers_from_yaml(yaml_path: Path) -> Set[str]: """ Extract all Q-numbers from hyponyms_curated.yaml using regex. Extracts Q-numbers from two sources: 1. 'label:' fields - primary Q-number for the entity 2. 'duplicate:' fields - alternative Q-numbers that refer to the same entity This approach is more robust than YAML parsing since the file has formatting issues. Args: yaml_path: Path to hyponyms_curated.yaml Returns: Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...}) """ with open(yaml_path, 'r', encoding='utf-8') as f: content = f.read() q_numbers = set() # Pattern 1: Extract from "label: Q" lines # This captures Q-numbers in the format: # - label: Q12345 label_pattern = r'^\s*-?\s*label:\s+(Q\d+)' for match in re.finditer(label_pattern, content, re.MULTILINE): q_number = match.group(1).strip() q_numbers.add(q_number) # Pattern 2: Extract from "duplicate:" lists # This captures Q-numbers in the format: # duplicate: # - Q31838911 # - Q98765432 duplicate_pattern = r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)' for match in re.finditer(duplicate_pattern, content, re.MULTILINE): duplicate_block = match.group(1) # Extract individual Q-numbers from the duplicate block q_in_block = re.findall(r'Q\d+', duplicate_block) q_numbers.update(q_in_block) return q_numbers def chunk_q_numbers(q_numbers: Set[str], chunk_size: int = 50) -> List[List[str]]: """ Split Q-numbers into chunks for FILTER statements. Args: q_numbers: Set of Q-numbers chunk_size: Number of Q-numbers per chunk (default: 50) Returns: List of Q-number chunks (each chunk is a list) """ sorted_q_numbers = sorted(q_numbers, key=lambda x: int(x[1:])) # Sort by numeric part chunks = [] for i in range(0, len(sorted_q_numbers), chunk_size): chunk = sorted_q_numbers[i:i + chunk_size] chunks.append(chunk) return chunks def generate_filter_chunks(chunks: List[List[str]]) -> str: """ Generate FILTER statements from Q-number chunks. Args: chunks: List of Q-number chunks Returns: FILTER statements as string """ filter_lines = [] for chunk in chunks: q_list = ', '.join([f'wd:{q}' for q in chunk]) filter_line = f' FILTER(?hyponym NOT IN ({q_list}))' filter_lines.append(filter_line) return '\n'.join(filter_lines) def generate_query(yaml_path: Path, output_dir: Path) -> tuple[Path, Path]: """ Generate SPARQL query with exclusions from hyponyms_curated.yaml. Args: yaml_path: Path to hyponyms_curated.yaml output_dir: Output directory for query files Returns: Tuple of (query_file_path, metadata_file_path) """ # Extract Q-numbers print(f"Reading Q-numbers from {yaml_path}") q_numbers = extract_q_numbers_from_yaml(yaml_path) print(f"Extracted {len(q_numbers)} Q-numbers") # Create chunks chunks = chunk_q_numbers(q_numbers, chunk_size=50) print(f"Created {len(chunks)} FILTER chunks") # Generate FILTER statements filter_chunks = generate_filter_chunks(chunks) # Generate timestamp now = datetime.now(timezone.utc) timestamp = now.strftime('%Y-%m-%dT%H:%M:%S+00:00') timestamp_file = now.strftime('%Y%m%dT%H%M%S') # Generate SPARQL query sparql_content = SPARQL_TEMPLATE.format( timestamp=timestamp, q_count=len(q_numbers), filter_chunks=filter_chunks ) # Generate metadata YAML metadata_content = METADATA_TEMPLATE.format( timestamp=timestamp, timestamp_file=timestamp_file, q_count=len(q_numbers), chunk_count=len(chunks) ) # Write files output_dir.mkdir(parents=True, exist_ok=True) query_file = output_dir / f'botanical_query_updated_{timestamp_file}.sparql' metadata_file = output_dir / f'botanical_query_updated_{timestamp_file}.yaml' with open(query_file, 'w', encoding='utf-8') as f: f.write(sparql_content) print(f"✅ Wrote query: {query_file}") with open(metadata_file, 'w', encoding='utf-8') as f: f.write(metadata_content) print(f"✅ Wrote metadata: {metadata_file}") return query_file, metadata_file def main(): """Main entry point.""" # Define paths project_root = Path(__file__).parent.parent yaml_path = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'hyponyms_curated.yaml' output_dir = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'B' / 'queries' # Validate input file exists if not yaml_path.exists(): print(f"❌ Error: {yaml_path} not found") return # Generate query print("=" * 60) print("BOTANICAL QUERY GENERATOR") print("=" * 60) query_file, metadata_file = generate_query(yaml_path, output_dir) print("=" * 60) print("✅ Query generation complete!") print(f"Query file: {query_file}") print(f"Metadata file: {metadata_file}") print("=" * 60) if __name__ == '__main__': main()