# glam/scripts/generate_botanical_query_with_exclusions.py

#!/usr/bin/env python3
"""
Generate Botanical/Zoo (B-class) SPARQL query with automated Q-number exclusions.

This script:
1. Reads hyponyms_curated.yaml to extract all Q-numbers
2. Generates FILTER chunks (50 Q-numbers per chunk)
3. Creates a complete SPARQL query with all exclusions
4. Saves both the query and metadata YAML

Usage:
    python scripts/generate_botanical_query_with_exclusions.py

Input:
    - data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml

Output:
    - data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_<timestamp>.sparql
    - data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_<timestamp>.yaml
"""

import yaml
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Set

# Base SPARQL query template with 27 base classes.
#
# The template is rendered with str.format() in generate_query(), which
# supplies three placeholders:
#   {timestamp}     - generation time, written into the header comment
#   {q_count}       - number of excluded Q-numbers, written into the header comment
#   {filter_chunks} - the pre-rendered FILTER(... NOT IN ...) lines
# Every literal SPARQL brace is doubled ("{{" / "}}") so that str.format()
# emits a single brace instead of treating it as a placeholder.
# Lines starting with '#' INSIDE the string are SPARQL comments and are part
# of the generated query text.
SPARQL_TEMPLATE = """# Botanical Garden & Zoo (B) Class - Missing Hyponym Discovery Query
# Generated: {timestamp}
# Excludes {q_count} Q-numbers from hyponyms_curated.yaml
# Base classes: 27 (botanical gardens, zoos, aquariums, arboreta, etc.)
#
# IMPORTANT: This query finds hyponyms of the base classes specified below.
# The FILTER statements exclude curated Q-numbers from RESULTS, not from traversal.
# Results are capped at 10,000 to prevent timeout.

SELECT DISTINCT ?hyponym ?hyponymLabel ?hyponymAltLabel WHERE {{
  {{
    # Botanical gardens - Q167346
    ?hyponym wdt:P279+ wd:Q167346 .
  }} UNION {{
    # Zoos - Q43501
    ?hyponym wdt:P279+ wd:Q43501 .
  }} UNION {{
    # Aquariums - Q2281788
    ?hyponym wdt:P279+ wd:Q2281788 .
  }} UNION {{
    # Arboreta - Q7712619
    ?hyponym wdt:P279+ wd:Q7712619 .
  }} UNION {{
    # Herbarium - Q181916
    ?hyponym wdt:P279+ wd:Q181916 .
  }} UNION {{
    # Natural history museums - Q1970365
    ?hyponym wdt:P279+ wd:Q1970365 .
  }} UNION {{
    # Wildlife reserves - Q20268591
    ?hyponym wdt:P279+ wd:Q20268591 .
  }} UNION {{
    # Nature reserves - Q179049
    ?hyponym wdt:P279+ wd:Q179049 .
  }} UNION {{
    # National parks - Q46169
    ?hyponym wdt:P279+ wd:Q46169 .
  }} UNION {{
    # Protected areas - Q473972
    ?hyponym wdt:P279+ wd:Q473972 .
  }} UNION {{
    # Biosphere reserves - Q158454
    ?hyponym wdt:P279+ wd:Q158454 .
  }} UNION {{
    # Safari parks - Q21164403
    ?hyponym wdt:P279+ wd:Q21164403 .
  }} UNION {{
    # Safari parks - Q9480202
    ?hyponym wdt:P279+ wd:Q9480202 .
  }} UNION {{
    # Wildlife sanctuaries - Q8085554
    ?hyponym wdt:P279+ wd:Q8085554 .
  }} UNION {{
    # Marine reserve - Q2616170
    ?hyponym wdt:P279+ wd:Q2616170 .
  }} UNION {{
    # Conservation areas - Q936257
    ?hyponym wdt:P279+ wd:Q936257 .
  }} UNION {{
    # Seed banks - Q1426613
    ?hyponym wdt:P279+ wd:Q1426613 .
  }} UNION {{
    # Biorepository - Q4915239
    ?hyponym wdt:P279+ wd:Q4915239 .
  }} UNION {{
    # Natural history collection - Q2982911
    ?hyponym wdt:P279+ wd:Q2982911 .
  }} UNION {{
    # Gene bank - Q1905347
    ?hyponym wdt:P279+ wd:Q1905347 .
  }} UNION {{
    # Biobank - Q864217
    ?hyponym wdt:P279+ wd:Q864217 .
  }} UNION {{
    # Soilbank - Q2189151
    ?hyponym wdt:P279+ wd:Q2189151 .
  }} UNION {{
    # Herbaria - Q8508664
    ?hyponym wdt:P279+ wd:Q8508664 .
  }} UNION {{
    # Culture collections - Q11489453
    ?hyponym wdt:P279+ wd:Q11489453 .
  }} UNION {{
    # Natural monuments - Q23790
    ?hyponym wdt:P279+ wd:Q23790 .
  }} UNION {{
    # Natural heritage - Q386426
    ?hyponym wdt:P279+ wd:Q386426 .
  }} UNION {{
    # Natural heritage - Q526826
    ?hyponym wdt:P279+ wd:Q526826 .
  }}

  # FILTER: Exclude ALL curated Q-numbers from results
{filter_chunks}

  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr,sk,sl".
  }}
}}
ORDER BY ?hyponymLabel
LIMIT 10000
"""

# Metadata YAML template written next to each generated query.
#
# Rendered with str.format() in generate_query(), which supplies:
#   {timestamp}      - generation time in ISO-8601 form
#   {timestamp_file} - compact timestamp used in the output file names
#   {q_count}        - number of excluded Q-numbers
#   {chunk_count}    - number of FILTER statements in the query
# The base_classes list mirrors the 27 UNION branches of SPARQL_TEMPLATE.
# The extraction notes describe BOTH sources the extractor reads ('label'
# fields and 'duplicate' lists) so the metadata matches what is excluded.
METADATA_TEMPLATE = """# Botanical Query Metadata
# Generated: {timestamp}

query_file: botanical_query_updated_{timestamp_file}.sparql
generated_at: {timestamp}
base_classes_count: 27
excluded_q_numbers_count: {q_count}
filter_chunks_count: {chunk_count}
source_file: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml

base_classes:
  - Q167346  # Botanical gardens
  - Q43501   # Zoos
  - Q2281788 # Aquariums
  - Q7712619 # Arboreta
  - Q181916  # Herbarium
  - Q1970365 # Natural history museums
  - Q20268591 # Wildlife reserves
  - Q179049  # Nature reserves
  - Q46169   # National parks
  - Q473972  # Protected areas
  - Q158454  # Biosphere reserves
  - Q21164403 # Safari parks
  - Q9480202 # Safari parks (alt)
  - Q8085554 # Wildlife sanctuaries
  - Q2616170 # Marine reserve
  - Q936257  # Conservation areas
  - Q1426613 # Seed banks
  - Q4915239 # Biorepository
  - Q2982911 # Natural history collection
  - Q1905347 # Gene bank
  - Q864217  # Biobank
  - Q2189151 # Soilbank
  - Q8508664 # Herbaria
  - Q11489453 # Culture collections
  - Q23790   # Natural monuments
  - Q386426  # Natural heritage
  - Q526826  # Natural heritage (alt)

extraction_notes: |
  Q-numbers extracted from hyponyms_curated.yaml 'hypernym' section.
  Each Q-number in a 'label' field or listed under a 'duplicate' field is
  excluded from query results.
  FILTER statements organized in chunks of 50 Q-numbers for query optimization.
"""


def extract_q_numbers_from_yaml(yaml_path: Path) -> Set[str]:
    """
    Extract all Q-numbers from hyponyms_curated.yaml using regex.

    Extracts Q-numbers from two sources:
    1. 'label:' fields - primary Q-number for the entity
    2. 'duplicate:' fields - alternative Q-numbers that refer to the same entity

    The file is scanned as plain text instead of being parsed as YAML, so
    entries are still collected when the document as a whole does not parse.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...}); empty when the
        file contains no matching entries.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Source 1: "label: Q<digits>" lines, with or without a leading list dash:
    #   - label: Q12345
    label_pattern = r'^\s*-?\s*label:\s+(Q\d+)'
    q_numbers = set(re.findall(label_pattern, content, re.MULTILINE))

    # Source 2: block lists under a "duplicate:" key:
    #   duplicate:
    #     - Q31838911
    #     - Q98765432
    #
    # This is scanned line by line on purpose. A single multi-line regex that
    # uses "\s*" after each item also consumes the newline AND the next line's
    # indentation, leaving nothing for the next item's leading whitespace to
    # match; the repetition then stops after the first entry and every further
    # duplicate is silently lost.
    duplicate_header = re.compile(r'^\s*duplicate:\s*$')
    duplicate_item = re.compile(r'^\s*-\s+(Q\d+)')

    inside_duplicate_list = False
    for line in content.splitlines():
        if inside_duplicate_list:
            item = duplicate_item.match(line)
            if item:
                q_numbers.add(item.group(1))
                continue
            if not line.strip():
                # Blank lines do not terminate the list.
                continue
            # Any other content ends the current duplicate list; fall through
            # so the same line can still open a new one.
            inside_duplicate_list = False
        if duplicate_header.match(line):
            inside_duplicate_list = True

    return q_numbers


def chunk_q_numbers(q_numbers: Set[str], chunk_size: int = 50) -> List[List[str]]:
    """
    Split Q-numbers into chunks for FILTER statements.

    The Q-numbers are ordered by their numeric part (so 'Q2' precedes 'Q10')
    before being sliced, which makes the output deterministic for a given set.

    Args:
        q_numbers: Set of Q-numbers, each of the form 'Q<digits>'
        chunk_size: Number of Q-numbers per chunk (default: 50); must be >= 1

    Returns:
        List of Q-number chunks (each chunk is a list). The last chunk may be
        shorter than chunk_size; an empty input yields an empty list.

    Raises:
        ValueError: If chunk_size is smaller than 1, or if an entry's text
            after its first character is not an integer.
    """
    # A negative step would make range() empty and silently drop every
    # Q-number, producing a query without exclusions - reject it explicitly.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    ordered = sorted(q_numbers, key=lambda q: int(q[1:]))  # Sort by numeric part
    return [
        ordered[start:start + chunk_size]
        for start in range(0, len(ordered), chunk_size)
    ]


def generate_filter_chunks(chunks: List[List[str]]) -> str:
    """
    Render one SPARQL FILTER line per chunk of Q-numbers.

    Every Q-number is prefixed with 'wd:' and the chunk is emitted as a
    two-space-indented 'FILTER(?hyponym NOT IN (...))' statement.

    Args:
        chunks: List of Q-number chunks

    Returns:
        The FILTER statements joined by newlines (no trailing newline);
        an empty string when there are no chunks.
    """
    return '\n'.join(
        '  FILTER(?hyponym NOT IN ({}))'.format(
            ', '.join('wd:' + q_number for q_number in group)
        )
        for group in chunks
    )


def generate_query(yaml_path: Path, output_dir: Path) -> tuple[Path, Path]:
    """
    Build the exclusion query and its metadata file from hyponyms_curated.yaml.

    Reads the curated Q-numbers, renders them as FILTER statements in chunks
    of 50, fills both module templates and writes the results into
    output_dir (created if missing) under a shared UTC-timestamped name.
    Progress is reported on stdout.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        output_dir: Output directory for query files

    Returns:
        Tuple of (query_file_path, metadata_file_path)
    """
    # Collect the Q-numbers to exclude
    print(f"Reading Q-numbers from {yaml_path}")
    excluded = extract_q_numbers_from_yaml(yaml_path)
    excluded_count = len(excluded)
    print(f"Extracted {excluded_count} Q-numbers")

    # Group them and render the FILTER section
    groups = chunk_q_numbers(excluded, chunk_size=50)
    group_count = len(groups)
    print(f"Created {group_count} FILTER chunks")
    filters_text = generate_filter_chunks(groups)

    # One UTC instant drives both the human-readable and the file-name stamp
    generated = datetime.now(timezone.utc)
    iso_stamp = generated.strftime('%Y-%m-%dT%H:%M:%S+00:00')
    file_stamp = generated.strftime('%Y%m%dT%H%M%S')

    # Render both documents before touching the file system
    rendered_query = SPARQL_TEMPLATE.format(
        timestamp=iso_stamp,
        q_count=excluded_count,
        filter_chunks=filters_text,
    )
    rendered_metadata = METADATA_TEMPLATE.format(
        timestamp=iso_stamp,
        timestamp_file=file_stamp,
        q_count=excluded_count,
        chunk_count=group_count,
    )

    # Write the query first, then its metadata
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = f'botanical_query_updated_{file_stamp}'
    query_file = output_dir / f'{stem}.sparql'
    metadata_file = output_dir / f'{stem}.yaml'

    query_file.write_text(rendered_query, encoding='utf-8')
    print(f"✅ Wrote query: {query_file}")

    metadata_file.write_text(rendered_metadata, encoding='utf-8')
    print(f"✅ Wrote metadata: {metadata_file}")

    return query_file, metadata_file


def main() -> int:
    """
    Main entry point.

    Resolves the input and output locations relative to the project root
    (the parent of this script's directory), checks that the curated YAML
    file is present and generates the query plus its metadata file.

    Returns:
        Process exit status: 0 on success, 1 when the input file is missing.
    """
    # Imported locally so this function is self-contained; only needed for
    # writing the error message to stderr.
    import sys

    # Define paths
    project_root = Path(__file__).parent.parent
    data_root = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN'
    yaml_path = data_root / 'hyponyms_curated.yaml'
    output_dir = data_root / 'B' / 'queries'

    # Validate input file exists. Report on stderr and signal failure through
    # the return value so shells and CI jobs do not see a successful exit.
    if not yaml_path.is_file():
        print(f"❌ Error: {yaml_path} not found", file=sys.stderr)
        return 1

    # Generate query
    print("=" * 60)
    print("BOTANICAL QUERY GENERATOR")
    print("=" * 60)

    query_file, metadata_file = generate_query(yaml_path, output_dir)

    print("=" * 60)
    print("✅ Query generation complete!")
    print(f"Query file: {query_file}")
    print(f"Metadata file: {metadata_file}")
    print("=" * 60)
    return 0


if __name__ == '__main__':
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())