353 lines
10 KiB
Python
353 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate Botanical/Zoo (B-class) SPARQL query with automated Q-number exclusions.
|
|
|
|
This script:
|
|
1. Reads hyponyms_curated.yaml to extract all Q-numbers
|
|
2. Generates FILTER chunks (50 Q-numbers per chunk)
|
|
3. Creates a complete SPARQL query with all exclusions
|
|
4. Saves both the query and metadata YAML
|
|
|
|
Usage:
|
|
python scripts/generate_botanical_query_with_exclusions.py
|
|
|
|
Input:
|
|
- data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
|
|
|
|
Output:
|
|
- data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_<timestamp>.sparql
|
|
- data/wikidata/GLAMORCUBEPSXHFN/B/queries/botanical_query_updated_<timestamp>.yaml
|
|
"""
|
|
|
|
import yaml
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import List, Set
|
|
|
|
# Base SPARQL query template covering 27 base classes.
#
# Rendered with ``str.format``. Placeholders:
#   {timestamp}     - generation time, written into the header comment
#   {q_count}       - number of Q-numbers excluded by the FILTER lines
#   {filter_chunks} - pre-rendered FILTER lines; kept at column 0 because each
#                     rendered line carries its own leading whitespace
# Literal SPARQL braces are doubled ({{ and }}) so ``str.format`` emits them
# as single braces instead of treating them as replacement fields.
#
# Where two distinct Wikidata classes share a label, the second one is marked
# "(alt)" so the UNION branches are not mistaken for accidental duplicates.
SPARQL_TEMPLATE = """# Botanical Garden & Zoo (B) Class - Missing Hyponym Discovery Query
# Generated: {timestamp}
# Excludes {q_count} Q-numbers from hyponyms_curated.yaml
# Base classes: 27 (botanical gardens, zoos, aquariums, arboreta, etc.)
#
# IMPORTANT: This query finds hyponyms of the base classes specified below.
# The FILTER statements exclude curated Q-numbers from RESULTS, not from traversal.
# Results are capped at 10,000 to prevent timeout.

SELECT DISTINCT ?hyponym ?hyponymLabel ?hyponymAltLabel WHERE {{
  {{
    # Botanical gardens - Q167346
    ?hyponym wdt:P279+ wd:Q167346 .
  }} UNION {{
    # Zoos - Q43501
    ?hyponym wdt:P279+ wd:Q43501 .
  }} UNION {{
    # Aquariums - Q2281788
    ?hyponym wdt:P279+ wd:Q2281788 .
  }} UNION {{
    # Arboreta - Q7712619
    ?hyponym wdt:P279+ wd:Q7712619 .
  }} UNION {{
    # Herbarium - Q181916
    ?hyponym wdt:P279+ wd:Q181916 .
  }} UNION {{
    # Natural history museums - Q1970365
    ?hyponym wdt:P279+ wd:Q1970365 .
  }} UNION {{
    # Wildlife reserves - Q20268591
    ?hyponym wdt:P279+ wd:Q20268591 .
  }} UNION {{
    # Nature reserves - Q179049
    ?hyponym wdt:P279+ wd:Q179049 .
  }} UNION {{
    # National parks - Q46169
    ?hyponym wdt:P279+ wd:Q46169 .
  }} UNION {{
    # Protected areas - Q473972
    ?hyponym wdt:P279+ wd:Q473972 .
  }} UNION {{
    # Biosphere reserves - Q158454
    ?hyponym wdt:P279+ wd:Q158454 .
  }} UNION {{
    # Safari parks - Q21164403
    ?hyponym wdt:P279+ wd:Q21164403 .
  }} UNION {{
    # Safari parks (alt) - Q9480202
    ?hyponym wdt:P279+ wd:Q9480202 .
  }} UNION {{
    # Wildlife sanctuaries - Q8085554
    ?hyponym wdt:P279+ wd:Q8085554 .
  }} UNION {{
    # Marine reserve - Q2616170
    ?hyponym wdt:P279+ wd:Q2616170 .
  }} UNION {{
    # Conservation areas - Q936257
    ?hyponym wdt:P279+ wd:Q936257 .
  }} UNION {{
    # Seed banks - Q1426613
    ?hyponym wdt:P279+ wd:Q1426613 .
  }} UNION {{
    # Biorepository - Q4915239
    ?hyponym wdt:P279+ wd:Q4915239 .
  }} UNION {{
    # Natural history collection - Q2982911
    ?hyponym wdt:P279+ wd:Q2982911 .
  }} UNION {{
    # Gene bank - Q1905347
    ?hyponym wdt:P279+ wd:Q1905347 .
  }} UNION {{
    # Biobank - Q864217
    ?hyponym wdt:P279+ wd:Q864217 .
  }} UNION {{
    # Soilbank - Q2189151
    ?hyponym wdt:P279+ wd:Q2189151 .
  }} UNION {{
    # Herbaria - Q8508664
    ?hyponym wdt:P279+ wd:Q8508664 .
  }} UNION {{
    # Culture collections - Q11489453
    ?hyponym wdt:P279+ wd:Q11489453 .
  }} UNION {{
    # Natural monuments - Q23790
    ?hyponym wdt:P279+ wd:Q23790 .
  }} UNION {{
    # Natural heritage - Q386426
    ?hyponym wdt:P279+ wd:Q386426 .
  }} UNION {{
    # Natural heritage (alt) - Q526826
    ?hyponym wdt:P279+ wd:Q526826 .
  }}

  # FILTER: Exclude ALL curated Q-numbers from results
{filter_chunks}

  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr,sk,sl".
  }}
}}
ORDER BY ?hyponymLabel
LIMIT 10000
"""
|
|
|
|
# YAML metadata written alongside each generated query.
#
# Rendered with ``str.format``. Placeholders:
#   {timestamp}      - generation time (header comment and ``generated_at``)
#   {timestamp_file} - compact timestamp used in the query file name
#   {q_count}        - number of excluded Q-numbers
#   {chunk_count}    - number of FILTER statements in the query
#
# The body of the ``extraction_notes`` literal block scalar is indented:
# YAML requires block-scalar content to sit deeper than its parent key.
METADATA_TEMPLATE = """# Botanical Query Metadata
# Generated: {timestamp}

query_file: botanical_query_updated_{timestamp_file}.sparql
generated_at: {timestamp}
base_classes_count: 27
excluded_q_numbers_count: {q_count}
filter_chunks_count: {chunk_count}
source_file: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml

base_classes:
  - Q167346 # Botanical gardens
  - Q43501 # Zoos
  - Q2281788 # Aquariums
  - Q7712619 # Arboreta
  - Q181916 # Herbarium
  - Q1970365 # Natural history museums
  - Q20268591 # Wildlife reserves
  - Q179049 # Nature reserves
  - Q46169 # National parks
  - Q473972 # Protected areas
  - Q158454 # Biosphere reserves
  - Q21164403 # Safari parks
  - Q9480202 # Safari parks (alt)
  - Q8085554 # Wildlife sanctuaries
  - Q2616170 # Marine reserve
  - Q936257 # Conservation areas
  - Q1426613 # Seed banks
  - Q4915239 # Biorepository
  - Q2982911 # Natural history collection
  - Q1905347 # Gene bank
  - Q864217 # Biobank
  - Q2189151 # Soilbank
  - Q8508664 # Herbaria
  - Q11489453 # Culture collections
  - Q23790 # Natural monuments
  - Q386426 # Natural heritage
  - Q526826 # Natural heritage (alt)

extraction_notes: |
  Q-numbers extracted from hyponyms_curated.yaml 'hypernym' section.
  Each Q-number in a 'label' field or 'duplicate' list is excluded from query results.
  FILTER statements organized in chunks of 50 Q-numbers for query optimization.
"""
|
|
|
|
|
|
def extract_q_numbers_from_yaml(yaml_path: Path) -> Set[str]:
|
|
"""
|
|
Extract all Q-numbers from hyponyms_curated.yaml using regex.
|
|
|
|
Extracts Q-numbers from two sources:
|
|
1. 'label:' fields - primary Q-number for the entity
|
|
2. 'duplicate:' fields - alternative Q-numbers that refer to the same entity
|
|
|
|
This approach is more robust than YAML parsing since the file has formatting issues.
|
|
|
|
Args:
|
|
yaml_path: Path to hyponyms_curated.yaml
|
|
|
|
Returns:
|
|
Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
|
|
"""
|
|
with open(yaml_path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
q_numbers = set()
|
|
|
|
# Pattern 1: Extract from "label: Q<digits>" lines
|
|
# This captures Q-numbers in the format:
|
|
# - label: Q12345
|
|
label_pattern = r'^\s*-?\s*label:\s+(Q\d+)'
|
|
|
|
for match in re.finditer(label_pattern, content, re.MULTILINE):
|
|
q_number = match.group(1).strip()
|
|
q_numbers.add(q_number)
|
|
|
|
# Pattern 2: Extract from "duplicate:" lists
|
|
# This captures Q-numbers in the format:
|
|
# duplicate:
|
|
# - Q31838911
|
|
# - Q98765432
|
|
duplicate_pattern = r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)'
|
|
|
|
for match in re.finditer(duplicate_pattern, content, re.MULTILINE):
|
|
duplicate_block = match.group(1)
|
|
# Extract individual Q-numbers from the duplicate block
|
|
q_in_block = re.findall(r'Q\d+', duplicate_block)
|
|
q_numbers.update(q_in_block)
|
|
|
|
return q_numbers
|
|
|
|
|
|
def chunk_q_numbers(q_numbers: Set[str], chunk_size: int = 50) -> List[List[str]]:
    """
    Split Q-numbers into chunks for FILTER statements.

    The Q-numbers are first sorted by their numeric part (so 'Q2' precedes
    'Q10'), which makes the output deterministic for a given input set.

    Args:
        q_numbers: Set of Q-numbers, each of the form 'Q<digits>'
        chunk_size: Number of Q-numbers per chunk (default: 50); must be >= 1

    Returns:
        List of Q-number chunks (each chunk is a list); empty when
        ``q_numbers`` is empty

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check a
            negative size would produce an empty result and silently drop
            every exclusion.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Sort numerically, not lexicographically: strip the leading 'Q'.
    ordered = sorted(q_numbers, key=lambda q: int(q[1:]))

    return [
        ordered[start:start + chunk_size]
        for start in range(0, len(ordered), chunk_size)
    ]
|
|
|
|
|
|
def generate_filter_chunks(chunks: List[List[str]]) -> str:
    """
    Render one SPARQL FILTER statement per chunk of Q-numbers.

    Every Q-number is prefixed with ``wd:`` and each chunk becomes a single
    ``FILTER(?hyponym NOT IN (...))`` line.

    Args:
        chunks: List of Q-number chunks

    Returns:
        The FILTER lines joined with newlines (no trailing newline); an
        empty string when ``chunks`` is empty
    """
    def render(group: List[str]) -> str:
        # Comma-separated, wd:-prefixed entity list for one NOT IN clause.
        members = ', '.join(f'wd:{qid}' for qid in group)
        return f' FILTER(?hyponym NOT IN ({members}))'

    return '\n'.join(render(group) for group in chunks)
|
|
|
|
|
|
def generate_query(yaml_path: Path, output_dir: Path) -> tuple[Path, Path]:
    """
    Build the exclusion query and its metadata file from hyponyms_curated.yaml.

    Steps: extract the curated Q-numbers, split them into chunks of 50,
    render the FILTER lines, fill both templates, then write the ``.sparql``
    and ``.yaml`` files (sharing one UTC timestamp in their names) into
    ``output_dir``, creating that directory if needed. Progress is printed
    to stdout.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        output_dir: Output directory for query files

    Returns:
        Tuple of (query_file_path, metadata_file_path)
    """
    # Collect the Q-numbers to exclude.
    print(f"Reading Q-numbers from {yaml_path}")
    excluded = extract_q_numbers_from_yaml(yaml_path)
    excluded_total = len(excluded)
    print(f"Extracted {excluded_total} Q-numbers")

    # Batch them and render the FILTER section.
    batches = chunk_q_numbers(excluded, chunk_size=50)
    print(f"Created {len(batches)} FILTER chunks")
    filter_block = generate_filter_chunks(batches)

    # One UTC instant feeds both the human-readable and file-name stamps.
    generated = datetime.now(timezone.utc)
    timestamp = generated.strftime('%Y-%m-%dT%H:%M:%S+00:00')
    timestamp_file = generated.strftime('%Y%m%dT%H%M%S')

    sparql_content = SPARQL_TEMPLATE.format(
        timestamp=timestamp,
        q_count=excluded_total,
        filter_chunks=filter_block,
    )
    metadata_content = METADATA_TEMPLATE.format(
        timestamp=timestamp,
        timestamp_file=timestamp_file,
        q_count=excluded_total,
        chunk_count=len(batches),
    )

    # Both outputs share a stem and differ only in their extension.
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = f'botanical_query_updated_{timestamp_file}'
    query_file = output_dir / f'{stem}.sparql'
    metadata_file = output_dir / f'{stem}.yaml'

    query_file.write_text(sparql_content, encoding='utf-8')
    print(f"✅ Wrote query: {query_file}")

    metadata_file.write_text(metadata_content, encoding='utf-8')
    print(f"✅ Wrote metadata: {metadata_file}")

    return query_file, metadata_file
|
|
|
|
|
|
def main() -> int:
    """
    Main entry point.

    Resolves the input and output paths relative to the project root (the
    parent of this script's directory), checks that the curated YAML file
    exists, and generates the query and metadata files.

    Returns:
        Process exit status: 0 on success, 1 when the input file is missing
        (so shell pipelines and CI can detect the failure).
    """
    import sys  # local import: only the error path needs the stderr stream

    # Define paths
    project_root = Path(__file__).parent.parent
    data_dir = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN'
    yaml_path = data_dir / 'hyponyms_curated.yaml'
    output_dir = data_dir / 'B' / 'queries'

    # Validate input file exists
    if not yaml_path.exists():
        print(f"❌ Error: {yaml_path} not found", file=sys.stderr)
        return 1

    # Generate query
    banner = "=" * 60
    print(banner)
    print("BOTANICAL QUERY GENERATOR")
    print(banner)

    query_file, metadata_file = generate_query(yaml_path, output_dir)

    print(banner)
    print("✅ Query generation complete!")
    print(f"Query file: {query_file}")
    print(f"Metadata file: {metadata_file}")
    print(banner)
    return 0


if __name__ == '__main__':
    # Propagate main()'s status so a missing input yields a non-zero exit code.
    raise SystemExit(main())
|