glam/scripts/generate_gallery_query_with_exclusions.py
2025-11-19 23:25:22 +01:00

323 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Generate Gallery (G-class) SPARQL query with automated Q-number exclusions.
This script:
1. Reads hyponyms_curated.yaml to extract all Q-numbers
2. Generates FILTER chunks (50 Q-numbers per chunk)
3. Creates a complete SPARQL query with all exclusions
4. Saves both the query and metadata YAML
Usage:
python scripts/generate_gallery_query_with_exclusions.py
Input:
- data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
Output:
- data/wikidata/GLAMORCUBEPSXHFN/G/queries/gallery_query_updated_<timestamp>.sparql
- data/wikidata/GLAMORCUBEPSXHFN/G/queries/gallery_query_updated_<timestamp>.yaml
"""
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Set
# Base SPARQL query template with 14 VERIFIED base classes for galleries
# Updated 2025-11-16 based on Wikidata MCP + Exa verification
#
# Rendered with str.format() in generate_query(). Placeholders:
#   {timestamp}     - generation time, written into the query's header comment
#   {q_count}       - number of excluded Q-numbers
#   {filter_chunks} - pre-rendered FILTER(?hyponym NOT IN (...)) lines
# Literal SPARQL braces are doubled ({{ and }}) so str.format() emits them as-is.
SPARQL_TEMPLATE = """# Gallery (G) Class - Missing Hyponym Discovery Query
# Generated: {timestamp}
# Excludes {q_count} Q-numbers from hyponyms_curated.yaml
# Base classes: 14 VERIFIED (art galleries, institutions, specialized types, alternative spaces)
#
# VERIFICATION: All Q-numbers verified via Wikidata MCP tool on 2025-11-16
# See: data/wikidata/GLAMORCUBEPSXHFN/G/VERIFIED_Q_NUMBERS.md
#
# IMPORTANT: This query finds hyponyms of the base classes specified below.
# The FILTER statements exclude curated Q-numbers from RESULTS, not from traversal.
# Results are capped at 10,000 to prevent timeout.
SELECT DISTINCT ?hyponym ?hyponymLabel ?hyponymAltLabel WHERE {{
{{
# Core gallery types (Priority 1)
# Art gallery (space/building) - Q1007870
?hyponym wdt:P279+ wd:Q1007870 .
}} UNION {{
# Art gallery (institution/organization) - Q98818526
?hyponym wdt:P279+ wd:Q98818526 .
}} UNION {{
# Art museum (institution) - Q3196771
?hyponym wdt:P279+ wd:Q3196771 .
}} UNION {{
# Art museum (building/space) - Q207694
?hyponym wdt:P279+ wd:Q207694 .
}} UNION {{
# Specialized Gallery Types (Priority 2)
# Kunsthalle (temporary exhibitions) - Q1475403
?hyponym wdt:P279+ wd:Q1475403 .
}} UNION {{
# Commercial art gallery - Q56856618
?hyponym wdt:P279+ wd:Q56856618 .
}} UNION {{
# Contemporary art gallery - Q16038801
?hyponym wdt:P279+ wd:Q16038801 .
}} UNION {{
# Noncommercial art gallery - Q67165238
?hyponym wdt:P279+ wd:Q67165238 .
}} UNION {{
# Institutional Variants (Priority 3)
# National gallery - Q3844310
?hyponym wdt:P279+ wd:Q3844310 .
}} UNION {{
# Artist-run space - Q4034417
?hyponym wdt:P279+ wd:Q4034417 .
}} UNION {{
# Alternative exhibition space - Q16002704
?hyponym wdt:P279+ wd:Q16002704 .
}} UNION {{
# Physical Variants (Priority 4)
# Sculpture garden - Q1759852
?hyponym wdt:P279+ wd:Q1759852 .
}} UNION {{
# Contemporary art museum (type) - Q108860927
?hyponym wdt:P279+ wd:Q108860927 .
}} UNION {{
# Online art gallery - Q7094057
?hyponym wdt:P279+ wd:Q7094057 .
}}
# FILTER: Exclude ALL curated Q-numbers from results
{filter_chunks}
SERVICE wikibase:label {{
bd:serviceParam wikibase:language "en,es,fr,de,nl,pt,ar,zh,ja,ru,hi,id,ms,th,vi,ko,tr,fa,pl,it,uk,sv,cs,he,bn,mr,ta,te,ur,pa,el,ro,hu,da,no,fi,ca,sr,bg,hr,sk,sl".
}}
}}
ORDER BY ?hyponymLabel
LIMIT 10000
"""
# YAML metadata template written to the same directory as each generated query.
# Rendered with str.format() in generate_query(). Placeholders:
#   {timestamp}      - generation time (header comment and generated_at)
#   {timestamp_file} - compact timestamp used in the query file name
#   {q_count}        - number of excluded Q-numbers
#   {chunk_count}    - number of FILTER chunks in the query
# The base_classes list names the same 14 Q-numbers as the UNION branches
# of SPARQL_TEMPLATE; keep the two in sync when editing either one.
METADATA_TEMPLATE = """# Gallery Query Metadata
# Generated: {timestamp}
# Updated: 2025-11-16 with VERIFIED Q-numbers
query_file: gallery_query_updated_{timestamp_file}.sparql
generated_at: {timestamp}
base_classes_count: 14
excluded_q_numbers_count: {q_count}
filter_chunks_count: {chunk_count}
source_file: data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
verification_report: data/wikidata/GLAMORCUBEPSXHFN/G/VERIFIED_Q_NUMBERS.md
base_classes:
# Core gallery types (Priority 1)
- Q1007870 # Art gallery (space/building)
- Q98818526 # Art gallery (institution/organization)
- Q3196771 # Art museum (institution)
- Q207694 # Art museum (building/space)
# Specialized Gallery Types (Priority 2)
- Q1475403 # Kunsthalle (temporary exhibitions)
- Q56856618 # Commercial art gallery
- Q16038801 # Contemporary art gallery
- Q67165238 # Noncommercial art gallery
# Institutional Variants (Priority 3)
- Q3844310 # National gallery
- Q4034417 # Artist-run space
- Q16002704 # Alternative exhibition space
# Physical Variants (Priority 4)
- Q1759852 # Sculpture garden
- Q108860927 # Contemporary art museum (type)
- Q7094057 # Online art gallery
verification_notes: |
All Q-numbers verified via Wikidata MCP tool on 2025-11-16.
Original 7 base classes expanded to 14 after verification discovered:
- 5 INCORRECT Q-numbers (removed)
- 8 NEW relevant Q-numbers (added)
Key discoveries:
- Institution vs. building distinction (Q98818526 vs Q1007870)
- Artist-run and alternative spaces (Q4034417, Q16002704)
- National galleries as distinct type (Q3844310)
- Online/digital galleries (Q7094057)
extraction_notes: |
Q-numbers extracted from hyponyms_curated.yaml.
Includes both 'label:' fields (primary Q-numbers) and 'duplicate:' fields (alternative IDs).
FILTER statements organized in chunks of 50 Q-numbers for query optimization.
"""
def extract_q_numbers_from_yaml(yaml_path: Path) -> Set[str]:
    """
    Extract all Q-numbers from hyponyms_curated.yaml using regex.

    Extracts Q-numbers from two sources:
    1. 'label:' fields - primary Q-number for the entity
    2. 'duplicate:' fields - alternative Q-numbers that refer to the same entity.
       Block lists ("duplicate:" followed by "- Q123" lines), inline lists
       ("duplicate: [Q1, Q2]") and single inline values ("duplicate: Q1")
       are all recognised.

    Trailing "# ..." comments on 'duplicate:' lines and list items are
    tolerated, and Q-numbers mentioned only inside such comments are ignored.

    This approach is more robust than YAML parsing since the file has
    formatting issues.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    q_numbers: Set[str] = set()

    # Pattern 1: "label: Q<digits>" lines, optionally prefixed by a list dash.
    label_pattern = r'^\s*-?\s*label:\s+(Q\d+)'
    for match in re.finditer(label_pattern, content, re.MULTILINE):
        q_numbers.add(match.group(1))

    # Pattern 2: block-style "duplicate:" lists. Each item may carry a trailing
    # comment; without that allowance the match would stop at the first
    # commented item and silently drop every duplicate listed after it.
    duplicate_block_pattern = (
        r'^\s+duplicate:[ \t]*(?:#[^\n]*)?\s*\n'
        r'((?:\s+-\s+Q\d+[ \t]*(?:#[^\n]*)?\n?)+)'
    )
    for match in re.finditer(duplicate_block_pattern, content, re.MULTILINE):
        # Strip comments first so IDs that are only *mentioned* in a comment
        # are not mistaken for duplicates.
        block_without_comments = re.sub(r'#[^\n]*', '', match.group(1))
        q_numbers.update(re.findall(r'Q\d+', block_without_comments))

    # Pattern 3: inline values on the "duplicate:" line itself, e.g.
    # "duplicate: [Q1, Q2]" or "duplicate: Q1". Capture stops at a comment.
    duplicate_inline_pattern = r'^\s+duplicate:[ \t]*([^\n#]+)'
    for match in re.finditer(duplicate_inline_pattern, content, re.MULTILINE):
        q_numbers.update(re.findall(r'\bQ\d+\b', match.group(1)))

    return q_numbers
def chunk_q_numbers(q_numbers: Set[str], chunk_size: int = 50) -> List[List[str]]:
    """
    Split Q-numbers into chunks for FILTER statements.

    Q-numbers are ordered by their numeric part (Q2 before Q10) so the
    generated query is stable between runs regardless of set iteration order.

    Args:
        q_numbers: Set of Q-numbers
        chunk_size: Number of Q-numbers per chunk (default: 50)

    Returns:
        List of Q-number chunks (each chunk is a list); empty if q_numbers
        is empty. The last chunk may be shorter than chunk_size.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently produce
    # zero chunks (i.e. a query with no exclusions); reject it up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Sort by the integer after the leading 'Q', not lexicographically.
    ordered = sorted(q_numbers, key=lambda q: int(q[1:]))
    return [
        ordered[start:start + chunk_size]
        for start in range(0, len(ordered), chunk_size)
    ]
def generate_filter_chunks(chunks: List[List[str]]) -> str:
    """
    Render one SPARQL FILTER line per chunk of Q-numbers.

    Every Q-number is prefixed with 'wd:' and the chunk is emitted as a
    single 'FILTER(?hyponym NOT IN (...))' line.

    Args:
        chunks: List of Q-number chunks

    Returns:
        The FILTER lines joined by newlines (empty string for no chunks)
    """
    def render(chunk: List[str]) -> str:
        # Comma-separated list of prefixed entity IDs for this chunk.
        q_list = ', '.join(f'wd:{q}' for q in chunk)
        return f' FILTER(?hyponym NOT IN ({q_list}))'

    return '\n'.join(map(render, chunks))
def generate_query(yaml_path: Path, output_dir: Path) -> tuple[Path, Path]:
    """
    Build the gallery SPARQL query and its metadata file and write both to disk.

    Q-numbers are read from the curated YAML, grouped into FILTER chunks of
    50, and substituted into SPARQL_TEMPLATE and METADATA_TEMPLATE. Both
    output files share one UTC timestamp in their names. Progress is
    reported with print().

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        output_dir: Output directory for query files (created if missing)

    Returns:
        Tuple of (query_file_path, metadata_file_path)
    """
    # Collect the Q-numbers to exclude.
    print(f"Reading Q-numbers from {yaml_path}")
    q_numbers = extract_q_numbers_from_yaml(yaml_path)
    print(f"Extracted {len(q_numbers)} Q-numbers")

    # Group them and render the FILTER lines.
    chunks = chunk_q_numbers(q_numbers, chunk_size=50)
    print(f"Created {len(chunks)} FILTER chunks")
    filter_chunks = generate_filter_chunks(chunks)

    # One UTC instant feeds both the readable stamp and the file-name stamp.
    now = datetime.now(timezone.utc)
    timestamp = now.strftime('%Y-%m-%dT%H:%M:%S+00:00')
    timestamp_file = now.strftime('%Y%m%dT%H%M%S')

    # Render both documents before touching the file system.
    excluded_total = len(q_numbers)
    sparql_content = SPARQL_TEMPLATE.format(
        timestamp=timestamp,
        q_count=excluded_total,
        filter_chunks=filter_chunks,
    )
    metadata_content = METADATA_TEMPLATE.format(
        timestamp=timestamp,
        timestamp_file=timestamp_file,
        q_count=excluded_total,
        chunk_count=len(chunks),
    )

    # Write the query first, then its metadata, into the output directory.
    output_dir.mkdir(parents=True, exist_ok=True)
    query_file = output_dir / f'gallery_query_updated_{timestamp_file}.sparql'
    metadata_file = output_dir / f'gallery_query_updated_{timestamp_file}.yaml'

    query_file.write_text(sparql_content, encoding='utf-8')
    print(f"✅ Wrote query: {query_file}")

    metadata_file.write_text(metadata_content, encoding='utf-8')
    print(f"✅ Wrote metadata: {metadata_file}")

    return query_file, metadata_file
def main() -> int:
    """Main entry point.

    Resolves the input and output locations relative to the project root
    (the parent of this script's directory), checks that the curated YAML
    exists, and generates the query and metadata files.

    Returns:
        Process exit status: 0 on success, 1 if the input file is missing.
    """
    # Define paths
    project_root = Path(__file__).parent.parent
    yaml_path = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'hyponyms_curated.yaml'
    output_dir = project_root / 'data' / 'wikidata' / 'GLAMORCUBEPSXHFN' / 'G' / 'queries'

    # Validate input file exists. Report on stderr and signal failure through
    # the exit status so shells and CI jobs can detect that nothing was generated.
    if not yaml_path.exists():
        print(f"❌ Error: {yaml_path} not found", file=sys.stderr)
        return 1

    # Generate query
    print("=" * 60)
    print("GALLERY (G-CLASS) QUERY GENERATOR")
    print("=" * 60)
    query_file, metadata_file = generate_query(yaml_path, output_dir)
    print("=" * 60)
    print("✅ Query generation complete!")
    print(f"Query file: {query_file}")
    print(f"Metadata file: {metadata_file}")
    print("=" * 60)
    return 0


if __name__ == '__main__':
    sys.exit(main())