#!/usr/bin/env python3
"""
Fix and improve Q-number extraction for hyponyms_curated.yaml

This script provides an improved Q-number extraction function that:
1. Handles both 'label:' and 'labels:' (typo case)
2. Extracts from all nested locations
3. Provides comprehensive coverage

Usage:
    from scripts.extract_q_numbers_robust import extract_all_q_numbers

    q_numbers = extract_all_q_numbers(yaml_path)
"""

import re
from itertools import islice
from pathlib import Path
from typing import Optional, Set

# Maximum number of example entries recorded per issue type.
_MAX_EXAMPLES = 10

# "label: Q123" lines (singular - the correct key), optionally a list item.
_LABEL_PATTERN = re.compile(r'^\s*-?\s*label:\s+(Q\d+)', re.MULTILINE)
# "labels: Q123" lines (plural - the typo case).
_LABELS_PATTERN = re.compile(r'^\s*-?\s*labels:\s+(Q\d+)', re.MULTILINE)
# A "duplicate:" key followed by a block of "- Q123" list items.
_DUPLICATE_BLOCK_PATTERN = re.compile(
    r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)', re.MULTILINE
)
# Any Q-number inside an already-matched duplicate block.
_Q_IN_BLOCK_PATTERN = re.compile(r'Q\d+')
# Catch-all: a Q-number delimited by word boundaries, anywhere in the text.
_ANY_Q_PATTERN = re.compile(r'\bQ\d+\b')
# A string value that starts with a Q-number.
_Q_PREFIX_PATTERN = re.compile(r'^(Q\d+)')
# Same shape as _LABELS_PATTERN, but also captures the key for reporting.
_LABELS_TYPO_PATTERN = re.compile(r'^\s*-?\s*(labels:)\s+(Q\d+)', re.MULTILINE)


def extract_all_q_numbers(yaml_path: Path) -> Set[str]:
    """
    Extract ALL Q-numbers from hyponyms_curated.yaml using robust regex.

    The file is read as plain text (it is not parsed as YAML), and the
    results of four patterns are merged:
    - 'label:' fields (singular - correct)
    - 'labels:' fields (plural - typo case)
    - 'duplicate:' list blocks
    - any word-boundary-delimited Q-number anywhere in the file

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    q_numbers: Set[str] = set()

    # The targeted patterns are not fully covered by the catch-all below:
    # they capture the leading Q-number even when it is directly followed
    # by word characters (e.g. "label: Q55x" yields "Q55"), whereas the
    # catch-all requires a word boundary after the digits.
    q_numbers.update(_LABEL_PATTERN.findall(content))
    q_numbers.update(_LABELS_PATTERN.findall(content))
    for duplicate_block in _DUPLICATE_BLOCK_PATTERN.findall(content):
        q_numbers.update(_Q_IN_BLOCK_PATTERN.findall(duplicate_block))

    # Catch-all so that Q-numbers in unexpected places are not missed.
    q_numbers.update(_ANY_Q_PATTERN.findall(content))

    return q_numbers


def extract_q_numbers_from_sections(
    yaml_path: Path, exclude_sections: Optional[list] = None
) -> Set[str]:
    """
    Extract Q-numbers from specific sections only.

    The file is parsed with ``yaml.safe_load`` and a fixed list of top-level
    sections ('sources', 'hypernym', 'entity', 'entity_list', 'standard',
    'collection') is walked recursively. Every string value that starts
    with a Q-number contributes that Q-number; dictionary keys are not
    inspected.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        exclude_sections: List of section names to skip. ``None`` or an
            empty list falls back to ``['exclude']``; since 'exclude' is not
            one of the processed sections, the default skips none of them.

    Returns:
        Set of Q-numbers from non-excluded sections. Empty when the file
        does not contain a top-level mapping (e.g. an empty file).
    """
    if not exclude_sections:
        exclude_sections = ['exclude']

    # Imported here rather than at module level so the regex-based
    # functions in this module stay usable without PyYAML installed.
    import yaml

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    q_numbers: Set[str] = set()

    # safe_load returns None for an empty document, and may return a list
    # or scalar; only a mapping has named sections to look up.
    if not isinstance(data, dict):
        return q_numbers

    # Sections to process
    all_sections = ['sources', 'hypernym', 'entity', 'entity_list', 'standard', 'collection']
    sections_to_process = [s for s in all_sections if s not in exclude_sections]

    def extract_from_structure(obj):
        """Recursively extract Q-numbers from nested structures."""
        if isinstance(obj, dict):
            # String values (including 'label'/'labels' values) are picked
            # up by the str branch when recursing.
            for value in obj.values():
                extract_from_structure(value)
        elif isinstance(obj, list):
            for item in obj:
                extract_from_structure(item)
        elif isinstance(obj, str):
            # Keep only the leading Q-number of the string, if any.
            q_match = _Q_PREFIX_PATTERN.match(obj)
            if q_match:
                q_numbers.add(q_match.group(1))

    # Extract from specified sections
    for section_name in sections_to_process:
        if section_name in data:
            extract_from_structure(data[section_name])

    return q_numbers


def find_data_quality_issues(yaml_path: Path) -> dict:
    """
    Find data quality issues in hyponyms_curated.yaml.

    Only the 'labels:' (plural) typo is currently detected; the other
    issue lists in the returned structure are always empty.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Dict with issue types and examples. At most 10 examples of the
        'labels:' typo are recorded, each as
        ``{'line': <stripped matched text>, 'q_number': <Q-number>}``.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    issues = {
        'typos': {
            'labels_instead_of_label': [],
            'type_instead_of_types': [],
        },
        'formatting': {
            'inconsistent_indentation': [],
        },
        'data': {
            'q_numbers_in_wrong_location': []
        }
    }

    # Find "labels:" (plural - should be "label:"). islice stops scanning
    # once enough examples have been collected.
    typo_matches = islice(_LABELS_TYPO_PATTERN.finditer(content), _MAX_EXAMPLES)
    issues['typos']['labels_instead_of_label'].extend(
        {'line': match.group(0).strip(), 'q_number': match.group(2)}
        for match in typo_matches
    )

    return issues


def main():
    """Test the extraction functions."""
    project_root = Path(__file__).parent.parent
    yaml_path = project_root / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"

    print("=" * 70)
    print("ROBUST Q-NUMBER EXTRACTION TEST")
    print("=" * 70)

    if not yaml_path.exists():
        print(f"❌ File not found: {yaml_path}")
        return

    # Extract Q-numbers
    print(f"\n📂 Extracting from: {yaml_path.name}\n")

    q_numbers = extract_all_q_numbers(yaml_path)
    print(f"✅ Extracted {len(q_numbers)} Q-numbers")

    # Find data quality issues
    print("\n🔍 Checking for data quality issues...\n")

    issues = find_data_quality_issues(yaml_path)
    labels_issues = issues['typos']['labels_instead_of_label']

    if labels_issues:
        # The example list is capped, so a full list only gives a lower
        # bound on the real number of occurrences.
        found = len(labels_issues)
        shown = f"{found}+" if found >= _MAX_EXAMPLES else str(found)
        print(f"⚠️ Found {shown} instances of 'labels:' (should be 'label:')")
        print(" Examples:")
        for ex in labels_issues[:3]:
            print(f" {ex['line']}")
    else:
        print("✅ No 'labels:' typos found")

    print("\n" + "=" * 70)


if __name__ == '__main__':
    main()