# glam/scripts/extract_q_numbers_robust.py

#!/usr/bin/env python3
"""
Fix and improve Q-number extraction for hyponyms_curated.yaml

This script provides an improved Q-number extraction function that:
1. Handles both 'label:' and 'labels:' (typo case)
2. Extracts from all nested locations
3. Provides comprehensive coverage

Usage:
    from scripts.extract_q_numbers_robust import extract_all_q_numbers

    q_numbers = extract_all_q_numbers(yaml_path)
"""

import re
from pathlib import Path
from typing import Optional, Set


def extract_all_q_numbers(yaml_path: Path) -> Set[str]:
    """
    Collect every Q-number found in hyponyms_curated.yaml via regex scans.

    The file is read as plain text (it is never parsed as YAML) and scanned
    in four passes whose results are unioned:
    - 'label: Q<digits>' lines (singular - correct)
    - 'labels: Q<digits>' lines (plural - typo case)
    - list items directly underneath a 'duplicate:' key
    - any standalone Q<digits> token anywhere in the file

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
    """
    text = Path(yaml_path).read_text(encoding='utf-8')
    found = set()

    # Passes 1 and 2: a 'label:' (correct) or 'labels:' (typo) key, optionally
    # preceded by a list dash, followed by a Q-number.
    key_regexes = (
        r'^\s*-?\s*label:\s+(Q\d+)',
        r'^\s*-?\s*labels:\s+(Q\d+)',
    )
    for key_regex in key_regexes:
        for hit in re.finditer(key_regex, text, re.MULTILINE):
            found.add(hit.group(1).strip())

    # Pass 3: the run of '- Q<digits>' items that follows a 'duplicate:' key.
    duplicate_regex = r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)'
    for block in re.finditer(duplicate_regex, text, re.MULTILINE):
        found.update(re.findall(r'Q\d+', block.group(1)))

    # Pass 4: catch-all for Q-numbers in any other location. The word
    # boundaries mean tokens glued to other word characters are skipped here.
    found.update(re.findall(r'\bQ\d+\b', text))

    return found


def extract_q_numbers_from_sections(yaml_path: Path,
                                     exclude_sections: Optional[list] = None) -> Set[str]:
    """
    Extract Q-numbers from the known top-level sections of the parsed YAML.

    The file is parsed with ``yaml.safe_load`` and these top-level sections
    are walked: sources, hypernym, entity, entity_list, standard, collection.
    Any of them named in ``exclude_sections`` is skipped. Within a section,
    every string value at any nesting depth (inside dicts and lists) that
    starts with 'Q' followed by digits contributes that leading Q-number.
    Dict keys themselves are not inspected.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        exclude_sections: Section names to skip. ``None`` or an empty list
            falls back to ['exclude']; because 'exclude' is not one of the
            processed sections, the default effectively skips nothing.

    Returns:
        Set of Q-numbers from non-excluded sections. Empty when the document
        is empty or its top level is not a mapping.
    """
    # Kept function-local so that importing this module does not require
    # PyYAML to be installed.
    import yaml

    # A None default (instead of a shared mutable list) with the same
    # fallback the original applied to any falsy value.
    excluded = exclude_sections or ['exclude']

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty document loads as None, and a non-mapping top level has no
    # named sections; either way there is nothing to extract.
    if not isinstance(data, dict):
        return set()

    all_sections = ['sources', 'hypernym', 'entity', 'entity_list', 'standard', 'collection']
    q_numbers: Set[str] = set()
    leading_q = re.compile(r'Q\d+')

    def extract_from_structure(obj):
        """Recursively collect leading Q-numbers from string values."""
        if isinstance(obj, dict):
            # 'label'/'labels' need no special case: their string values are
            # handled by the generic string branch like any other value.
            for value in obj.values():
                extract_from_structure(value)
        elif isinstance(obj, list):
            for item in obj:
                extract_from_structure(item)
        elif isinstance(obj, str):
            q_match = leading_q.match(obj)
            if q_match:
                q_numbers.add(q_match.group(0))

    for section_name in all_sections:
        if section_name not in excluded and section_name in data:
            extract_from_structure(data[section_name])

    return q_numbers


def find_data_quality_issues(yaml_path: Path) -> dict:
    """
    Scan hyponyms_curated.yaml (as plain text) for known data quality issues.

    Only the 'labels:' (plural) typo is currently detected; the remaining
    categories are returned as empty lists.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Nested dict of issue category -> issue type -> list of examples.
        Each 'labels_instead_of_label' example holds the stripped matching
        text ('line') and its Q-number ('q_number'); at most 10 are kept.
    """
    max_examples = 10

    with open(yaml_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # "labels:" (plural) followed by a Q-number - should be "label:".
    plural_examples = []
    for hit in re.finditer(r'^\s*-?\s*(labels:)\s+(Q\d+)', text, re.MULTILINE):
        if len(plural_examples) >= max_examples:
            break  # example limit reached; later matches are not reported
        plural_examples.append({
            'line': hit.group(0).strip(),
            'q_number': hit.group(2),
        })

    return {
        'typos': {
            'labels_instead_of_label': plural_examples,
            'type_instead_of_types': [],
        },
        'formatting': {
            'inconsistent_indentation': [],
        },
        'data': {
            'q_numbers_in_wrong_location': [],
        },
    }


def main():
    """Run the extraction and data-quality checks and print a report."""
    banner = "=" * 70
    # The YAML file is located relative to the directory above this script's.
    yaml_path = (
        Path(__file__).parent.parent
        / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    )

    print(banner)
    print("ROBUST Q-NUMBER EXTRACTION TEST")
    print(banner)

    # Guard clause: nothing to report without the input file.
    if not yaml_path.exists():
        print(f"❌ File not found: {yaml_path}")
        return

    print(f"\n📂 Extracting from: {yaml_path.name}\n")
    extracted = extract_all_q_numbers(yaml_path)
    print(f"✅ Extracted {len(extracted)} Q-numbers")

    print("\n🔍 Checking for data quality issues...\n")
    report = find_data_quality_issues(yaml_path)
    plural_typos = report['typos']['labels_instead_of_label']

    if not plural_typos:
        print("✅ No 'labels:' typos found")
    else:
        # NOTE: find_data_quality_issues keeps at most 10 examples, so the
        # count printed here never exceeds 10.
        print(f"⚠️  Found {len(plural_typos)} instances of 'labels:' (should be 'label:')")
        print("   Examples:")
        for example in plural_typos[:3]:
            print(f"     {example['line']}")

    print("\n" + banner)


# Run the self-test report only when executed as a script, not on import.
if __name__ == '__main__':
    main()