glam/scripts/extract_q_numbers_robust.py
2025-11-19 23:25:22 +01:00

203 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Fix and improve Q-number extraction for hyponyms_curated.yaml
This script provides an improved Q-number extraction function that:
1. Handles both 'label:' and 'labels:' (typo case)
2. Extracts from all nested locations
3. Provides comprehensive coverage
Usage:
from scripts.extract_q_numbers_robust import extract_all_q_numbers
q_numbers = extract_all_q_numbers(yaml_path)
"""
import re
from pathlib import Path
from typing import Optional, Set
def extract_all_q_numbers(yaml_path: Path) -> Set[str]:
    """
    Collect every Q-number found in the given file using regex scans.

    The file is read as plain text (it is never parsed as YAML) and
    scanned in four passes:
    - values of 'label:' keys (singular - correct)
    - values of 'labels:' keys (plural - typo case)
    - list items under 'duplicate:' keys
    - any standalone Q<digits> token anywhere in the text

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
    """
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    found = set()

    # Passes 1 and 2: "label: Q<digits>" and "labels: Q<digits>" lines.
    # These can capture the Q<digits> prefix of a longer token, which the
    # word-boundary catch-all below would not.
    keyed_patterns = (
        r'^\s*-?\s*label:\s+(Q\d+)',
        r'^\s*-?\s*labels:\s+(Q\d+)',
    )
    for pattern in keyed_patterns:
        found.update(
            hit.group(1).strip()
            for hit in re.finditer(pattern, text, re.MULTILINE)
        )

    # Pass 3: "duplicate:" followed by a run of "- Q<digits>" list items.
    duplicate_blocks = re.finditer(
        r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)', text, re.MULTILINE
    )
    for block in duplicate_blocks:
        found.update(re.findall(r'Q\d+', block.group(1)))

    # Pass 4: catch-all for Q-numbers in any other location.
    found |= set(re.findall(r'\bQ\d+\b', text))

    return found
def extract_q_numbers_from_sections(yaml_path: Path,
                                    exclude_sections: Optional[list] = None) -> Set[str]:
    """
    Extract Q-numbers from the known top-level sections of the YAML file.

    The file is parsed with ``yaml.safe_load`` and only these top-level
    sections are walked: 'sources', 'hypernym', 'entity', 'entity_list',
    'standard' and 'collection'. Within a processed section, every string
    value (at any nesting depth) that starts with ``Q<digits>`` contributes
    that Q-number; this includes the values of 'label' / 'labels' keys.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        exclude_sections: Section names to skip. ``None`` or an empty list
            falls back to ``['exclude']``. Since 'exclude' is not one of the
            processed sections, the fallback skips nothing.

    Returns:
        Set of Q-numbers from the non-excluded sections. Empty when the file
        is empty or its top level is not a mapping.
    """
    # Imported locally so that importing this module does not require PyYAML.
    import yaml

    # A None default avoids sharing one mutable list object across calls.
    if not exclude_sections:
        exclude_sections = ['exclude']

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty document loads as None and a top-level list/scalar has no
    # named sections; either way there is nothing to extract.
    if not isinstance(data, dict):
        return set()

    q_numbers: Set[str] = set()

    # Sections to process
    all_sections = ['sources', 'hypernym', 'entity', 'entity_list', 'standard', 'collection']
    sections_to_process = [s for s in all_sections if s not in exclude_sections]

    def extract_from_structure(obj):
        """Recursively collect Q-numbers from nested dicts, lists and strings."""
        if isinstance(obj, dict):
            # Only values are inspected; 'label' / 'labels' values are
            # strings and are picked up by the string branch below.
            for value in obj.values():
                extract_from_structure(value)
        elif isinstance(obj, list):
            for item in obj:
                extract_from_structure(item)
        elif isinstance(obj, str):
            # Keep the leading Q<digits> even when other text follows it.
            q_match = re.match(r'^(Q\d+)', obj)
            if q_match:
                q_numbers.add(q_match.group(1))

    for section_name in sections_to_process:
        if section_name in data:
            extract_from_structure(data[section_name])

    return q_numbers
def find_data_quality_issues(yaml_path: Path) -> dict:
    """
    Scan hyponyms_curated.yaml (as plain text) for known data quality issues.

    Only the 'labels:' (plural) typo is currently detected, and at most 10
    examples of it are recorded. The remaining issue lists are returned
    empty.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Dict with issue types and examples
    """
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    max_examples = 10

    # "labels:" (plural) followed by a Q-number - should be "label:".
    plural_label_examples = []
    for hit in re.finditer(r'^\s*-?\s*(labels:)\s+(Q\d+)', text, re.MULTILINE):
        if len(plural_label_examples) >= max_examples:
            break
        plural_label_examples.append({
            'line': hit.group(0).strip(),
            'q_number': hit.group(2),
        })

    return {
        'typos': {
            'labels_instead_of_label': plural_label_examples,
            'type_instead_of_types': [],
        },
        'formatting': {
            'inconsistent_indentation': [],
        },
        'data': {
            'q_numbers_in_wrong_location': [],
        },
    }
def main():
    """Run the extraction and the data quality check and print a report."""
    banner = "=" * 70
    # The data file is resolved relative to the parent of this script's directory.
    yaml_path = (
        Path(__file__).parent.parent
        / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    )

    print(banner)
    print("ROBUST Q-NUMBER EXTRACTION TEST")
    print(banner)

    if not yaml_path.exists():
        print(f"❌ File not found: {yaml_path}")
        return

    # Extraction summary
    print(f"\n📂 Extracting from: {yaml_path.name}\n")
    extracted = extract_all_q_numbers(yaml_path)
    print(f"✅ Extracted {len(extracted)} Q-numbers")

    # Data quality summary
    print(f"\n🔍 Checking for data quality issues...\n")
    plural_typos = find_data_quality_issues(yaml_path)['typos']['labels_instead_of_label']
    if not plural_typos:
        print(f"✅ No 'labels:' typos found")
    else:
        print(f"⚠️ Found {len(plural_typos)} instances of 'labels:' (should be 'label:')")
        print(f" Examples:")
        # Show at most the first three offending lines.
        for example in plural_typos[:3]:
            print(f" {example['line']}")

    print("\n" + banner)
# Run the self-test report only when executed as a script, not on import.
if __name__ == "__main__":
    main()