203 lines
6.3 KiB
Python
203 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix and improve Q-number extraction for hyponyms_curated.yaml
|
|
|
|
This script provides an improved Q-number extraction function that:
|
|
1. Handles both 'label:' and 'labels:' (typo case)
|
|
2. Extracts from all nested locations
|
|
3. Provides comprehensive coverage
|
|
|
|
Usage:
|
|
from scripts.extract_q_numbers_robust import extract_all_q_numbers
|
|
|
|
q_numbers = extract_all_q_numbers(yaml_path)
|
|
"""
|
|
|
|
import re
from pathlib import Path
from typing import Optional, Set
|
|
|
|
|
|
def extract_all_q_numbers(yaml_path: Path) -> Set[str]:
    """
    Collect every Q-number found in the raw text of hyponyms_curated.yaml.

    The file is scanned as plain text (it is not parsed as YAML). Four
    passes are unioned into one set:

    - values of 'label:' keys (the correct, singular spelling)
    - values of 'labels:' keys (the plural typo)
    - list items underneath an indented 'duplicate:' key
    - any word-bounded 'Q<digits>' token anywhere in the text

    The keyed passes capture the leading 'Q<digits>' of a value even when
    it is directly followed by other word characters, which the
    word-bounded catch-all would skip.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Set of Q-numbers (e.g., {'Q167346', 'Q43501', ...})
    """
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    found = set()

    # Passes 1 and 2: 'label:' (correct) and 'labels:' (typo) keys.
    # findall() with a single group yields the captured Q-number directly.
    keyed_patterns = (
        r'^\s*-?\s*label:\s+(Q\d+)',
        r'^\s*-?\s*labels:\s+(Q\d+)',
    )
    for keyed_pattern in keyed_patterns:
        found.update(re.findall(keyed_pattern, text, re.MULTILINE))

    # Pass 3: the list items that follow an indented "duplicate:" key.
    duplicate_blocks = re.finditer(
        r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)', text, re.MULTILINE
    )
    for block in duplicate_blocks:
        found.update(re.findall(r'Q\d+', block.group(1)))

    # Pass 4: catch-all so Q-numbers in unexpected places are not missed.
    found.update(re.findall(r'\bQ\d+\b', text))

    return found
|
|
|
|
|
|
def _collect_q_numbers(node, found: Set[str]) -> None:
    """Add the leading Q-number of every string scalar under *node* to *found*."""
    if isinstance(node, dict):
        # Only values are inspected; mapping keys are never treated as
        # Q-numbers. This covers 'label'/'labels' values as well, so no
        # key-specific branch is needed.
        for value in node.values():
            _collect_q_numbers(value, found)
    elif isinstance(node, list):
        for item in node:
            _collect_q_numbers(item, found)
    elif isinstance(node, str):
        # Only strings that START with Q<digits> count; the digits are
        # captured even if other text follows (e.g. "Q42 museum" -> "Q42").
        q_match = re.match(r'^(Q\d+)', node)
        if q_match:
            found.add(q_match.group(1))


def extract_q_numbers_from_sections(yaml_path: Path,
                                    exclude_sections: Optional[list] = None) -> Set[str]:
    """
    Extract Q-numbers from specific top-level sections only.

    The file is parsed with ``yaml.safe_load`` and the known sections
    ('sources', 'hypernym', 'entity', 'entity_list', 'standard',
    'collection') are walked recursively, minus any listed in
    ``exclude_sections``. Sections missing from the file are skipped.

    Args:
        yaml_path: Path to hyponyms_curated.yaml
        exclude_sections: Section names to skip. ``None`` or an empty list
            falls back to ``['exclude']``; since 'exclude' is not one of
            the known sections, the default skips nothing.

    Returns:
        Set of Q-numbers from non-excluded sections. Empty when the
        document is empty or its top level is not a mapping.
    """
    # A None default replaces the original mutable [] default; both None
    # and [] select the same fallback, so existing callers are unaffected.
    excluded = list(exclude_sections) if exclude_sections else ['exclude']

    # Kept function-local, as in the original, so importing this module
    # does not itself require PyYAML.
    import yaml

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    q_numbers: Set[str] = set()

    # safe_load returns None for an empty document and may return a list or
    # scalar; none of those have named sections to walk.
    if not isinstance(data, dict):
        return q_numbers

    all_sections = ['sources', 'hypernym', 'entity', 'entity_list', 'standard', 'collection']
    for section_name in all_sections:
        if section_name in excluded or section_name not in data:
            continue
        _collect_q_numbers(data[section_name], q_numbers)

    return q_numbers
|
|
|
|
|
|
def find_data_quality_issues(yaml_path: Path) -> dict:
    """
    Scan hyponyms_curated.yaml for known data quality problems.

    Only the "labels:" (plural) typo is detected here, using a regex over
    the raw file text; at most 10 examples are recorded. The remaining
    categories are present in the result but are always empty lists.

    Args:
        yaml_path: Path to hyponyms_curated.yaml

    Returns:
        Dict with issue types and examples. Each recorded typo is a dict
        with the stripped matched text under 'line' and the Q-number under
        'q_number'.
    """
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    max_examples = 10  # cap on recorded examples
    plural_examples = []

    # "labels:" (plural) should have been written "label:".
    for hit in re.finditer(r'^\s*-?\s*(labels:)\s+(Q\d+)', text, re.MULTILINE):
        if len(plural_examples) == max_examples:
            break
        plural_examples.append({
            'line': hit.group(0).strip(),
            'q_number': hit.group(2),
        })

    return {
        'typos': {
            'labels_instead_of_label': plural_examples,
            'type_instead_of_types': [],
        },
        'formatting': {
            'inconsistent_indentation': [],
        },
        'data': {
            'q_numbers_in_wrong_location': [],
        },
    }
|
|
|
|
|
|
def main():
    """Run the extraction helpers on the curated hyponyms file and print a report."""
    banner = "=" * 70
    # The data file is resolved relative to the directory above this script's folder.
    yaml_path = (
        Path(__file__).parent.parent
        / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    )

    print(banner)
    print("ROBUST Q-NUMBER EXTRACTION TEST")
    print(banner)

    # Nothing to report on if the data file is missing.
    if not yaml_path.exists():
        print(f"❌ File not found: {yaml_path}")
        return

    # Step 1: raw-text Q-number extraction.
    print(f"\n📂 Extracting from: {yaml_path.name}\n")
    extracted = extract_all_q_numbers(yaml_path)
    print(f"✅ Extracted {len(extracted)} Q-numbers")

    # Step 2: data quality scan.
    print("\n🔍 Checking for data quality issues...\n")
    quality_report = find_data_quality_issues(yaml_path)
    plural_typos = quality_report['typos']['labels_instead_of_label']

    if not plural_typos:
        print("✅ No 'labels:' typos found")
    else:
        # NOTE: find_data_quality_issues records at most 10 examples, so
        # this count is a lower bound once it reaches 10.
        print(f"⚠️ Found {len(plural_typos)} instances of 'labels:' (should be 'label:')")
        print(" Examples:")
        for example in plural_typos[:3]:
            print(f" {example['line']}")

    print("\n" + banner)
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|