#!/usr/bin/env python3
"""
Test YAML integrity for hyponyms_curated.yaml

This script tests:
1. Q-number extraction completeness (all Q-numbers are captured)
2. Nested field preservation (rico, time fields are NOT corrupted)

Usage:
    python scripts/test_yaml_integrity.py
"""

import re
from pathlib import Path
from typing import Set

import yaml

PROJECT_ROOT = Path(__file__).parent.parent
HYPONYMS_FILE = PROJECT_ROOT / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"

# Compiled once at import time; each pattern is used by exactly one extractor.
_LABEL_Q_RE = re.compile(r'^\s*-?\s*label:\s+(Q\d+)', re.MULTILINE)
_DUPLICATE_BLOCK_RE = re.compile(
    r'^\s+duplicate:\s*\n((?:\s+-\s+Q\d+\s*\n?)+)', re.MULTILINE
)
_ANY_Q_RE = re.compile(r'\bQ\d+\b')
_LEADING_Q_RE = re.compile(r'^(Q\d+)')

# Top-level keys walked by the YAML-parse extractor.
_Q_NUMBER_SECTIONS = (
    'hypernym', 'entity', 'entity_list', 'standard', 'collection', 'exclude',
)
# Top-level keys walked by the nested-field check ('exclude' is not scanned).
_NESTED_FIELD_SECTIONS = (
    'hypernym', 'entity', 'entity_list', 'standard', 'collection',
)
# Nested fields whose structure is validated.
_NESTED_FIELDS = ('rico', 'time')


def extract_q_numbers_regex(yaml_path: Path) -> Set[str]:
    """
    Extract Q-numbers using REGEX (current method in
    generate_gallery_query_with_exclusions.py).

    Extracts from:
    1. 'label:' fields - primary Q-number
    2. 'duplicate:' fields - alternative Q-numbers

    Args:
        yaml_path: Path of the YAML file, read as UTF-8 text (not parsed).

    Returns:
        Set of Q-number strings matched by the two patterns.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Pattern 1: "label: Q..." lines (optionally a list item: "- label: Q...").
    q_numbers = {match.group(1) for match in _LABEL_Q_RE.finditer(content)}

    # Pattern 2: every Q-number inside the list that follows "duplicate:".
    for match in _DUPLICATE_BLOCK_RE.finditer(content):
        q_numbers.update(re.findall(r'Q\d+', match.group(1)))

    return q_numbers


def extract_q_numbers_comprehensive(yaml_path: Path) -> Set[str]:
    """
    Extract ALL Q-numbers from file using comprehensive regex.

    The file is treated as plain text, so this catches Q-numbers in label
    fields, duplicate fields, nested rico/time fields and any other location
    where a whole-word 'Q<digits>' token appears.

    Args:
        yaml_path: Path of the YAML file, read as UTF-8 text (not parsed).

    Returns:
        Set of every whole-word Q-number found in the file text.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        content = f.read()

    return set(_ANY_Q_RE.findall(content))


def _collect_leading_q_numbers(node, found: Set[str]) -> None:
    """Add to *found* the Q-number prefix of every string value under *node*.

    Dict values and list items are walked recursively; dict keys are not
    inspected.
    """
    if isinstance(node, dict):
        for value in node.values():
            _collect_leading_q_numbers(value, found)
    elif isinstance(node, list):
        for item in node:
            _collect_leading_q_numbers(item, found)
    elif isinstance(node, str):
        q_match = _LEADING_Q_RE.match(node)
        if q_match:
            found.add(q_match.group(1))


def extract_q_numbers_yaml_parse(yaml_path: Path) -> Set[str]:
    """
    Extract Q-numbers by YAML parsing (most accurate but slower).

    Walks the known top-level sections of the parsed document and collects
    the leading Q-number of any string value (label fields included) that
    starts with 'Q' followed by digits, at any nesting level.

    Args:
        yaml_path: Path of the YAML file to parse with ``yaml.safe_load``.

    Returns:
        Set of Q-number strings; empty when the document is empty or its
        top level is not a mapping.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    q_numbers: Set[str] = set()

    # safe_load returns None for an empty document; a non-mapping top level
    # has no named sections to walk either.
    if not isinstance(data, dict):
        return q_numbers

    for section_name in _Q_NUMBER_SECTIONS:
        if section_name in data:
            _collect_leading_q_numbers(data[section_name], q_numbers)

    return q_numbers


def _has_expected_nested_shape(value) -> bool:
    """Return True if *value* is a non-empty list whose first item is a dict
    containing a 'label' key, e.g. ``[{'label': 'recordSetTypes'}]``."""
    return (
        isinstance(value, list)
        and bool(value)
        and isinstance(value[0], dict)
        and 'label' in value[0]
    )


def _scan_nested_fields(node, results: dict, parent_label=None) -> None:
    """Recursively tally rico/time fields under *node* into *results*.

    *parent_label* is the nearest enclosing 'label' value; it is used when the
    mapping that owns a field has no 'label' of its own.
    """
    if isinstance(node, dict):
        # Resolve the owning mapping's label BEFORE recording examples so a
        # corrupted field is attributed to its own entry, not to an ancestor.
        owner_label = node.get('label', parent_label)

        for field in _NESTED_FIELDS:
            if field not in node:
                continue
            results[f'{field}_fields'] += 1
            value = node[field]
            if not _has_expected_nested_shape(value):
                results[f'{field}_corrupted'] += 1
                results[f'{field}_examples'].append({
                    'label': owner_label,
                    'structure': value,
                })

        for value in node.values():
            _scan_nested_fields(value, results, owner_label)
    elif isinstance(node, list):
        for item in node:
            _scan_nested_fields(item, results, parent_label)


def check_nested_field_corruption(yaml_path: Path) -> dict:
    """
    Check if nested fields (rico, time) are properly structured.

    A field counts as correctly structured when it is a non-empty list whose
    first element is a dict with a 'label' key, for example
    ``[{'label': 'Renaissance'}]`` or
    ``[{'label': '1960s', 'type': [{'label': 'start'}]}]``.

    Args:
        yaml_path: Path of the YAML file to parse with ``yaml.safe_load``.

    Returns:
        Dict with corruption status and details. For each of 'rico' and
        'time' it holds '<field>_fields' (count found), '<field>_corrupted'
        (count failing the check) and '<field>_examples' (list of
        ``{'label': ..., 'structure': ...}`` dicts for the failing fields).
        All counts are zero when the document is empty or not a mapping.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    results = {
        'rico_fields': 0,
        'rico_corrupted': 0,
        'rico_examples': [],
        'time_fields': 0,
        'time_corrupted': 0,
        'time_examples': [],
    }

    # Empty document (None) or non-mapping top level: nothing to scan.
    if not isinstance(data, dict):
        return results

    for section_name in _NESTED_FIELD_SECTIONS:
        if section_name in data:
            _scan_nested_fields(data[section_name], results)

    return results


def main():
    """Main test function.

    Runs both integrity tests against HYPONYMS_FILE and prints a report.
    Returns early (after printing a message) when the file does not exist.
    """
    print("=" * 70)
    print("YAML INTEGRITY TEST - hyponyms_curated.yaml")
    print("=" * 70)

    if not HYPONYMS_FILE.exists():
        print(f"❌ File not found: {HYPONYMS_FILE}")
        return

    print(f"\n📂 Testing: {HYPONYMS_FILE.name}\n")

    # Test 1: Q-number extraction completeness
    print("=" * 70)
    print("TEST 1: Q-NUMBER EXTRACTION COMPLETENESS")
    print("=" * 70)

    print("\n⏳ Extracting Q-numbers using different methods...\n")

    regex_q_numbers = extract_q_numbers_regex(HYPONYMS_FILE)
    comprehensive_q_numbers = extract_q_numbers_comprehensive(HYPONYMS_FILE)
    yaml_parse_q_numbers = extract_q_numbers_yaml_parse(HYPONYMS_FILE)

    print(f"Method 1 (REGEX - current): {len(regex_q_numbers)} Q-numbers")
    print(f"Method 2 (COMPREHENSIVE): {len(comprehensive_q_numbers)} Q-numbers")
    print(f"Method 3 (YAML PARSE): {len(yaml_parse_q_numbers)} Q-numbers")

    # The comprehensive scan is the reference; report what the others miss.
    missing_in_regex = comprehensive_q_numbers - regex_q_numbers
    missing_in_yaml_parse = comprehensive_q_numbers - yaml_parse_q_numbers

    print("\n📊 Analysis:")
    print(f" Missing in REGEX method: {len(missing_in_regex)} Q-numbers")
    print(f" Missing in YAML PARSE method: {len(missing_in_yaml_parse)} Q-numbers")

    if missing_in_regex:
        missing_sorted = sorted(missing_in_regex)
        print("\n⚠️ WARNING: REGEX method is INCOMPLETE!")
        print(f" Missing Q-numbers (first 20): {missing_sorted[:20]}")

        # Investigate where missing Q-numbers appear
        with open(HYPONYMS_FILE, 'r', encoding='utf-8') as f:
            file_lines = f.read().split('\n')

        print("\n🔍 Where do missing Q-numbers appear?")
        for q in missing_sorted[:5]:
            # Whole-token match so that e.g. Q12 does not hit lines that only
            # mention Q123.
            token_re = re.compile(rf'\b{re.escape(q)}\b')
            lines = [line for line in file_lines if token_re.search(line)]
            if lines:
                print(f"\n {q}:")
                for line in lines[:2]:
                    print(f" {line.strip()}")

    if missing_in_yaml_parse:
        print("\n⚠️ WARNING: YAML PARSE method is INCOMPLETE!")
        print(f" Missing Q-numbers (first 20): {sorted(missing_in_yaml_parse)[:20]}")

    if not missing_in_regex and not missing_in_yaml_parse:
        print("\n✅ All methods extracted the same Q-numbers")

    # Test 2: Nested field corruption check
    print("\n" + "=" * 70)
    print("TEST 2: NESTED FIELD CORRUPTION (rico, time)")
    print("=" * 70)

    print("\n⏳ Checking nested field structures...\n")

    corruption_results = check_nested_field_corruption(HYPONYMS_FILE)

    # The time report is separated from the rico report by a blank line.
    for field, lead in (('rico', ''), ('time', '\n')):
        title = field.capitalize()
        corrupted = corruption_results[f'{field}_corrupted']

        print(f"{lead}{title} fields found: {corruption_results[f'{field}_fields']}")
        print(f"{title} fields corrupted: {corrupted}")

        if corrupted > 0:
            print(f"\n❌ CORRUPTION DETECTED in {field} fields!")
            print(" Examples (first 3):")
            for ex in corruption_results[f'{field}_examples'][:3]:
                print(f" Label: {ex['label']}")
                print(f" Structure: {ex['structure']}")
        else:
            print(f"✅ All {field} fields are correctly structured")

    # Summary
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)

    rico_corrupted = corruption_results['rico_corrupted']
    time_corrupted = corruption_results['time_corrupted']

    issues_found = []

    if missing_in_regex:
        issues_found.append(
            f"Q-number extraction incomplete ({len(missing_in_regex)} missing)"
        )

    if rico_corrupted > 0:
        issues_found.append(f"Rico field corruption ({rico_corrupted} fields)")

    if time_corrupted > 0:
        issues_found.append(f"Time field corruption ({time_corrupted} fields)")

    if issues_found:
        print("\n❌ ISSUES FOUND:")
        for i, issue in enumerate(issues_found, 1):
            print(f" {i}. {issue}")

        print("\n⚠️ Recommendations:")
        if missing_in_regex:
            print(" - Update extract_q_numbers_from_yaml() to use comprehensive regex")
            print(" - Or switch to YAML parsing method for complete extraction")
        if rico_corrupted > 0 or time_corrupted > 0:
            print(" - Review enrich_hyponyms_with_wikidata.py for field preservation")
            print(" - Ensure nested structures are preserved during enrichment")
    else:
        print("\n✅ NO ISSUES FOUND - All tests passed!")

    print("\n" + "=" * 70)


if __name__ == '__main__':
    main()