#!/usr/bin/env python3
"""Fast data quality scan - optimized for speed."""
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from datetime import datetime

# Use the C-accelerated YAML loader when available, for speed.
try:
    from yaml import CSafeLoader as SafeLoader
except ImportError:
    from yaml import SafeLoader

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Hoisted out of the per-file hot path: compiled once, reused for every file.
_GHCID_RE = re.compile(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-')

# Dutch stop words ignored when comparing organisation names.
_NAME_STOPWORDS = {'de', 'het', 'van', 'en', 'stichting'}

# Maps original-entry organisation types to their GHCID letter code.
_TYPE_MAP = {'archive': 'A', 'archief': 'A', 'library': 'L',
             'bibliotheek': 'L', 'museum': 'M', 'gallery': 'G'}


def extract_ghcid_type(filename):
    """Return the single-letter GHCID type encoded in *filename*, or None."""
    match = _GHCID_RE.match(filename)
    return match.group(1) if match else None


def _string_checks(content):
    """Cheap substring checks on the raw file text (no YAML parse needed).

    Returns a list of issue-code strings.
    """
    issues = []

    # Absolute paths that should not be baked into the data.
    if '/Volumes/KINGSTON/' in content or '/Users/kempersc/' in content:
        issues.append('absolute_paths')

    # No URL anywhere (a top-level "url:" key or one near the top of the file).
    if '\nurl:' not in content and 'url: ' not in content[:500]:
        issues.append('no_url')

    # HTTP instead of HTTPS.
    if 'url: http://' in content:
        issues.append('http_not_https')

    # Missing or empty digital_platforms.
    if 'digital_platforms:' not in content:
        issues.append('no_digital_platforms')
    elif 'digital_platforms: []\n' in content or 'digital_platforms:\n-' not in content:
        # NOTE(review): this only recognises UNindented block sequences
        # ("digital_platforms:\n- x"); an indented list would be flagged as
        # empty — confirm the files always use the unindented form.
        issues.append('empty_digital_platforms')

    # No verified_claims section.
    if 'verified_claims:' not in content:
        issues.append('no_verified_claims')

    # Wikidata enrichment absent or explicitly NOT_FOUND.
    if "status: NOT_FOUND" in content:
        issues.append('wikidata_not_found')
    elif 'wikidata_enrichment:' not in content:
        issues.append('no_wikidata_enrichment')

    return issues


def _structured_checks(data, ghcid_type):
    """Checks that require the parsed YAML mapping *data*.

    *ghcid_type* is the letter extracted from the filename (may be None).
    Returns a list of issue-code strings.
    """
    issues = []

    # GHCID type letter in the filename vs. the recorded organisation type.
    if 'original_entry' in data:
        oe = data['original_entry']
        expected = None
        if 'type' in oe and oe['type'] and isinstance(oe['type'], list):
            # Presumably the first list element is the GHCID letter — TODO confirm.
            expected = oe['type'][0]
        elif 'type_organisatie' in oe and oe['type_organisatie']:
            expected = _TYPE_MAP.get(oe['type_organisatie'].lower())
        if expected and ghcid_type and ghcid_type != expected:
            issues.append(f'wrong_type:{ghcid_type}→{expected}')

    # Google Maps name vs. organisation name: flag very low word overlap.
    if 'google_maps_enrichment' in data and 'original_entry' in data:
        # "or ''" guards against a key that is present but null (None.lower()
        # would raise); .get default alone does not cover that case.
        gm_name = (data['google_maps_enrichment'].get('name') or '').lower()
        org_name = (data['original_entry'].get('organisatie') or '').lower()
        if gm_name and org_name:
            gm_words = set(gm_name.split()) - _NAME_STOPWORDS
            org_words = set(org_name.split()) - _NAME_STOPWORDS
            if gm_words and org_words:
                overlap = len(gm_words & org_words)
                similarity = overlap / max(len(gm_words), len(org_words))
                if similarity < 0.25:
                    issues.append('google_maps_mismatch')

    # Coordinates must exist, be numeric, and fall inside a rough NL bounding box.
    if 'location' in data:
        loc = data['location']
        lat = loc.get('latitude')
        lon = loc.get('longitude')
        if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)):
            # Also catches non-numeric values, which previously raised
            # TypeError in the comparison below and aborted the scan.
            issues.append('missing_coordinates')
        elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3):
            issues.append('coords_outside_NL')
    else:
        issues.append('no_location')

    return issues


def scan_file_fast(filepath):
    """Fast scan using string operations where possible.

    Runs cheap substring checks first and parses YAML only for the
    structural checks. Returns a list of issue-code strings (empty = clean).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable or mis-encoded file: report it rather than crash the scan.
        return ['parse_error']

    issues = _string_checks(content)

    # Unknown type letter in the filename.
    ghcid_type = extract_ghcid_type(filepath.name)
    if ghcid_type == 'U':
        issues.append('unknown_type_U')

    # Parse YAML only for the checks that need structure.
    try:
        data = yaml.load(content, Loader=SafeLoader)
    except yaml.YAMLError:
        issues.append('yaml_parse_error')
        return issues
    if not data:
        issues.append('empty_file')
        return issues

    issues.extend(_structured_checks(data, ghcid_type))
    return issues


def main():
    """Scan every Dutch custodian file, print a summary, and save a YAML report."""
    print(f"Fast scan started: {datetime.now().isoformat()}")

    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total = len(files)
    print(f"Scanning {total} Dutch custodian files...")

    issue_counts = defaultdict(int)
    files_with_issues = defaultdict(list)
    for fp in files:
        for issue in scan_file_fast(fp):
            issue_counts[issue] += 1
            files_with_issues[issue].append(fp.name)

    print(f"\nScan complete: {datetime.now().isoformat()}")
    print("\n" + "=" * 80)
    print("DATA QUALITY SUMMARY REPORT")
    print("=" * 80)
    print(f"\nTotal files: {total}")

    # Count files with at least one issue.
    all_issue_files = set()
    for files_list in files_with_issues.values():
        all_issue_files.update(files_list)

    # Guard the percentage against an empty directory (ZeroDivisionError).
    issue_pct = 100 * len(all_issue_files) / total if total else 0.0
    print(f"Files with issues: {len(all_issue_files)} ({issue_pct:.1f}%)")
    print(f"Clean files: {total - len(all_issue_files)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN")
    print("-" * 80)
    # Most frequent issues first, with a simple percentage bar.
    for issue, count in sorted(issue_counts.items(), key=lambda x: -x[1]):
        pct = 100 * count / total
        bar = "█" * int(pct / 2)
        print(f"{issue:35} {count:5} ({pct:5.1f}%) {bar}")

    # Critical issues: list the affected files (capped at 15 per issue).
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES (require manual fix)")
    print("=" * 80)
    critical_issues = ['wrong_type:', 'google_maps_mismatch', 'absolute_paths', 'unknown_type_U']
    for critical in critical_issues:
        # Substring match also covers exact equality (e.g. 'wrong_type:A→M').
        matching = [(k, v) for k, v in files_with_issues.items() if critical in k]
        for issue_key, file_list in matching:
            print(f"\n{issue_key} ({len(file_list)} files):")
            for f in file_list[:15]:
                print(f" - {f}")
            if len(file_list) > 15:
                print(f" ... and {len(file_list) - 15} more")

    # Persist the full report next to the data for later diffing.
    report_path = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_fast.yaml'
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total,
        'files_with_issues': len(all_issue_files),
        'issue_counts': dict(sorted(issue_counts.items(), key=lambda x: -x[1])),
        'files_by_issue': dict(files_with_issues),
    }
    with open(report_path, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)
    print(f"\n\nFull report saved: {report_path}")


if __name__ == '__main__':
    main()