#!/usr/bin/env python3
"""
Comprehensive data quality scan for Dutch custodian YAML files.

Identifies issues like wrong GHCID types, missing web claims,
Google Maps mismatches, absolute paths, coordinate problems, etc.
Writes a summary to stdout and a detailed YAML report next to the data.
"""

import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from datetime import datetime

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Issue categories. NOTE(review): main() accumulates into its own locals;
# this module-level mapping appears unused but is kept for compatibility.
issues = defaultdict(list)


def extract_ghcid_type(filename):
    """Extract type code from GHCID filename (e.g., NL-ZH-ZOE-A-SAZS -> A)."""
    match = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    return match.group(1) if match else None


def get_expected_type(data):
    """Determine expected type from original_entry or other fields.

    Returns a single-letter code ('A', 'L', 'M', 'G', ...) or None when the
    record carries no usable type information.
    """
    oe = data.get('original_entry')
    if not isinstance(oe, dict):
        return None
    # Preferred source: an explicit list of type codes.
    types = oe.get('type')
    if isinstance(types, list) and len(types) > 0:
        return types[0]
    # Fallback: map the Dutch/English organisation-type label to a code.
    type_org = oe.get('type_organisatie')
    if type_org:
        type_map = {
            'archive': 'A', 'archief': 'A',
            'library': 'L', 'bibliotheek': 'L',
            'museum': 'M',
            'gallery': 'G', 'galerie': 'G',
        }
        return type_map.get(type_org.lower(), None)
    return None


def check_google_maps_mismatch(data, filename):
    """Check if Google Maps name doesn't match organization name.

    Returns a dict with both names and a word-overlap similarity score
    when similarity < 0.3, otherwise None.
    """
    gm = data.get('google_maps_enrichment')
    if not isinstance(gm, dict):  # missing or null enrichment: nothing to compare
        return None
    gm_name = gm.get('name', '')

    # Organisation name: original_entry.organisatie, overridden by a
    # structured custodian_name claim when one is present.
    org_name = ''
    if 'original_entry' in data:
        org_name = data['original_entry'].get('organisatie', '')
    cn = data.get('custodian_name')
    if isinstance(cn, dict):
        org_name = cn.get('claim_value', org_name)

    if not gm_name or not org_name:
        return None

    # Simple similarity check - if names share less than 30% of words, flag it.
    # Dutch stopwords and legal forms are dropped so they don't inflate overlap.
    stopwords = {'de', 'het', 'van', 'en', 'in', 'te', 'der', 'voor',
                 'stichting', 'vereniging'}
    gm_words = set(gm_name.lower().split()) - stopwords
    org_words = set(org_name.lower().split()) - stopwords
    if len(gm_words) == 0 or len(org_words) == 0:
        return None

    overlap = len(gm_words & org_words)
    similarity = overlap / max(len(gm_words), len(org_words))
    if similarity < 0.3:
        return {
            'google_name': gm_name,
            'org_name': org_name,
            'similarity': round(similarity, 2),
        }
    return None


def check_absolute_paths(data, filename):
    """Check for absolute paths that should be relative.

    Serializes the record back to YAML and greps for machine-specific
    path prefixes. Returns the matched prefixes or None.
    """
    yaml_str = yaml.dump(data, default_flow_style=False)
    abs_paths = []
    patterns = [
        r'/Volumes/KINGSTON/',
        r'/Users/kempersc/',
        r'/mnt/',
        r'C:\\',
        r'D:\\',
    ]
    for pattern in patterns:
        if re.search(pattern, yaml_str):
            abs_paths.append(pattern.rstrip('/\\'))
    return abs_paths if abs_paths else None


def check_web_claims(data, filename):
    """Check web claims quality (presence, verification, XPath provenance)."""
    issues_found = []
    wc = data.get('web_claims')
    # Treat both a missing key and a null value as "no web claims".
    if not isinstance(wc, dict):
        return ['no_web_claims']

    # Check if claims exist
    claims = wc.get('claims', [])
    if not claims:
        issues_found.append('empty_claims')

    # Check for verified_claims
    if 'verified_claims' not in wc:
        issues_found.append('no_verified_claims')
    else:
        vc = wc['verified_claims']
        if isinstance(vc, dict):
            vc_claims = vc.get('claims', [])
            # Each verified claim should record the XPath it was scraped from.
            claims_without_xpath = 0
            for claim in vc_claims:
                if isinstance(claim, dict) and 'xpath' not in claim:
                    claims_without_xpath += 1
            if claims_without_xpath > 0:
                issues_found.append(f'claims_missing_xpath:{claims_without_xpath}')

    return issues_found if issues_found else None


def check_coordinates(data, filename):
    """Check for coordinate issues (missing, non-numeric, outside NL)."""
    issues_found = []

    # Treat a missing key and a null value the same way.
    loc = data.get('location')
    if not isinstance(loc, dict):
        issues_found.append('no_location')
        return issues_found

    lat = loc.get('latitude')
    lon = loc.get('longitude')
    if lat is None or lon is None:
        issues_found.append('missing_coordinates')
    else:
        try:
            # YAML sometimes carries coordinates as strings; a direct float
            # comparison previously raised TypeError and aborted the scan.
            lat_f, lon_f = float(lat), float(lon)
        except (TypeError, ValueError):
            issues_found.append('non_numeric_coordinates')
        else:
            if not (50.5 < lat_f < 53.7 and 3.3 < lon_f < 7.3):
                # Rough Netherlands bounding box
                issues_found.append('coordinates_outside_netherlands')

    # Check if coordinates from Google Maps differ significantly from corrected:
    # a recorded previous position means the coordinates were corrected once.
    if 'coordinate_provenance' in loc:
        prov = loc['coordinate_provenance']
        if 'previous_coordinates' in prov:
            issues_found.append('has_coordinate_correction')

    return issues_found if issues_found else None


def check_digital_platforms(data, filename):
    """Check for missing digital platforms."""
    # A missing key, null value, or empty list are all "no platforms"
    # (the former separate 'empty_digital_platforms' branch was unreachable:
    # an empty list is falsy and was already caught here).
    if 'digital_platforms' not in data or not data['digital_platforms']:
        return ['no_digital_platforms']
    return None


def check_identifiers(data, filename):
    """Check identifier completeness (ISIL and GHCID schemes expected)."""
    issues_found = []
    if 'identifiers' not in data:
        issues_found.append('no_identifiers')
        return issues_found

    ids = data['identifiers'] or []  # guard against a present-but-null key
    id_types = [i.get('identifier_scheme') for i in ids if isinstance(i, dict)]
    if 'ISIL' not in id_types:
        issues_found.append('no_isil')
    if 'GHCID' not in id_types:
        issues_found.append('no_ghcid')

    return issues_found if issues_found else None


def check_wikidata(data, filename):
    """Check Wikidata enrichment status.

    Returns an issue string or None when enrichment succeeded.
    """
    if 'wikidata_enrichment' not in data:
        return 'no_wikidata_enrichment'
    wd = data['wikidata_enrichment'] or {}  # guard against a null value
    status = wd.get('status', '')
    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    elif status in ['SUCCESS', 'ENRICHED']:
        return None
    else:
        return f'wikidata_status:{status}'


def check_url(data, filename):
    """Check URL issues (missing, insecure scheme, past corrections)."""
    issues_found = []
    url = data.get('url', '')
    if not url:
        issues_found.append('no_url')
    elif url.startswith('http://'):
        issues_found.append('http_not_https')

    # Check if URL was corrected (indicates previous wrong URL)
    if 'url_correction' in data:
        issues_found.append('has_url_correction')

    return issues_found if issues_found else None


def scan_file(filepath):
    """Scan a single file for all issue types.

    Returns a dict mapping issue category -> details; empty dict when clean.
    """
    filename = filepath.name
    file_issues = {}

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best-effort scan: record the parse failure instead of aborting.
        return {'parse_error': str(e)}

    if not data:
        return {'empty_file': True}

    # 1. Check GHCID type mismatch
    ghcid_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)
    if ghcid_type and expected_type and ghcid_type != expected_type:
        # Conservative: only flag U-coded files whose data implies a real type.
        if ghcid_type == 'U' and expected_type != 'U':
            file_issues['wrong_ghcid_type'] = {
                'current': ghcid_type,
                'expected': expected_type,
            }
    # Also check for U type that should be something else
    if ghcid_type == 'U':
        file_issues['unknown_type'] = True

    # 2. Check Google Maps mismatch
    gm_mismatch = check_google_maps_mismatch(data, filename)
    if gm_mismatch:
        file_issues['google_maps_mismatch'] = gm_mismatch

    # 3. Check absolute paths
    abs_paths = check_absolute_paths(data, filename)
    if abs_paths:
        file_issues['absolute_paths'] = abs_paths

    # 4. Check web claims
    wc_issues = check_web_claims(data, filename)
    if wc_issues:
        file_issues['web_claims_issues'] = wc_issues

    # 5. Check coordinates
    coord_issues = check_coordinates(data, filename)
    if coord_issues:
        file_issues['coordinate_issues'] = coord_issues

    # 6. Check digital platforms
    dp_issues = check_digital_platforms(data, filename)
    if dp_issues:
        file_issues['digital_platform_issues'] = dp_issues

    # 7. Check identifiers
    id_issues = check_identifiers(data, filename)
    if id_issues:
        file_issues['identifier_issues'] = id_issues

    # 8. Check Wikidata
    wd_issue = check_wikidata(data, filename)
    if wd_issue:
        file_issues['wikidata_issue'] = wd_issue

    # 9. Check URL
    url_issues = check_url(data, filename)
    if url_issues:
        file_issues['url_issues'] = url_issues

    return file_issues


def main():
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)

    # Collect all issues
    all_issues = {}
    issue_counts = defaultdict(int)
    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)
    print(f"Found {total_files} Dutch custodian files\n")

    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)
        file_issues = scan_file(filepath)
        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues.keys():
                issue_counts[issue_type] += 1

    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)

    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)
    # Sort issues by count
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])
    for issue_type, count in sorted_issues:
        pct = (count / total_files) * 100
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")

    # Detailed breakdown for critical issues
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)

    # 1. Wrong GHCID type
    wrong_type_files = [(f, d) for f, d in all_issues.items()
                        if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, data in wrong_type_files[:20]:
            info = data['wrong_ghcid_type']
            # Fixed: previously printed a literal "(unknown)" placeholder
            # instead of the filename bound in this loop.
            print(f"  {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f"  ... and {len(wrong_type_files) - 20} more")
    else:
        print("  None found")

    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items()
                         if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, data in gm_mismatch_files[:20]:
            info = data['google_maps_mismatch']
            print(f"  {filename}")
            print(f"    Google: {info['google_name']}")
            print(f"    Org:    {info['org_name']}")
            print(f"    Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f"  ... and {len(gm_mismatch_files) - 20} more")
    else:
        print("  None found")

    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items()
                      if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, data in abs_path_files[:10]:
            print(f"  {filename}: {data['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f"  ... and {len(abs_path_files) - 10} more")
    else:
        print("  None found")

    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            print(f"  {filename}")
        if len(unknown_type_files) > 30:
            print(f"  ... and {len(unknown_type_files) - 30} more")
    else:
        print("  None found")

    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)

    # Web claims issues
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d
                          and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")

    # Digital platforms
    no_platforms = [f for f, d in all_issues.items()
                    if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")

    # Wikidata
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment',
                                                  'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")

    # URLs
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")

    # Save detailed report
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    # parents=True: previously failed when the reports directory's parent
    # did not exist yet.
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues,
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)

    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")


if __name__ == '__main__':
    main()