glam/scripts/scan_dutch_data_quality.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

445 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive data quality scan for Dutch custodian YAML files.
Identifies issues like wrong GHCID types, missing web claims, Google Maps mismatches, etc.
"""
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from datetime import datetime
# Root directory holding the per-custodian YAML records (machine-specific
# absolute path — adjust when running on another machine).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Issue categories
# NOTE(review): this module-level accumulator appears unused — main() builds
# its own `all_issues` dict; confirm before removing.
issues = defaultdict(list)
def extract_ghcid_type(filename):
    """Return the single-letter type code embedded in a GHCID filename.

    Example: 'NL-ZH-ZOE-A-SAZS' -> 'A'. Returns None when the name does not
    start with the NL-<prov>-<municipality>-<type>- pattern.
    """
    m = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    if m is None:
        return None
    return m.group(1)
def get_expected_type(data):
    """Determine the expected GHCID type code from a record's original_entry.

    Checks original_entry.type first (list or scalar), then falls back to
    mapping original_entry.type_organisatie (Dutch/English organisation
    labels) onto the single-letter codes. Returns None when no type can be
    derived.
    """
    oe = data.get('original_entry')
    if not isinstance(oe, dict):
        # Robustness: also covers a missing or malformed original_entry.
        return None
    types = oe.get('type')
    if types:
        if isinstance(types, list):
            return types[0]
        # Fix: a scalar (string) type value was previously ignored and the
        # lookup fell through to type_organisatie; accept it directly.
        return types
    type_org = oe.get('type_organisatie')
    if type_org:
        type_map = {
            'archive': 'A', 'archief': 'A',
            'library': 'L', 'bibliotheek': 'L',
            'museum': 'M',
            'gallery': 'G', 'galerie': 'G',
        }
        return type_map.get(type_org.lower(), None)
    return None
def check_google_maps_mismatch(data, filename):
    """Flag records whose Google Maps name barely resembles the org name.

    Compares the word sets of both names (minus common Dutch stopwords);
    when the overlap ratio falls below 0.3 a dict with both names and the
    rounded score is returned, otherwise None.
    """
    if 'google_maps_enrichment' not in data:
        return None
    enrichment = data['google_maps_enrichment']
    maps_name = enrichment.get('name', '')
    # Organisation name: original_entry.organisatie, overridden by a
    # structured custodian_name claim when present.
    org_name = ''
    if 'original_entry' in data:
        org_name = data['original_entry'].get('organisatie', '')
    if 'custodian_name' in data:
        custodian = data['custodian_name']
        if isinstance(custodian, dict):
            org_name = custodian.get('claim_value', org_name)
    if not maps_name or not org_name:
        return None
    # Strip stopwords so filler words don't inflate similarity.
    stopwords = {'de', 'het', 'van', 'en', 'in', 'te', 'der', 'voor', 'stichting', 'vereniging'}
    maps_words = set(maps_name.lower().split()) - stopwords
    org_words = set(org_name.lower().split()) - stopwords
    if not maps_words or not org_words:
        return None
    score = len(maps_words & org_words) / max(len(maps_words), len(org_words))
    if score >= 0.3:
        return None
    return {
        'google_name': maps_name,
        'org_name': org_name,
        'similarity': round(score, 2),
    }
def check_absolute_paths(data, filename):
    """Return machine-specific path prefixes found anywhere in the record.

    The record is serialized back to YAML and scanned for known absolute
    path prefixes (removable volumes, home directories, Windows drives).
    Returns the list of offending prefixes (trailing separators stripped),
    or None when the record is clean.
    """
    serialized = yaml.dump(data, default_flow_style=False)
    prefixes = (
        r'/Volumes/KINGSTON/',
        r'/Users/kempersc/',
        r'/mnt/',
        r'C:\\',
        r'D:\\',
    )
    found = [p.rstrip('/\\') for p in prefixes if re.search(p, serialized)]
    return found or None
def check_web_claims(data, filename):
    """Report quality problems with a record's web_claims section.

    Returns a list of issue tags ('no_web_claims', 'empty_claims',
    'no_verified_claims', 'claims_missing_xpath:<n>'), or None when the
    section looks healthy.
    """
    if 'web_claims' not in data:
        return ['no_web_claims']
    web_claims = data['web_claims']
    problems = []
    if not web_claims.get('claims', []):
        problems.append('empty_claims')
    if 'verified_claims' not in web_claims:
        problems.append('no_verified_claims')
    else:
        verified = web_claims['verified_claims']
        if isinstance(verified, dict):
            # Every verified claim should carry XPath provenance.
            missing_xpath = sum(
                1 for claim in verified.get('claims', [])
                if isinstance(claim, dict) and 'xpath' not in claim
            )
            if missing_xpath:
                problems.append(f'claims_missing_xpath:{missing_xpath}')
    return problems or None
def check_coordinates(data, filename):
    """Report problems with a record's location coordinates.

    Returns a list of issue tags ('no_location', 'missing_coordinates',
    'non_numeric_coordinates', 'coordinates_outside_netherlands',
    'has_coordinate_correction'), or None when nothing is wrong.
    """
    if 'location' not in data:
        return ['no_location']
    issues_found = []
    loc = data['location']
    lat = loc.get('latitude')
    lon = loc.get('longitude')
    if lat is None or lon is None:
        issues_found.append('missing_coordinates')
    elif not (isinstance(lat, (int, float)) and isinstance(lon, (int, float))):
        # Fix: YAML may deliver coordinates as strings; the original code
        # raised TypeError when comparing them against floats below.
        issues_found.append('non_numeric_coordinates')
    elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3):
        # Rough Netherlands bounding box.
        issues_found.append('coordinates_outside_netherlands')
    # A recorded previous_coordinates entry means the coordinates were
    # corrected at some point — worth surfacing for review.
    if 'coordinate_provenance' in loc:
        if 'previous_coordinates' in loc['coordinate_provenance']:
            issues_found.append('has_coordinate_correction')
    return issues_found or None
def check_digital_platforms(data, filename):
    """Return ['no_digital_platforms'] when the record lists none, else None.

    Note: the original code also had an 'empty_digital_platforms' branch,
    but it was unreachable — an empty list is falsy and was already caught
    by the first check — so it has been removed.
    """
    if not data.get('digital_platforms'):
        return ['no_digital_platforms']
    return None
def check_identifiers(data, filename):
    """Check that the record carries both an ISIL and a GHCID identifier.

    Returns a list of issue tags ('no_identifiers', 'no_isil', 'no_ghcid'),
    or None when both identifier schemes are present.
    """
    if 'identifiers' not in data:
        return ['no_identifiers']
    # Collect the schemes present; non-dict entries are ignored.
    schemes = {
        entry.get('identifier_scheme')
        for entry in data['identifiers']
        if isinstance(entry, dict)
    }
    missing = []
    if 'ISIL' not in schemes:
        missing.append('no_isil')
    if 'GHCID' not in schemes:
        missing.append('no_ghcid')
    return missing or None
def check_wikidata(data, filename):
    """Classify the record's Wikidata enrichment status.

    Returns 'no_wikidata_enrichment', 'wikidata_not_found',
    'wikidata_status:<status>' for any other status, or None when the
    enrichment succeeded (SUCCESS / ENRICHED).
    """
    if 'wikidata_enrichment' not in data:
        return 'no_wikidata_enrichment'
    status = data['wikidata_enrichment'].get('status', '')
    if status in ('SUCCESS', 'ENRICHED'):
        return None
    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    return f'wikidata_status:{status}'
def check_url(data, filename):
    """Report URL problems: absent URL, plain-HTTP URL, or a past correction.

    Returns a list of issue tags ('no_url', 'http_not_https',
    'has_url_correction'), or None when the URL looks fine.
    """
    problems = []
    url = data.get('url', '')
    if not url:
        problems.append('no_url')
    elif url.startswith('http://'):
        problems.append('http_not_https')
    # A url_correction entry indicates a previously wrong URL.
    if 'url_correction' in data:
        problems.append('has_url_correction')
    return problems or None
def scan_file(filepath):
    """Run every quality check against a single YAML file.

    Returns a dict keyed by issue category; an empty dict means the file is
    clean. YAML parse failures and empty files are reported as issues
    themselves ('parse_error', 'empty_file').
    """
    filename = filepath.name
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            data = yaml.safe_load(fh)
    except Exception as exc:
        return {'parse_error': str(exc)}
    if not data:
        return {'empty_file': True}
    found = {}
    # GHCID type checks. NOTE(review): only files currently typed 'U'
    # (unknown) are flagged for retyping; mismatches between two concrete
    # types (e.g. 'A' vs 'M') are not reported — confirm this is intended.
    ghcid_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)
    if ghcid_type and expected_type and ghcid_type != expected_type:
        if ghcid_type == 'U' and expected_type != 'U':
            found['wrong_ghcid_type'] = {
                'current': ghcid_type,
                'expected': expected_type,
            }
    # Every 'U'-typed file is additionally flagged as unknown.
    if ghcid_type == 'U':
        found['unknown_type'] = True
    # The remaining checks share the (data, filename) signature; map each
    # checker to the issue key its result is recorded under.
    checkers = (
        ('google_maps_mismatch', check_google_maps_mismatch),
        ('absolute_paths', check_absolute_paths),
        ('web_claims_issues', check_web_claims),
        ('coordinate_issues', check_coordinates),
        ('digital_platform_issues', check_digital_platforms),
        ('identifier_issues', check_identifiers),
        ('wikidata_issue', check_wikidata),
        ('url_issues', check_url),
    )
    for key, checker in checkers:
        result = checker(data, filename)
        if result:
            found[key] = result
    return found
def main():
    """Scan every Dutch custodian YAML file and report data quality issues.

    Prints a progress log, a summary, a per-category breakdown, details for
    the critical issue categories, and enrichment gaps; the full result set
    is saved as YAML under <data>/reports/dutch_data_quality_scan.yaml.
    """
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)
    # Per-file issue dicts keyed by filename, plus per-category counters.
    all_issues = {}
    issue_counts = defaultdict(int)
    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)
    print(f"Found {total_files} Dutch custodian files\n")
    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)
        file_issues = scan_file(filepath)
        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues:
                issue_counts[issue_type] += 1
    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)
    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")
    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)
    # Most frequent categories first.
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])
    for issue_type, count in sorted_issues:
        # Fix: guard against an empty scan directory (ZeroDivisionError).
        pct = (count / total_files) * 100 if total_files else 0.0
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")
    # Detailed breakdown for critical issues.
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)
    # 1. Wrong GHCID type
    # Fix: these listings previously printed a literal "(unknown)" instead
    # of the affected filename, making the report useless for follow-up.
    wrong_type_files = [(f, d) for f, d in all_issues.items() if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, details in wrong_type_files[:20]:
            info = details['wrong_ghcid_type']
            print(f"  {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f"  ... and {len(wrong_type_files) - 20} more")
    else:
        print("  None found")
    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items() if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, details in gm_mismatch_files[:20]:
            info = details['google_maps_mismatch']
            print(f"  {filename}")
            print(f"    Google: {info['google_name']}")
            print(f"    Org:    {info['org_name']}")
            print(f"    Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f"  ... and {len(gm_mismatch_files) - 20} more")
    else:
        print("  None found")
    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items() if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, details in abs_path_files[:10]:
            print(f"  {filename}: {details['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f"  ... and {len(abs_path_files) - 10} more")
    else:
        print("  None found")
    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            print(f"  {filename}")
        if len(unknown_type_files) > 30:
            print(f"  ... and {len(unknown_type_files) - 30} more")
    else:
        print("  None found")
    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)
    # 5. Web claims lacking a verified_claims section.
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")
    # 6. Records without any digital platforms.
    no_platforms = [f for f, d in all_issues.items() if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")
    # 7. Records without usable Wikidata enrichment.
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment', 'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")
    # 8. Records without a URL.
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")
    # Save detailed report beside the data directory.
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    # parents=True makes the save robust when 'reports' ancestors are missing.
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues,
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)
    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")
# Script entry point: run the full scan when executed directly.
if __name__ == '__main__':
    main()