glam/scripts/scan_dutch_data_quality.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

445 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive data quality scan for Dutch custodian YAML files.
Identifies issues like wrong GHCID types, missing web claims, Google Maps mismatches, etc.
"""
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from datetime import datetime
# Root directory holding the per-custodian YAML records (machine-specific
# absolute path — adjust when running on another machine).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Issue categories
# NOTE(review): this module-level accumulator appears unused — main() builds
# its own `all_issues` dict; confirm before removing.
issues = defaultdict(list)
def extract_ghcid_type(filename):
    """Return the single-letter type code embedded in a GHCID filename.

    Example: 'NL-ZH-ZOE-A-SAZS' -> 'A'. Returns None when the name does not
    start with the NL-<prov>-<municipality>-<type>- pattern.
    """
    m = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    if m is None:
        return None
    return m.group(1)
def get_expected_type(data):
    """Determine the expected GHCID type code from a record's original_entry.

    Checks original_entry.type first (list or scalar), then falls back to
    mapping original_entry.type_organisatie (Dutch/English organisation
    labels) onto the single-letter codes. Returns None when no type can be
    derived.
    """
    oe = data.get('original_entry')
    if not isinstance(oe, dict):
        # Robustness: also covers a missing or malformed original_entry.
        return None
    types = oe.get('type')
    if types:
        if isinstance(types, list):
            return types[0]
        # Fix: a scalar (string) type value was previously ignored and the
        # lookup fell through to type_organisatie; accept it directly.
        return types
    type_org = oe.get('type_organisatie')
    if type_org:
        type_map = {
            'archive': 'A', 'archief': 'A',
            'library': 'L', 'bibliotheek': 'L',
            'museum': 'M',
            'gallery': 'G', 'galerie': 'G',
        }
        return type_map.get(type_org.lower(), None)
    return None
def check_google_maps_mismatch(data, filename):
    """Flag records whose Google Maps name barely resembles the org name.

    Compares the word sets of both names (minus common Dutch stopwords);
    when the overlap ratio falls below 0.3 a dict with both names and the
    rounded score is returned, otherwise None.
    """
    if 'google_maps_enrichment' not in data:
        return None
    enrichment = data['google_maps_enrichment']
    maps_name = enrichment.get('name', '')
    # Organisation name: original_entry.organisatie, overridden by a
    # structured custodian_name claim when present.
    org_name = ''
    if 'original_entry' in data:
        org_name = data['original_entry'].get('organisatie', '')
    if 'custodian_name' in data:
        custodian = data['custodian_name']
        if isinstance(custodian, dict):
            org_name = custodian.get('claim_value', org_name)
    if not maps_name or not org_name:
        return None
    # Strip stopwords so filler words don't inflate similarity.
    stopwords = {'de', 'het', 'van', 'en', 'in', 'te', 'der', 'voor', 'stichting', 'vereniging'}
    maps_words = set(maps_name.lower().split()) - stopwords
    org_words = set(org_name.lower().split()) - stopwords
    if not maps_words or not org_words:
        return None
    score = len(maps_words & org_words) / max(len(maps_words), len(org_words))
    if score >= 0.3:
        return None
    return {
        'google_name': maps_name,
        'org_name': org_name,
        'similarity': round(score, 2),
    }
def check_absolute_paths(data, filename):
    """Return machine-specific path prefixes found anywhere in the record.

    The record is serialized back to YAML and scanned for known absolute
    path prefixes (removable volumes, home directories, Windows drives).
    Returns the list of offending prefixes (trailing separators stripped),
    or None when the record is clean.
    """
    serialized = yaml.dump(data, default_flow_style=False)
    prefixes = (
        r'/Volumes/KINGSTON/',
        r'/Users/kempersc/',
        r'/mnt/',
        r'C:\\',
        r'D:\\',
    )
    found = [p.rstrip('/\\') for p in prefixes if re.search(p, serialized)]
    return found or None
def check_web_claims(data, filename):
    """Report quality problems with a record's web_claims section.

    Returns a list of issue tags ('no_web_claims', 'empty_claims',
    'no_verified_claims', 'claims_missing_xpath:<n>'), or None when the
    section looks healthy.
    """
    if 'web_claims' not in data:
        return ['no_web_claims']
    web_claims = data['web_claims']
    problems = []
    if not web_claims.get('claims', []):
        problems.append('empty_claims')
    if 'verified_claims' not in web_claims:
        problems.append('no_verified_claims')
    else:
        verified = web_claims['verified_claims']
        if isinstance(verified, dict):
            # Every verified claim should carry XPath provenance.
            missing_xpath = sum(
                1 for claim in verified.get('claims', [])
                if isinstance(claim, dict) and 'xpath' not in claim
            )
            if missing_xpath:
                problems.append(f'claims_missing_xpath:{missing_xpath}')
    return problems or None
def check_coordinates(data, filename):
    """Report problems with a record's location coordinates.

    Returns a list of issue tags ('no_location', 'missing_coordinates',
    'non_numeric_coordinates', 'coordinates_outside_netherlands',
    'has_coordinate_correction'), or None when nothing is wrong.
    """
    if 'location' not in data:
        return ['no_location']
    issues_found = []
    loc = data['location']
    lat = loc.get('latitude')
    lon = loc.get('longitude')
    if lat is None or lon is None:
        issues_found.append('missing_coordinates')
    elif not (isinstance(lat, (int, float)) and isinstance(lon, (int, float))):
        # Fix: YAML may deliver coordinates as strings; the original code
        # raised TypeError when comparing them against floats below.
        issues_found.append('non_numeric_coordinates')
    elif not (50.5 < lat < 53.7 and 3.3 < lon < 7.3):
        # Rough Netherlands bounding box.
        issues_found.append('coordinates_outside_netherlands')
    # A recorded previous_coordinates entry means the coordinates were
    # corrected at some point — worth surfacing for review.
    if 'coordinate_provenance' in loc:
        if 'previous_coordinates' in loc['coordinate_provenance']:
            issues_found.append('has_coordinate_correction')
    return issues_found or None
def check_digital_platforms(data, filename):
    """Return ['no_digital_platforms'] when the record lists none, else None.

    Note: the original code also had an 'empty_digital_platforms' branch,
    but it was unreachable — an empty list is falsy and was already caught
    by the first check — so it has been removed.
    """
    if not data.get('digital_platforms'):
        return ['no_digital_platforms']
    return None
def check_identifiers(data, filename):
    """Check that the record carries both an ISIL and a GHCID identifier.

    Returns a list of issue tags ('no_identifiers', 'no_isil', 'no_ghcid'),
    or None when both identifier schemes are present.
    """
    if 'identifiers' not in data:
        return ['no_identifiers']
    # Collect the schemes present; non-dict entries are ignored.
    schemes = {
        entry.get('identifier_scheme')
        for entry in data['identifiers']
        if isinstance(entry, dict)
    }
    missing = []
    if 'ISIL' not in schemes:
        missing.append('no_isil')
    if 'GHCID' not in schemes:
        missing.append('no_ghcid')
    return missing or None
def check_wikidata(data, filename):
    """Classify the record's Wikidata enrichment status.

    Returns 'no_wikidata_enrichment', 'wikidata_not_found',
    'wikidata_status:<status>' for any other status, or None when the
    enrichment succeeded (SUCCESS / ENRICHED).
    """
    if 'wikidata_enrichment' not in data:
        return 'no_wikidata_enrichment'
    status = data['wikidata_enrichment'].get('status', '')
    if status in ('SUCCESS', 'ENRICHED'):
        return None
    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    return f'wikidata_status:{status}'
def check_url(data, filename):
    """Report URL problems: absent URL, plain-HTTP URL, or a past correction.

    Returns a list of issue tags ('no_url', 'http_not_https',
    'has_url_correction'), or None when the URL looks fine.
    """
    problems = []
    url = data.get('url', '')
    if not url:
        problems.append('no_url')
    elif url.startswith('http://'):
        problems.append('http_not_https')
    # A url_correction entry indicates a previously wrong URL.
    if 'url_correction' in data:
        problems.append('has_url_correction')
    return problems or None
def scan_file(filepath):
    """Run every quality check against a single YAML file.

    Returns a dict keyed by issue category; an empty dict means the file is
    clean. YAML parse failures and empty files are reported as issues
    themselves ('parse_error', 'empty_file').
    """
    filename = filepath.name
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            data = yaml.safe_load(fh)
    except Exception as exc:
        return {'parse_error': str(exc)}
    if not data:
        return {'empty_file': True}
    found = {}
    # GHCID type checks. NOTE(review): only files currently typed 'U'
    # (unknown) are flagged for retyping; mismatches between two concrete
    # types (e.g. 'A' vs 'M') are not reported — confirm this is intended.
    ghcid_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)
    if ghcid_type and expected_type and ghcid_type != expected_type:
        if ghcid_type == 'U' and expected_type != 'U':
            found['wrong_ghcid_type'] = {
                'current': ghcid_type,
                'expected': expected_type,
            }
    # Every 'U'-typed file is additionally flagged as unknown.
    if ghcid_type == 'U':
        found['unknown_type'] = True
    # The remaining checks share the (data, filename) signature; map each
    # checker to the issue key its result is recorded under.
    checkers = (
        ('google_maps_mismatch', check_google_maps_mismatch),
        ('absolute_paths', check_absolute_paths),
        ('web_claims_issues', check_web_claims),
        ('coordinate_issues', check_coordinates),
        ('digital_platform_issues', check_digital_platforms),
        ('identifier_issues', check_identifiers),
        ('wikidata_issue', check_wikidata),
        ('url_issues', check_url),
    )
    for key, checker in checkers:
        result = checker(data, filename)
        if result:
            found[key] = result
    return found
def main():
    """Scan every Dutch custodian YAML file and report data quality issues.

    Prints a progress log, a summary, a per-category breakdown, details for
    the critical issue categories, and enrichment gaps; the full result set
    is saved as YAML under <data>/reports/dutch_data_quality_scan.yaml.
    """
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)
    # Per-file issue dicts keyed by filename, plus per-category counters.
    all_issues = {}
    issue_counts = defaultdict(int)
    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)
    print(f"Found {total_files} Dutch custodian files\n")
    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)
        file_issues = scan_file(filepath)
        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues:
                issue_counts[issue_type] += 1
    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)
    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")
    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)
    # Most frequent categories first.
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])
    for issue_type, count in sorted_issues:
        # Fix: guard against an empty scan directory (ZeroDivisionError).
        pct = (count / total_files) * 100 if total_files else 0.0
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")
    # Detailed breakdown for critical issues.
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)
    # 1. Wrong GHCID type
    # Fix: these listings previously printed a literal "(unknown)" instead
    # of the affected filename, making the report useless for follow-up.
    wrong_type_files = [(f, d) for f, d in all_issues.items() if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, details in wrong_type_files[:20]:
            info = details['wrong_ghcid_type']
            print(f"  {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f"  ... and {len(wrong_type_files) - 20} more")
    else:
        print("  None found")
    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items() if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, details in gm_mismatch_files[:20]:
            info = details['google_maps_mismatch']
            print(f"  {filename}")
            print(f"    Google: {info['google_name']}")
            print(f"    Org:    {info['org_name']}")
            print(f"    Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f"  ... and {len(gm_mismatch_files) - 20} more")
    else:
        print("  None found")
    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items() if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, details in abs_path_files[:10]:
            print(f"  {filename}: {details['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f"  ... and {len(abs_path_files) - 10} more")
    else:
        print("  None found")
    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            print(f"  {filename}")
        if len(unknown_type_files) > 30:
            print(f"  ... and {len(unknown_type_files) - 30} more")
    else:
        print("  None found")
    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)
    # 5. Web claims lacking a verified_claims section.
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")
    # 6. Records without any digital platforms.
    no_platforms = [f for f, d in all_issues.items() if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")
    # 7. Records without usable Wikidata enrichment.
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment', 'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")
    # 8. Records without a URL.
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")
    # Save detailed report beside the data directory.
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    # parents=True makes the save robust when 'reports' ancestors are missing.
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues,
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)
    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")
# Script entry point: run the full scan when executed directly.
if __name__ == '__main__':
    main()