- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
445 lines
14 KiB
Python
445 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive data quality scan for Dutch custodian YAML files.
|
|
Identifies issues like wrong GHCID types, missing web claims, Google Maps mismatches, etc.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
|
|
# Root directory holding the per-custodian YAML records (NL-*.yaml files).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Issue categories
# NOTE(review): this module-level accumulator appears unused — main() collects
# results in its local `all_issues` dict instead; confirm before removing.
issues = defaultdict(list)
|
|
|
|
def extract_ghcid_type(filename):
    """Pull the single-letter type code out of a GHCID-style filename.

    Example: ``NL-ZH-ZOE-A-SAZS`` yields ``'A'``. Returns None when the
    name does not follow the ``NL-<prov>-<muni>-<letter>-`` pattern.
    """
    m = re.match(r'NL-[A-Z]{2}-[A-Z]{3}-([A-Z])-', filename)
    if m is None:
        return None
    return m.group(1)
|
|
|
|
def get_expected_type(data):
    """Derive the expected GHCID type letter from a record's original_entry.

    Prefers the first element of ``original_entry.type`` when it is a
    non-empty list; otherwise maps ``original_entry.type_organisatie``
    (English or Dutch label) onto a letter. Returns None when neither
    field yields an answer.
    """
    if 'original_entry' not in data:
        return None
    oe = data['original_entry']

    # Explicit type list wins when present and non-empty.
    if 'type' in oe and oe['type']:
        declared = oe['type']
        if isinstance(declared, list) and declared:
            return declared[0]

    # Fall back to the organisation-type label.
    if 'type_organisatie' in oe:
        label = oe['type_organisatie']
        if label:
            letter_for = {
                'archive': 'A', 'archief': 'A',
                'library': 'L', 'bibliotheek': 'L',
                'museum': 'M',
                'gallery': 'G', 'galerie': 'G',
            }
            return letter_for.get(label.lower())
    return None
|
|
|
|
def check_google_maps_mismatch(data, filename):
    """Flag records whose Google Maps listing name looks unrelated to the
    organisation name.

    The two names are compared as bags of lower-cased words, with common
    Dutch filler words removed; a mismatch is reported when fewer than
    30% of the words overlap. Returns a dict with both names and the
    rounded similarity score, or None when there is nothing to flag.
    """
    if 'google_maps_enrichment' not in data:
        return None

    maps_name = data['google_maps_enrichment'].get('name', '')

    # Organisation name: original_entry.organisatie, overridden by a
    # custodian_name claim value when one is present.
    official_name = ''
    if 'original_entry' in data:
        official_name = data['original_entry'].get('organisatie', '')
    if 'custodian_name' in data:
        claim = data['custodian_name']
        if isinstance(claim, dict):
            official_name = claim.get('claim_value', official_name)

    if not maps_name or not official_name:
        return None

    # Bag-of-words similarity, ignoring common Dutch stopwords.
    noise = {'de', 'het', 'van', 'en', 'in', 'te', 'der', 'voor', 'stichting', 'vereniging'}
    maps_words = set(maps_name.lower().split()) - noise
    official_words = set(official_name.lower().split()) - noise

    if not maps_words or not official_words:
        return None

    shared = maps_words & official_words
    score = len(shared) / max(len(maps_words), len(official_words))

    if score >= 0.3:
        return None
    return {
        'google_name': maps_name,
        'org_name': official_name,
        'similarity': round(score, 2),
    }
|
|
|
|
def check_absolute_paths(data, filename):
    """Report machine-specific absolute path prefixes embedded anywhere in
    the record.

    The record is serialised back to YAML and searched for known local
    mount-point patterns. Returns the list of offending prefixes (with
    trailing separators stripped), or None when nothing matched.
    """
    serialized = yaml.dump(data, default_flow_style=False)

    prefixes = (
        r'/Volumes/KINGSTON/',
        r'/Users/kempersc/',
        r'/mnt/',
        r'C:\\',
        r'D:\\',
    )

    hits = [p.rstrip('/\\') for p in prefixes if re.search(p, serialized)]
    return hits or None
|
|
|
|
def check_web_claims(data, filename):
    """Audit the web_claims section of a record.

    Returns a list of issue codes, or None when everything looks fine:
    - 'no_web_claims'            : section missing entirely
    - 'empty_claims'             : claims list absent or empty
    - 'no_verified_claims'       : no verified_claims sub-section
    - 'claims_missing_xpath:<n>' : n verified claims lack XPath provenance
    """
    if 'web_claims' not in data:
        return ['no_web_claims']

    section = data['web_claims']
    problems = []

    if not section.get('claims', []):
        problems.append('empty_claims')

    if 'verified_claims' not in section:
        problems.append('no_verified_claims')
    else:
        verified = section['verified_claims']
        if isinstance(verified, dict):
            # Count verified claims lacking an XPath provenance pointer.
            missing = sum(
                1 for claim in verified.get('claims', [])
                if isinstance(claim, dict) and 'xpath' not in claim
            )
            if missing:
                problems.append(f'claims_missing_xpath:{missing}')

    return problems or None
|
|
|
|
def check_coordinates(data, filename):
    """Validate the record's location coordinates.

    Flags a missing location block, missing lat/lon values, coordinates
    outside a rough Netherlands bounding box, and the presence of an
    earlier coordinate correction. Returns a list of codes or None.
    """
    if 'location' not in data:
        return ['no_location']

    problems = []
    location = data['location']
    latitude = location.get('latitude')
    longitude = location.get('longitude')

    if latitude is None or longitude is None:
        problems.append('missing_coordinates')
    else:
        # Rough Netherlands bounding box.
        inside_nl = (50.5 < latitude < 53.7) and (3.3 < longitude < 7.3)
        if not inside_nl:
            problems.append('coordinates_outside_netherlands')

    # A previous_coordinates entry means the coordinates were corrected once.
    if 'coordinate_provenance' in location:
        if 'previous_coordinates' in location['coordinate_provenance']:
            problems.append('has_coordinate_correction')

    return problems or None
|
|
|
|
def check_digital_platforms(data, filename):
    """Check that the record lists at least one digital platform.

    Returns ``['no_digital_platforms']`` when the ``digital_platforms``
    key is absent or holds an empty/falsy value, otherwise None.

    The previous implementation had an unreachable second branch that
    returned 'empty_digital_platforms': an empty list is already falsy
    and therefore caught by the first test, so the dead code is removed.
    """
    if not data.get('digital_platforms'):
        return ['no_digital_platforms']
    return None
|
|
|
|
def check_identifiers(data, filename):
    """Verify that the record carries both an ISIL and a GHCID identifier.

    Returns a list of issue codes ('no_identifiers', 'no_isil',
    'no_ghcid') or None when both schemes are present.
    """
    if 'identifiers' not in data:
        return ['no_identifiers']

    schemes = {
        entry.get('identifier_scheme')
        for entry in data['identifiers']
        if isinstance(entry, dict)
    }

    problems = []
    if 'ISIL' not in schemes:
        problems.append('no_isil')
    if 'GHCID' not in schemes:
        problems.append('no_ghcid')
    return problems or None
|
|
|
|
def check_wikidata(data, filename):
    """Summarise the record's Wikidata enrichment state.

    Returns None when enrichment succeeded ('SUCCESS'/'ENRICHED'),
    'no_wikidata_enrichment' or 'wikidata_not_found' for the common
    gaps, and 'wikidata_status:<status>' for anything unexpected.
    """
    if 'wikidata_enrichment' not in data:
        return 'no_wikidata_enrichment'

    status = data['wikidata_enrichment'].get('status', '')

    if status in ('SUCCESS', 'ENRICHED'):
        return None
    if status == 'NOT_FOUND':
        return 'wikidata_not_found'
    return f'wikidata_status:{status}'
|
|
|
|
def check_url(data, filename):
    """Check the record's homepage URL.

    Flags a missing URL, a plain-http URL, and the presence of an
    earlier URL correction. Returns a list of issue codes or None.
    """
    problems = []

    homepage = data.get('url', '')
    if not homepage:
        problems.append('no_url')
    elif homepage.startswith('http://'):
        problems.append('http_not_https')

    # A url_correction entry indicates the URL was wrong at some point.
    if 'url_correction' in data:
        problems.append('has_url_correction')

    return problems or None
|
|
|
|
def scan_file(filepath):
    """Run every data-quality check against one custodian YAML file.

    Returns a dict mapping issue category -> details; an empty dict means
    the file is clean. Unreadable files yield {'parse_error': <msg>} and
    files that parse to a falsy document yield {'empty_file': True}.
    """
    filename = filepath.name

    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            data = yaml.safe_load(handle)
    except Exception as exc:
        return {'parse_error': str(exc)}

    if not data:
        return {'empty_file': True}

    findings = {}

    # 1. GHCID type letter in the filename vs. the type derived from the data.
    #    Only a 'U' (unknown) letter with a known expected type is reported.
    actual_type = extract_ghcid_type(filename)
    expected_type = get_expected_type(data)
    if actual_type and expected_type and actual_type != expected_type:
        if actual_type == 'U' and expected_type != 'U':
            findings['wrong_ghcid_type'] = {
                'current': actual_type,
                'expected': expected_type,
            }
    # A 'U' type is always worth surfacing, mismatch or not.
    if actual_type == 'U':
        findings['unknown_type'] = True

    # 2-9. The remaining checks all share the same shape: run the checker,
    # record its result under the category key when it reports anything.
    checks = (
        ('google_maps_mismatch', check_google_maps_mismatch),
        ('absolute_paths', check_absolute_paths),
        ('web_claims_issues', check_web_claims),
        ('coordinate_issues', check_coordinates),
        ('digital_platform_issues', check_digital_platforms),
        ('identifier_issues', check_identifiers),
        ('wikidata_issue', check_wikidata),
        ('url_issues', check_url),
    )
    for category, checker in checks:
        result = checker(data, filename)
        if result:
            findings[category] = result

    return findings
|
|
|
|
def main():
    """Scan every Dutch custodian file, print a console report, and write a
    detailed YAML report next to the data directory.

    Fixes over the previous version: the per-file report lines printed the
    literal string "(unknown)" instead of the filename (the loop variable
    was bound but never used), and the reports directory is now created
    with parents=True so a missing parent does not abort the save.
    """
    print(f"Scanning Dutch custodian files in {CUSTODIAN_DIR}")
    print(f"Scan started: {datetime.now().isoformat()}")
    print("=" * 80)

    # Collect all issues: filename -> {issue_type: details}, plus a tally
    # of how many files exhibit each issue type.
    all_issues = {}
    issue_counts = defaultdict(int)

    files = sorted(CUSTODIAN_DIR.glob("NL-*.yaml"))
    total_files = len(files)

    print(f"Found {total_files} Dutch custodian files\n")

    for i, filepath in enumerate(files):
        if (i + 1) % 200 == 0:
            print(f"Progress: {i+1}/{total_files} files scanned...", flush=True)

        file_issues = scan_file(filepath)

        if file_issues:
            all_issues[filepath.name] = file_issues
            for issue_type in file_issues.keys():
                issue_counts[issue_type] += 1

    print(f"\nScan complete: {total_files} files analyzed")
    print("=" * 80)

    # Summary report
    print("\n" + "=" * 80)
    print("SUMMARY REPORT: Data Quality Issues")
    print("=" * 80)

    print(f"\nTotal files scanned: {total_files}")
    print(f"Files with issues: {len(all_issues)}")
    print(f"Files without issues: {total_files - len(all_issues)}")

    print("\n" + "-" * 80)
    print("ISSUE BREAKDOWN BY TYPE")
    print("-" * 80)

    # Sort issue types by descending file count.
    sorted_issues = sorted(issue_counts.items(), key=lambda x: -x[1])

    for issue_type, count in sorted_issues:
        pct = (count / total_files) * 100
        print(f"{issue_type:40} {count:5} files ({pct:5.1f}%)")

    # Detailed breakdown for critical issues
    print("\n" + "=" * 80)
    print("CRITICAL ISSUES - REQUIRE IMMEDIATE ATTENTION")
    print("=" * 80)

    # 1. Wrong GHCID type
    wrong_type_files = [(f, d) for f, d in all_issues.items() if 'wrong_ghcid_type' in d]
    print(f"\n1. WRONG GHCID TYPE ({len(wrong_type_files)} files)")
    print("-" * 40)
    if wrong_type_files:
        for filename, data in wrong_type_files[:20]:
            info = data['wrong_ghcid_type']
            # BUG FIX: previously printed the literal "(unknown)" here.
            print(f"  {filename}: {info['current']} -> should be {info['expected']}")
        if len(wrong_type_files) > 20:
            print(f"  ... and {len(wrong_type_files) - 20} more")
    else:
        print("  None found")

    # 2. Google Maps mismatches
    gm_mismatch_files = [(f, d) for f, d in all_issues.items() if 'google_maps_mismatch' in d]
    print(f"\n2. GOOGLE MAPS MISMATCHES ({len(gm_mismatch_files)} files)")
    print("-" * 40)
    if gm_mismatch_files:
        for filename, data in gm_mismatch_files[:20]:
            info = data['google_maps_mismatch']
            # BUG FIX: previously printed the literal "(unknown)" here.
            print(f"  {filename}")
            print(f"    Google: {info['google_name']}")
            print(f"    Org:    {info['org_name']}")
            print(f"    Similarity: {info['similarity']}")
        if len(gm_mismatch_files) > 20:
            print(f"  ... and {len(gm_mismatch_files) - 20} more")
    else:
        print("  None found")

    # 3. Absolute paths
    abs_path_files = [(f, d) for f, d in all_issues.items() if 'absolute_paths' in d]
    print(f"\n3. ABSOLUTE PATHS ({len(abs_path_files)} files)")
    print("-" * 40)
    if abs_path_files:
        for filename, data in abs_path_files[:10]:
            # BUG FIX: previously printed the literal "(unknown)" here.
            print(f"  {filename}: {data['absolute_paths']}")
        if len(abs_path_files) > 10:
            print(f"  ... and {len(abs_path_files) - 10} more")
    else:
        print("  None found")

    # 4. Unknown type (U)
    unknown_type_files = [f for f, d in all_issues.items() if 'unknown_type' in d]
    print(f"\n4. UNKNOWN TYPE CODE 'U' ({len(unknown_type_files)} files)")
    print("-" * 40)
    if unknown_type_files:
        for filename in unknown_type_files[:30]:
            # BUG FIX: previously printed the literal "(unknown)" here.
            print(f"  {filename}")
        if len(unknown_type_files) > 30:
            print(f"  ... and {len(unknown_type_files) - 30} more")
    else:
        print("  None found")

    print("\n" + "=" * 80)
    print("ENRICHMENT GAPS")
    print("=" * 80)

    # Web claims issues
    no_verified_claims = [f for f, d in all_issues.items()
                          if 'web_claims_issues' in d and 'no_verified_claims' in d['web_claims_issues']]
    print(f"\n5. NO VERIFIED WEB CLAIMS ({len(no_verified_claims)} files)")

    # Digital platforms
    no_platforms = [f for f, d in all_issues.items()
                    if 'digital_platform_issues' in d]
    print(f"6. NO DIGITAL PLATFORMS ({len(no_platforms)} files)")

    # Wikidata
    no_wikidata = [f for f, d in all_issues.items()
                   if d.get('wikidata_issue') in ['no_wikidata_enrichment', 'wikidata_not_found']]
    print(f"7. NO WIKIDATA ENRICHMENT ({len(no_wikidata)} files)")

    # URLs
    no_url = [f for f, d in all_issues.items()
              if 'url_issues' in d and 'no_url' in d['url_issues']]
    print(f"8. NO URL ({len(no_url)} files)")

    # Save detailed report next to the data dir: <data>/reports/...
    report_file = CUSTODIAN_DIR.parent / 'reports' / 'dutch_data_quality_scan.yaml'
    # parents=True so a missing intermediate directory does not raise.
    report_file.parent.mkdir(parents=True, exist_ok=True)

    report = {
        'scan_timestamp': datetime.now().isoformat(),
        'total_files': total_files,
        'files_with_issues': len(all_issues),
        'issue_counts': dict(sorted_issues),
        'detailed_issues': all_issues
    }

    with open(report_file, 'w', encoding='utf-8') as f:
        yaml.dump(report, f, default_flow_style=False, allow_unicode=True)

    print(f"\n\nDetailed report saved to: {report_file}")
    print(f"Scan completed: {datetime.now().isoformat()}")
|
|
|
|
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|