- batch_crawl4ai_recrawl.py: Retry failed URL crawls - batch_firecrawl_recrawl.py: FireCrawl batch processing - batch_httpx_scrape.py: HTTPX-based scraping - detect_name_mismatch.py: Find name mismatches in data - enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment - fix_collision_victims.py: GHCID collision resolution - fix_generic_platform_names*.py: Platform name cleanup - fix_ghcid_type.py: GHCID type corrections - fix_simon_kemper_contamination.py: Data cleanup - scan_dutch_data_quality.py: Data quality scanning - transform_crawl4ai_to_digital_platform.py: Data transformation
140 lines
5 KiB
Python
Executable file
140 lines
5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
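Fix generic platform names ('Home Website', 'Homepage Website', 'Welkom Website')
by replacing them with "<organisation name> Website". The organisation name is
taken from the organisatie field of original_entry, falling back to museum_name
in museum_register_enrichment and then wikidata_label_nl in wikidata_enrichment.

Also filters invalid platform types (ONLINEMARKETING, ONLINEBRANDING,
ONLINEWEBSITE, ONLINE).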
|
|
"""
|
|
|
|
import yaml
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Custom YAML representer to preserve formatting
|
|
def str_representer(dumper, data):
    """Represent a string as a YAML scalar, using block style for multi-line text.

    Args:
        dumper: Object exposing ``represent_scalar(tag, value, style=...)``.
        data: The string to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    str_tag = 'tag:yaml.org,2002:str'

    # Single-line strings keep the dumper's default scalar style.
    if '\n' not in data:
        return dumper.represent_scalar(str_tag, data)

    # Strings containing a newline are emitted as literal block scalars ('|').
    return dumper.represent_scalar(str_tag, data, style='|')
|
|
|
|
# Register str_representer for str so that yaml.dump (with its default dumper)
# writes strings containing newlines as literal block scalars.
yaml.add_representer(str, str_representer)
|
|
|
|
# Platform names treated as generic placeholders; fix_file replaces a
# platform_name found in this set with an organisation-derived name.
GENERIC_NAMES = {
    'Home Website',
    'Homepage Website',
    'Welkom Website',
}
|
|
# Platform type values that fix_file removes from platform_type lists.
INVALID_TYPES = {
    'ONLINEMARKETING',
    'ONLINEBRANDING',
    'ONLINEWEBSITE',
    'ONLINE',
}
|
|
|
|
def _find_org_name(data: dict) -> tuple:
    """Return ``(name, source_label)`` for the first non-empty organisation name.

    Sources are tried in priority order: ``original_entry.organisatie``,
    ``museum_register_enrichment.museum_name``, then
    ``wikidata_enrichment.wikidata_label_nl``. Returns ``(None, None)`` when
    none of them holds a usable value.
    """
    candidates = (
        ('original_entry', 'organisatie'),
        ('museum_register_enrichment', 'museum_name'),
        ('wikidata_enrichment', 'wikidata_label_nl'),
    )
    for section_key, field_key in candidates:
        section = data.get(section_key)
        # A section may be absent or null in the YAML; skip it instead of crashing.
        if not isinstance(section, dict):
            continue
        value = section.get(field_key)
        if isinstance(value, str):
            value = value.strip()
        # An empty/None value must not block the lower-priority sources.
        if value:
            return value, f'{field_key}_field'
    return None, None


def fix_file(filepath: Path, dry_run: bool = False) -> dict:
    """Fix the ``digital_platform_v2`` section of a single YAML file.

    Two fixes are applied:

    1. If ``platform_name`` is one of ``GENERIC_NAMES`` it is replaced with
       ``"<organisation name> Website"``, using the first non-empty name found
       by ``_find_org_name``.
    2. If ``platform_type`` is a list, entries in ``INVALID_TYPES`` are
       removed; when nothing is left the list becomes
       ``['INSTITUTIONAL_WEBSITE']``.

    When anything changed, ``_transformation_metadata`` inside
    ``digital_platform_v2`` receives a UTC ``quality_fix_date`` and, for name
    fixes, a ``name_source`` naming the field the name was taken from.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true, compute and report the changes but do not write
            the file.

    Returns:
        A stats dict with keys ``name_fixed``, ``types_fixed``, ``old_name``,
        ``new_name`` and ``removed_types``.

    Raises:
        OSError: If the file cannot be read or written.
        yaml.YAMLError: If the file is not valid YAML or cannot be serialized.
    """
    stats = {
        'name_fixed': False,
        'types_fixed': False,
        'old_name': None,
        'new_name': None,
        'removed_types': [],
    }

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Empty documents and non-mapping roots have nothing to fix.
    if not isinstance(data, dict):
        return stats

    # The section may be missing or null; either way there is nothing to do.
    dpv2 = data.get('digital_platform_v2')
    if not isinstance(dpv2, dict):
        return stats

    name_source = None

    # Fix 1: Generic platform names
    current_name = dpv2.get('platform_name', '')
    # The isinstance guard keeps unhashable values out of the set lookup.
    if isinstance(current_name, str) and current_name in GENERIC_NAMES:
        org_name, name_source = _find_org_name(data)
        if org_name:
            new_name = f"{org_name} Website"
            stats['old_name'] = current_name
            stats['new_name'] = new_name
            stats['name_fixed'] = True
            dpv2['platform_name'] = new_name

    # Fix 2: Invalid platform types
    platform_types = dpv2.get('platform_type')
    if isinstance(platform_types, list):
        kept_types = []
        removed_types = []
        for platform_type in platform_types:
            if isinstance(platform_type, str) and platform_type in INVALID_TYPES:
                removed_types.append(platform_type)
            else:
                kept_types.append(platform_type)

        if removed_types:
            stats['removed_types'] = removed_types
            stats['types_fixed'] = True
            # Never leave the list empty; fall back to the generic website type.
            dpv2['platform_type'] = kept_types or ['INSTITUTIONAL_WEBSITE']

    if not (stats['name_fixed'] or stats['types_fixed']):
        return stats

    # Add fix metadata
    metadata = dpv2.get('_transformation_metadata')
    if metadata is None:
        metadata = {}
        dpv2['_transformation_metadata'] = metadata
    metadata['quality_fix_date'] = datetime.now(timezone.utc).isoformat()
    if stats['name_fixed']:
        # Record the field the name actually came from, not always organisatie.
        metadata['name_source'] = name_source

    if not dry_run:
        # Serialize completely before opening the file for writing, so that a
        # serialization error cannot leave a truncated file behind.
        serialized = yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: fix every ``NL-*.yaml`` file under ``--path``.

    Options:
        --dry-run: report what would change without modifying any file.
        --path: directory containing the custodian YAML files.

    Each changed file is listed with the fixes applied, followed by a summary.
    A file that cannot be read or parsed is reported on stderr and skipped so
    the remaining files are still processed; the process then exits with
    status 1. An invalid ``--path`` is rejected through argparse (status 2).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix generic platform names')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be changed without modifying files')
    parser.add_argument('--path', default='/Users/kempersc/apps/glam/data/custodian', help='Path to custodian files')
    args = parser.parse_args()

    custodian_path = Path(args.path)
    # Globbing a missing directory silently yields nothing; fail loudly instead.
    if not custodian_path.is_dir():
        parser.error(f"custodian path is not a directory: {custodian_path}")

    prefix = '[DRY RUN] ' if args.dry_run else ''

    files_fixed_names = 0
    files_fixed_types = 0
    files_modified = 0
    files_failed = 0

    print(f"{prefix}Scanning {custodian_path}...")
    print()

    for filepath in sorted(custodian_path.glob('NL-*.yaml')):
        try:
            stats = fix_file(filepath, dry_run=args.dry_run)
        except (OSError, yaml.YAMLError) as exc:
            # One bad file must not abort the whole batch.
            files_failed += 1
            print(f"✗ {filepath.name}: {exc}", file=sys.stderr)
            continue

        if not (stats['name_fixed'] or stats['types_fixed']):
            continue

        files_modified += 1
        # Always print the file header so type-only fixes are attributable.
        print(f"✓ {filepath.name}")

        if stats['name_fixed']:
            files_fixed_names += 1
            print(f" Name: '{stats['old_name']}' → '{stats['new_name']}'")

        if stats['types_fixed']:
            files_fixed_types += 1
            print(f" Removed types: {stats['removed_types']}")

    print()
    print("=" * 60)
    print(f"{prefix}Summary:")
    print(f" Files with name fixed: {files_fixed_names}")
    print(f" Files with types fixed: {files_fixed_types}")
    print(f" Total files modified: {files_modified}")
    if files_failed:
        print(f" Files failed: {files_failed}")

    if args.dry_run:
        print()
        print("Run without --dry-run to apply changes.")

    if files_failed:
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|