#!/usr/bin/env python3 """ Patch web_enrichment provenance to add missing wasDerivedFrom. Sources for wasDerivedFrom (in priority order): 1. web_archives[0].url (original source) 2. archive_metadata.pages_archived[0].url 3. claims[0].source_url """ import sys from pathlib import Path try: from ruamel.yaml import YAML yaml = YAML() yaml.preserve_quotes = True yaml.width = 4096 except ImportError: print("Error: ruamel.yaml required. Install with: pip install ruamel.yaml") sys.exit(1) CUSTODIAN_DIR = Path("data/custodian") def get_web_source_url(web_enrichment: dict) -> str | None: """Extract source URL from web_enrichment section.""" # Priority 1: web_archives[0].url (most authoritative) web_archives = web_enrichment.get('web_archives', []) if web_archives and isinstance(web_archives, list) and len(web_archives) > 0: url = web_archives[0].get('url') if url: return url # Priority 2: archive_metadata.pages_archived[0].url archive_meta = web_enrichment.get('archive_metadata', {}) pages = archive_meta.get('pages_archived', []) if pages and isinstance(pages, list) and len(pages) > 0: url = pages[0].get('url') if url: return url # Priority 3: claims[0].source_url claims = web_enrichment.get('claims', []) if claims and isinstance(claims, list) and len(claims) > 0: url = claims[0].get('source_url') if url: return url # Priority 4: source_url at root level if 'source_url' in web_enrichment: return web_enrichment['source_url'] return None def patch_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> tuple[bool, int]: """Patch a single file to add missing wasDerivedFrom for web_enrichment.""" with open(filepath, 'r', encoding='utf-8') as f: data = yaml.load(f) if not data: return False, 0 modified = False count = 0 if 'web_enrichment' not in data: return False, 0 we = data['web_enrichment'] if '_provenance' not in we: return False, 0 prov = we['_provenance'] # Check if wasDerivedFrom is already present if 'prov' in prov and 'wasDerivedFrom' in prov['prov']: return False, 0 # Get source URL source_url = get_web_source_url(we) if not source_url: if verbose: print(f" [web_enrichment] No source URL found - skipping") return False, 0 # Add wasDerivedFrom if 'prov' not in prov: prov['prov'] = {} prov['prov']['wasDerivedFrom'] = source_url modified = True count = 1 if verbose: print(f" [web_enrichment] Added wasDerivedFrom: {source_url[:60]}...") if modified and not dry_run: with open(filepath, 'w', encoding='utf-8') as f: yaml.dump(data, f) return modified, count def main(): import argparse parser = argparse.ArgumentParser(description="Patch missing wasDerivedFrom in web_enrichment") parser.add_argument('--dry-run', action='store_true', help="Show what would be done") parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output") parser.add_argument('--limit', type=int, help="Limit number of files to process") parser.add_argument('--file', type=str, help="Process single file") args = parser.parse_args() if args.file: files = [Path(args.file)] else: files = sorted(CUSTODIAN_DIR.glob("*.yaml")) if args.limit: files = files[:args.limit] total = len(files) modified_count = 0 added_count = 0 print(f"Patching {total} files for web_enrichment wasDerivedFrom...") print(f" Dry run: {args.dry_run}") for i, filepath in enumerate(files, 1): if i % 1000 == 0 or args.verbose: print(f"[{i}/{total}] {filepath.name}") try: was_modified, count = patch_file(filepath, args.dry_run, args.verbose) if was_modified: modified_count += 1 added_count += count except Exception as e: print(f" ERROR: {filepath.name}: {e}", file=sys.stderr) print("\n" + "=" * 60) print("SUMMARY") print("=" * 60) print(f"Files processed: {total}") print(f"Files modified: {modified_count}") print(f"Fields added: {added_count}") if args.dry_run: print("\n[DRY-RUN] No files were actually modified.") if __name__ == '__main__': main()