156 lines
4.5 KiB
Python
Executable file
156 lines
4.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Patch web_enrichment provenance to add missing wasDerivedFrom.
|
|
|
|
Sources for wasDerivedFrom (in priority order):
|
|
1. web_archives[0].url (original source)
|
|
2. archive_metadata.pages_archived[0].url
|
|
3. claims[0].source_url
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# ruamel.yaml is a hard requirement: the same YAML object is used to load a
# file, patch it in memory and dump it back.
try:
    # Keep the try body to the one statement that can raise ImportError.
    from ruamel.yaml import YAML
except ImportError:
    # Diagnostics go to stderr so they are not mixed into normal output.
    print("Error: ruamel.yaml required. Install with: pip install ruamel.yaml", file=sys.stderr)
    sys.exit(1)

# Shared loader/dumper used by patch_file().
yaml = YAML()
# Keep the quoting style of existing scalars when a file is written back.
yaml.preserve_quotes = True
# Very large output width -- presumably to stop long values (e.g. URLs) from
# being re-wrapped on dump; confirm against the ruamel.yaml documentation.
yaml.width = 4096

# Directory scanned for ``*.yaml`` files when no single file is requested.
CUSTODIAN_DIR = Path("data/custodian")
|
|
|
|
|
|
def get_web_source_url(web_enrichment: dict) -> str | None:
|
|
"""Extract source URL from web_enrichment section."""
|
|
|
|
# Priority 1: web_archives[0].url (most authoritative)
|
|
web_archives = web_enrichment.get('web_archives', [])
|
|
if web_archives and isinstance(web_archives, list) and len(web_archives) > 0:
|
|
url = web_archives[0].get('url')
|
|
if url:
|
|
return url
|
|
|
|
# Priority 2: archive_metadata.pages_archived[0].url
|
|
archive_meta = web_enrichment.get('archive_metadata', {})
|
|
pages = archive_meta.get('pages_archived', [])
|
|
if pages and isinstance(pages, list) and len(pages) > 0:
|
|
url = pages[0].get('url')
|
|
if url:
|
|
return url
|
|
|
|
# Priority 3: claims[0].source_url
|
|
claims = web_enrichment.get('claims', [])
|
|
if claims and isinstance(claims, list) and len(claims) > 0:
|
|
url = claims[0].get('source_url')
|
|
if url:
|
|
return url
|
|
|
|
# Priority 4: source_url at root level
|
|
if 'source_url' in web_enrichment:
|
|
return web_enrichment['source_url']
|
|
|
|
return None
|
|
|
|
|
|
def patch_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> tuple[bool, int]:
    """Add a missing ``wasDerivedFrom`` to the ``web_enrichment`` provenance of one file.

    The file is loaded with the module-level ``yaml`` object. Nothing is
    changed when the document is empty, has no ``web_enrichment`` mapping,
    has no ``_provenance`` mapping inside it, already carries
    ``_provenance.prov.wasDerivedFrom``, or when ``get_web_source_url()``
    finds no URL. Null values for any of those sections are treated like
    missing ones rather than raising.

    Args:
        filepath: YAML file to inspect and, unless ``dry_run`` is set, rewrite.
        dry_run: When true, compute the change but leave the file untouched.
        verbose: When true, print a line describing what was added or skipped.

    Returns:
        ``(modified, count)``: ``(True, 1)`` when a ``wasDerivedFrom`` value
        was added (or would be, in a dry run), otherwise ``(False, 0)``.

    Raises:
        TypeError: If ``_provenance.prov`` exists but is neither null nor a
            mapping; it is not overwritten, to avoid discarding data.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # Empty documents and non-mapping top levels have nothing to patch.
    if not isinstance(data, dict):
        return False, 0

    we = data.get('web_enrichment')
    if not isinstance(we, dict):
        return False, 0

    prov = we.get('_provenance')
    if not isinstance(prov, dict):
        return False, 0

    prov_block = prov.get('prov')
    if prov_block is not None and not isinstance(prov_block, dict):
        raise TypeError(
            f"_provenance.prov is {type(prov_block).__name__}, expected a mapping"
        )

    # Already patched (or populated by another tool): leave it alone.
    if prov_block is not None and 'wasDerivedFrom' in prov_block:
        return False, 0

    source_url = get_web_source_url(we)
    if not source_url:
        if verbose:
            print(" [web_enrichment] No source URL found - skipping")
        return False, 0

    # Create the prov mapping only once there is something to store in it,
    # so files without a usable URL stay untouched.
    if prov_block is None:
        prov_block = {}
        prov['prov'] = prov_block
    prov_block['wasDerivedFrom'] = source_url

    if verbose:
        print(f" [web_enrichment] Added wasDerivedFrom: {source_url[:60]}...")

    if not dry_run:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f)

    return True, 1
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Patches either the single file given with ``--file`` or every ``*.yaml``
    file directly under ``CUSTODIAN_DIR`` (sorted, optionally capped with
    ``--limit``), then prints a summary. A failure in one file is reported on
    stderr and counted, and processing continues with the next file.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Patch missing wasDerivedFrom in web_enrichment")
    parser.add_argument('--dry-run', action='store_true', help="Show what would be done")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    parser.add_argument('--limit', type=int, help="Limit number of files to process")
    parser.add_argument('--file', type=str, help="Process single file")
    args = parser.parse_args()

    # A negative value would slice from the end of the list, which is never
    # what "limit" means; reject it up front.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be zero or a positive integer")

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        # Compare against None so an explicit ``--limit 0`` is honoured
        # instead of being mistaken for "no limit".
        if args.limit is not None:
            files = files[:args.limit]

    total = len(files)
    modified_count = 0
    added_count = 0
    error_count = 0

    print(f"Patching {total} files for web_enrichment wasDerivedFrom...")
    print(f" Dry run: {args.dry_run}")

    for i, filepath in enumerate(files, 1):
        # Progress line: every file when verbose, otherwise every 1000th.
        if i % 1000 == 0 or args.verbose:
            print(f"[{i}/{total}] {filepath.name}")

        try:
            was_modified, count = patch_file(filepath, args.dry_run, args.verbose)
        except Exception as e:
            # Best-effort batch run: one unreadable file must not stop the rest.
            error_count += 1
            print(f" ERROR: {filepath.name}: {e}", file=sys.stderr)
        else:
            if was_modified:
                modified_count += 1
                added_count += count

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total}")
    print(f"Files modified: {modified_count}")
    print(f"Fields added: {added_count}")
    if error_count:
        print(f"Files with errors: {error_count}")
    if args.dry_run:
        print("\n[DRY-RUN] No files were actually modified.")


if __name__ == '__main__':
    main()
|