glam/scripts/patch_wikidata_provenance.py
2025-12-30 03:43:31 +01:00

116 lines
3.7 KiB
Python
Executable file

#!/usr/bin/env python3
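"""
Patch existing provenance to add a missing wasDerivedFrom entry.

This is a one-time fix for custodian YAML files whose wikidata_enrichment
provenance lacks the Wikidata entity URL, which was missing due to an
incorrect source_key configuration. The same pass also adds a missing
wasDerivedFrom to youtube_enrichment provenance, taken from its source_url.
"""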
import sys
from pathlib import Path
from datetime import datetime, timezone
# ruamel.yaml is a hard requirement: fail early with an install hint rather
# than with a traceback. Only the import sits inside the try so that errors
# from the configuration below are never mistaken for a missing package.
try:
    from ruamel.yaml import YAML
except ImportError:
    # Diagnostics go to stderr so they do not pollute piped/captured stdout.
    print("Error: ruamel.yaml required. Install with: pip install ruamel.yaml", file=sys.stderr)
    sys.exit(1)

# Shared loader/dumper used by patch_file() for both reading and writing.
yaml = YAML()
yaml.preserve_quotes = True  # Keep the quoting style of existing scalars
yaml.width = 4096  # Prevent line wrapping

# Relative path: it is resolved against the current working directory, so the
# script has to be started from the directory that contains data/custodian.
CUSTODIAN_DIR = Path("data/custodian")
def _add_derived_from(section, source) -> bool:
    """Set ``_provenance.prov.wasDerivedFrom`` on *section*; return True if it was added."""
    if not source or not isinstance(section, dict):
        return False
    provenance = section.get('_provenance')
    if not isinstance(provenance, dict):
        return False
    prov = provenance.get('prov')
    # Never overwrite an existing value, and skip null/malformed prov nodes
    # instead of raising on them.
    if not isinstance(prov, dict) or 'wasDerivedFrom' in prov:
        return False
    prov['wasDerivedFrom'] = source
    return True


def patch_file(filepath: Path, dry_run: bool = False, verbose: bool = False) -> tuple[bool, int]:
    """
    Patch a single file to add missing wasDerivedFrom.

    Two sections are inspected, each only when it carries a
    ``_provenance.prov`` mapping that has no ``wasDerivedFrom`` key yet:

    * ``wikidata_enrichment``: the value is the Wikidata URL built from
      ``wikidata_entity_id``.
    * ``youtube_enrichment``: the value is the section's ``source_url``.

    Sections that are absent, null, or not mappings are skipped.

    Args:
        filepath: YAML file to patch.
        dry_run: When true, compute the changes but do not write the file.
        verbose: When true, print one line per field added.

    Returns (was_modified, count_added)

    Raises:
        OSError: If the file cannot be read or written.
        Exception: Whatever the YAML loader/dumper raises for invalid content.
    """
    filepath = Path(filepath)  # Tolerate plain string paths from callers

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # Empty documents and non-mapping roots (lists, scalars) have nothing to patch.
    if not isinstance(data, dict):
        return False, 0

    count = 0

    # Patch wikidata_enrichment
    we = data.get('wikidata_enrichment')
    entity_id = we.get('wikidata_entity_id') if isinstance(we, dict) else None
    if entity_id and _add_derived_from(we, f"https://www.wikidata.org/wiki/{entity_id}"):
        count += 1
        if verbose:
            print(f" [wikidata_enrichment] Added wasDerivedFrom: {entity_id}")

    # Patch youtube_enrichment
    ye = data.get('youtube_enrichment')
    source_url = ye.get('source_url') if isinstance(ye, dict) else None
    if source_url and _add_derived_from(ye, source_url):
        count += 1
        if verbose:
            print(f" [youtube_enrichment] Added wasDerivedFrom: {source_url[:50]}...")

    modified = count > 0

    if modified and not dry_run:
        # Dump to a sibling temp file and swap it in afterwards: opening the
        # target with 'w' would truncate it first, so a failure mid-dump would
        # destroy the original. The ".tmp" suffix keeps the temp file out of
        # any "*.yaml" glob.
        tmp_path = filepath.with_name(filepath.name + '.tmp')
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f)
            # Carry over the permission bits of the file being replaced.
            tmp_path.chmod(filepath.stat().st_mode & 0o7777)
            tmp_path.replace(filepath)
        finally:
            # No-op after a successful replace; removes the leftover on failure.
            tmp_path.unlink(missing_ok=True)

    return modified, count
def main():
    """
    Command-line entry point: patch every ``*.yaml`` file in CUSTODIAN_DIR.

    Options:
        --dry-run      Report what would change without writing any file.
        --verbose/-v   Print every file name and every field added.
        --limit N      Process only the first N files (sorted by name);
                       0 or omitted means no limit.

    A failure in one file is reported on stderr and does not stop the run.
    The process exits with status 1 when the custodian directory is missing
    or when at least one file failed, and with status 2 (argparse) on invalid
    arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Patch missing wasDerivedFrom in provenance")
    parser.add_argument('--dry-run', action='store_true', help="Show what would be done")
    parser.add_argument('--verbose', '-v', action='store_true', help="Verbose output")
    parser.add_argument('--limit', type=int, help="Limit number of files to process")
    args = parser.parse_args()

    # A negative value would silently slice from the end of the file list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must not be negative")

    # glob() on a missing directory yields nothing, which would look like a
    # successful run over zero files (typically: started from the wrong cwd).
    if not CUSTODIAN_DIR.is_dir():
        print(f"Error: custodian directory not found: {CUSTODIAN_DIR}", file=sys.stderr)
        sys.exit(1)

    files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    if args.limit:
        files = files[:args.limit]

    total = len(files)
    modified_count = 0
    added_count = 0
    error_count = 0

    print(f"Patching {total} files...")
    print(f" Dry run: {args.dry_run}")

    for i, filepath in enumerate(files, 1):
        # Progress line every 1000 files, or for every file in verbose mode.
        if i % 1000 == 0 or args.verbose:
            print(f"[{i}/{total}] {filepath.name}")
        try:
            was_modified, count = patch_file(filepath, args.dry_run, args.verbose)
        except Exception as e:
            # Best effort: one unreadable or malformed file must not abort the batch.
            error_count += 1
            print(f" ERROR: {filepath.name}: {e}", file=sys.stderr)
        else:
            if was_modified:
                modified_count += 1
                added_count += count

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total}")
    print(f"Files modified: {modified_count}")
    print(f"Fields added: {added_count}")
    print(f"Files failed: {error_count}")
    if args.dry_run:
        print("\n[DRY-RUN] No files were actually modified.")

    # Surface partial failure to the calling shell or automation.
    if error_count:
        sys.exit(1)
# Run the patch only when executed as a script, not when imported.
if __name__ == '__main__':
    main()