255 lines
8 KiB
Python
255 lines
8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fast patch for missing wasDerivedFrom fields in YAML enrichment sections.
|
|
|
|
Uses regex-based detection for speed, then ruamel.yaml only for files that need changes.
|
|
|
|
Usage:
|
|
python scripts/patch_derived_from_fast.py
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from ruamel.yaml import YAML # type: ignore
|
|
from ruamel.yaml.comments import CommentedMap # type: ignore
|
|
except ImportError:
|
|
print("ERROR: ruamel.yaml not installed. Run: pip install ruamel.yaml")
|
|
sys.exit(1)
|
|
|
|
|
|
def needs_youtube_patch(content: str) -> bool:
    """Heuristically decide whether ``youtube_enrichment`` may lack ``wasDerivedFrom``.

    This is a cheap text scan used to pre-filter files before the (slow)
    YAML parse; a True result is only a candidate and must be confirmed on
    the parsed document.

    Args:
        content: Raw YAML text of a single file.

    Returns:
        True when the text contains a ``youtube_enrichment`` section with a
        ``_provenance`` entry, a ``channel_url``/``channel_id`` somewhere in
        the file, and no ``wasDerivedFrom`` inside the provenance ``prov``
        subsection (including the case where ``prov`` is absent altogether).
    """
    if 'youtube_enrichment:' not in content:
        return False
    # Without a channel URL or id there is nothing to derive a source URL from.
    if 'channel_url:' not in content and 'channel_id:' not in content:
        return False

    # The section runs up to the next line starting with a word character
    # (i.e. the next unindented key) or the end of the text.
    yt_match = re.search(r'youtube_enrichment:.*?(?=\n\w|\Z)', content, re.DOTALL)
    if yt_match is None:
        return False

    yt_section = yt_match.group()
    if '_provenance:' not in yt_section:
        return False

    prov_match = re.search(r'_provenance:.*?prov:.*?(?=\n \w|\n \w|\Z)', yt_section, re.DOTALL)
    if prov_match is None:
        # _provenance exists but has no prov subsection, so wasDerivedFrom
        # cannot be present; patch_youtube_section creates prov in that case,
        # so the file must still be reported as a candidate.
        return True

    return 'wasDerivedFrom:' not in prov_match.group()
|
|
|
|
|
|
def needs_zcbs_patch(content: str) -> bool:
    """Heuristically decide whether ``zcbs_enrichment`` may lack ``wasDerivedFrom``.

    This is a cheap text scan used to pre-filter files before the (slow)
    YAML parse; a True result is only a candidate and must be confirmed on
    the parsed document.

    Args:
        content: Raw YAML text of a single file.

    Returns:
        True when the text contains a ``zcbs_enrichment`` section with a
        ``_provenance`` entry and no ``wasDerivedFrom`` inside the provenance
        ``prov`` subsection (including the case where ``prov`` is absent
        altogether).
    """
    if 'zcbs_enrichment:' not in content:
        return False

    # The section runs up to the next line starting with a word character
    # (i.e. the next unindented key) or the end of the text.
    zcbs_match = re.search(r'zcbs_enrichment:.*?(?=\n\w|\Z)', content, re.DOTALL)
    if zcbs_match is None:
        return False

    zcbs_section = zcbs_match.group()
    if '_provenance:' not in zcbs_section:
        return False

    prov_match = re.search(r'_provenance:.*?prov:.*?(?=\n \w|\n \w|\Z)', zcbs_section, re.DOTALL)
    if prov_match is None:
        # _provenance exists but has no prov subsection, so wasDerivedFrom
        # cannot be present; patch_zcbs_section creates prov in that case,
        # so the file must still be reported as a candidate.
        return True

    return 'wasDerivedFrom:' not in prov_match.group()
|
|
|
|
|
|
def patch_youtube_section(section: dict) -> bool:
    """Add ``wasDerivedFrom`` (and missing companions) to a youtube_enrichment section.

    The source URL is taken from ``channel_url``, then ``source_url``, then
    built from ``channel_id``. The section is only mutated when a patch is
    actually applied; in particular no empty ``prov`` mapping is attached
    when no URL can be determined.

    Args:
        section: Parsed ``youtube_enrichment`` mapping; modified in place.

    Returns:
        True if ``wasDerivedFrom`` was added, False if the section has no
        ``_provenance``, already has ``wasDerivedFrom``, or offers no URL.
    """
    provenance = section.get('_provenance')
    if not provenance:
        return False

    prov = provenance.get('prov')
    if prov and prov.get('wasDerivedFrom'):
        return False

    # Resolve the URL before touching the document so a failed lookup
    # leaves the section exactly as it was.
    url = section.get('channel_url') or section.get('source_url')
    if not url and section.get('channel_id'):
        url = f"https://www.youtube.com/channel/{section['channel_id']}"
    if not url:
        return False

    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov

    prov['wasDerivedFrom'] = url

    if not prov.get('generatedAtTime'):
        # Prefer the recorded fetch time; fall back to "now" in UTC.
        prov['generatedAtTime'] = section.get('fetch_timestamp') or datetime.now(timezone.utc).isoformat()

    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = 'youtube_api_fetch'
        generated_by['used'] = 'https://www.googleapis.com/youtube/v3'
        prov['wasGeneratedBy'] = generated_by

    return True
|
|
|
|
|
|
def patch_zcbs_section(section: dict) -> bool:
    """Add ``wasDerivedFrom`` (and missing companions) to a zcbs_enrichment section.

    The source URL is taken from ``source``; failing that from
    ``platform_urls`` (``website``, then ``catalog``, then the first value);
    failing that it is built from ``zcbs_id``. The section is only mutated
    when a patch is actually applied; in particular no empty ``prov``
    mapping is attached when no URL can be determined.

    Args:
        section: Parsed ``zcbs_enrichment`` mapping; modified in place.

    Returns:
        True if ``wasDerivedFrom`` was added, False if the section has no
        ``_provenance``, already has ``wasDerivedFrom``, or offers no URL.
    """
    provenance = section.get('_provenance')
    if not provenance:
        return False

    prov = provenance.get('prov')
    if prov and prov.get('wasDerivedFrom'):
        return False

    # Resolve the URL before touching the document so a failed lookup
    # leaves the section exactly as it was.
    url = section.get('source')
    if not url:
        platform_urls = section.get('platform_urls', {})
        if platform_urls:
            url = (
                platform_urls.get('website')
                or platform_urls.get('catalog')
                or next(iter(platform_urls.values()), None)
            )
    if not url and section.get('zcbs_id'):
        url = f"https://www.zcbs.nl/organisatie/{section['zcbs_id']}"
    if not url:
        return False

    if not prov:
        prov = CommentedMap()
        provenance['prov'] = prov

    prov['wasDerivedFrom'] = url

    if not prov.get('generatedAtTime'):
        # Prefer the recorded enrichment time; fall back to "now" in UTC.
        prov['generatedAtTime'] = section.get('enrichment_timestamp') or datetime.now(timezone.utc).isoformat()

    if not prov.get('wasGeneratedBy'):
        generated_by = CommentedMap()
        generated_by['@type'] = 'prov:Activity'
        generated_by['name'] = 'zcbs_registry_fetch'
        generated_by['used'] = 'https://www.zcbs.nl'
        prov['wasGeneratedBy'] = generated_by

    return True
|
|
|
|
|
|
def process_file(filepath: Path, yaml: YAML) -> dict:
    """Patch one YAML file in place if its enrichment sections need it.

    The file text is read once, screened with the regex heuristics, and only
    parsed with ruamel.yaml when a heuristic fires. The patched document is
    serialized to memory first, so a serialization failure cannot leave a
    truncated file behind.

    Args:
        filepath: YAML file to inspect and possibly rewrite.
        yaml: Configured ruamel.yaml ``YAML`` instance used to load and dump.

    Returns:
        A dict with keys ``modified`` (file was rewritten), ``youtube`` and
        ``zcbs`` (which sections were patched) and ``error`` (message string,
        or None when nothing went wrong).
    """
    # Local import keeps this block self-contained; only the write path uses it.
    from io import StringIO

    result = {'modified': False, 'youtube': False, 'zcbs': False, 'error': None}

    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        result['error'] = str(e)
        return result

    # Quick check if file needs patching
    needs_yt = needs_youtube_patch(content)
    needs_zc = needs_zcbs_patch(content)
    if not needs_yt and not needs_zc:
        return result

    # Parse the text already in memory rather than reading the file again.
    try:
        data = yaml.load(content)
    except Exception as e:
        result['error'] = str(e)
        return result

    # The heuristics are text-based; a document that is not a mapping
    # (empty file, list, scalar) has no sections to patch.
    if not isinstance(data, dict):
        return result

    modified = False
    try:
        if needs_yt and 'youtube_enrichment' in data:
            if patch_youtube_section(data['youtube_enrichment']):
                result['youtube'] = True
                modified = True

        if needs_zc and 'zcbs_enrichment' in data:
            if patch_zcbs_section(data['zcbs_enrichment']):
                result['zcbs'] = True
                modified = True
    except (AttributeError, TypeError) as e:
        # A section or its _provenance is not a mapping; report it instead
        # of aborting the whole batch.
        result['error'] = f"Patch error: {e}"
        return result

    if modified:
        try:
            # Serialize before opening the target: opening in 'w' mode
            # truncates the file, so dumping straight into it would destroy
            # the original if serialization failed part-way.
            buffer = StringIO()
            yaml.dump(data, buffer)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(buffer.getvalue())
            result['modified'] = True
        except Exception as e:
            result['error'] = f"Write error: {e}"

    return result
|
|
|
|
|
|
def main():
    """Scan ``data/custodian/*.yaml`` and patch files missing ``wasDerivedFrom``.

    Runs in two passes: a fast regex scan over every file to collect
    candidates, then a full ruamel.yaml load/patch/dump for the candidates
    only. Progress, per-file errors and a summary are printed to stdout.
    The data directory is resolved relative to this script's parent
    directory.
    """
    yaml = YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    # Very wide line width so long values are not re-wrapped on dump.
    yaml.width = 4096

    script_dir = Path(__file__).parent
    base_dir = script_dir.parent
    custodian_dir = base_dir / 'data' / 'custodian'

    # Sorted so runs process files in a reproducible order.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))
    total_files = len(yaml_files)

    print(f"Scanning {total_files} files for missing wasDerivedFrom...")

    # First pass: identify files that need patching (fast regex scan)
    candidates = []
    unreadable = 0
    for i, filepath in enumerate(yaml_files):
        if (i + 1) % 5000 == 0:
            print(f" Scan progress: {i + 1}/{total_files}")
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError):
            # Skip files that cannot be read or decoded, but keep count so
            # the skip is visible; anything else (e.g. Ctrl-C) propagates.
            unreadable += 1
            continue
        if needs_youtube_patch(content) or needs_zcbs_patch(content):
            candidates.append(filepath)

    if unreadable:
        print(f" WARNING: skipped {unreadable} unreadable file(s) during scan")

    print(f"\nFound {len(candidates)} files that may need patching.")
    print("Processing with full YAML parser...")

    stats = {
        'files_modified': 0,
        'youtube_patched': 0,
        'zcbs_patched': 0,
        'errors': 0,
    }

    # Second pass: parse, patch and rewrite only the candidates.
    for i, filepath in enumerate(candidates):
        if (i + 1) % 100 == 0:
            print(f" Patch progress: {i + 1}/{len(candidates)}")

        result = process_file(filepath, yaml)

        if result['error']:
            stats['errors'] += 1
            print(f" ERROR: {filepath.name}: {result['error']}")
        elif result['modified']:
            stats['files_modified'] += 1
            if result['youtube']:
                stats['youtube_patched'] += 1
            if result['zcbs']:
                stats['zcbs_patched'] += 1

    print()
    print("=" * 60)
    print("PATCH SUMMARY")
    print("=" * 60)
    print(f"Files modified: {stats['files_modified']:,}")
    print(f"YouTube patched: {stats['youtube_patched']:,}")
    print(f"ZCBS patched: {stats['zcbs_patched']:,}")
    print(f"Errors: {stats['errors']:,}")
    print()
    print(f"Total wasDerivedFrom fields added: {stats['youtube_patched'] + stats['zcbs_patched']:,}")
|
|
|
|
|
|
# Run the patcher only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|