#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.

These files have incorrect Wikidata entity ID assignments where the same Q-number
was incorrectly assigned to multiple different institutions.

The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""
# Standard library
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

# Third party: PyYAML, used to parse and re-serialize the custodian records
import yaml
class OrderedDumper(yaml.SafeDumper):
    """SafeDumper subclass that custom representers are attached to.

    Kept separate from yaml.SafeDumper itself so the representers
    registered below do not leak into other users of SafeDumper;
    used to preserve key order in the YAML output.
    """
def represent_ordereddict(dumper, data):
    """Represent a mapping as a plain YAML map while keeping insertion order.

    NOTE(review): defined but never registered on OrderedDumper in this
    file — confirm whether it should be wired up for OrderedDict.
    """
    items = data.items()
    return dumper.represent_mapping('tag:yaml.org,2002:map', items)
def str_representer(dumper, data):
    """Represent strings, using literal block style ('|') for multi-line text.

    style=None is the PyYAML default, so single-line strings are emitted
    exactly as they would be without this representer.
    """
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)
# Register the string representer on OrderedDumper so multi-line strings
# are written in literal block style when files are dumped below.
OrderedDumper.add_representer(str, str_representer)
|
|
|
|
def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove the wikidata_enrichment section from one YAML file and record
    the removal in the file's provenance notes.

    Returns:
        tuple of (success: bool, message: str)
    """
    try:
        raw = file_path.read_text(encoding='utf-8')
        record = yaml.safe_load(raw)

        if record is None:
            return False, "Empty or invalid YAML"

        if 'wikidata_enrichment' not in record:
            return False, "No wikidata_enrichment found"

        # Capture the old Q-number (when the section is a mapping) while
        # removing the section itself.
        enrichment = record.pop('wikidata_enrichment')
        old_wikidata_id = enrichment.get('wikidata_id') if isinstance(enrichment, dict) else None

        # Build a provenance note documenting what was removed and why.
        timestamp = datetime.now(timezone.utc).isoformat()
        parts = [f"Removed incorrect wikidata_enrichment on {timestamp}. "]
        if old_wikidata_id:
            parts.append(f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). ")
        parts.append("Re-enrichment required with proper matching.")
        note = ''.join(parts)

        # Append the note to provenance, handling notes stored as either a
        # list or a string (or missing entirely).
        provenance = record.setdefault('provenance', {})
        if isinstance(provenance, dict):
            existing = provenance.get('notes', '')
            if isinstance(existing, list):
                existing.append(note)
                provenance['notes'] = existing
            elif existing:
                provenance['notes'] = existing + '\n\n' + note
            else:
                provenance['notes'] = note

        # Write the cleaned record back, preserving key order and unicode.
        with open(file_path, 'w', encoding='utf-8') as out:
            yaml.dump(record, out, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:  # best-effort per file: report the failure, let caller continue
        return False, f"Error: {e}"
def main():
    """Clean every file listed in the duplicates list and print a summary.

    Reads filenames from /tmp/wikidata_duplicates_to_clean.txt, resolves each
    against the custodian data directory, strips wikidata_enrichment via
    remove_wikidata_enrichment(), and reports OK/SKIP/ERROR counts.
    """
    # Read the list of files to clean (one custodian filename per line).
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0
    total = len(filenames)

    for i, filename in enumerate(filenames, start=1):
        file_path = base_dir / filename

        if not file_path.exists():
            # FIX: original printed a literal "(unknown)" placeholder here
            # (and below) instead of the actual filename.
            print(f"[{i}/{total}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)

        if success:
            print(f"[{i}/{total}] OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            # Not an error: the file was already clean.
            print(f"[{i}/{total}] SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"[{i}/{total}] ERROR: {filename} - {message}")
            error_count += 1

    print("\n=== Summary ===")
    print(f"Total files: {total}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")


if __name__ == '__main__':
    main()