glam/scripts/remove_wikidata_duplicates.py
kempersc 486bbee813 feat(wikidata): add re-enrichment and duplicate removal scripts
- Add reenrich_wikidata_with_verification.py for re-running enrichment
- Add remove_wikidata_duplicates.py for deduplication
2025-12-08 14:59:38 +01:00

145 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.
These files have incorrect Wikidata entity ID assignments where the same Q-number
was incorrectly assigned to multiple different institutions.
The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Dedicated dumper class for this script: the custom str representer is
# registered on it, and it is the Dumper handed to yaml.dump when files are
# written back (key order itself comes from sort_keys=False at that call).
class OrderedDumper(yaml.SafeDumper):
    """SafeDumper subclass used for the YAML output written by this script."""
def represent_ordereddict(dumper, data):
    """Represent a mapping as a plain YAML map node.

    Passes the mapping's ``items()`` view to ``dumper.represent_mapping``
    under the standard ``tag:yaml.org,2002:map`` tag and returns whatever
    that call returns.
    """
    map_tag = 'tag:yaml.org,2002:map'
    pairs = data.items()
    return dumper.represent_mapping(map_tag, pairs)
def str_representer(dumper, data):
    """Represent a string, requesting literal block style for multi-line text.

    Strings containing a newline are passed to ``dumper.represent_scalar``
    with ``style='|'``; all other strings are passed without a style argument.
    """
    str_tag = 'tag:yaml.org,2002:str'
    if '\n' not in data:
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style='|')
# Register str_representer for the built-in str type on OrderedDumper, so that
# strings dumped with Dumper=OrderedDumper are emitted through it.
OrderedDumper.add_representer(str, str_representer)
def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove the ``wikidata_enrichment`` section from one YAML file, in place.

    The file is parsed with ``yaml.safe_load``, the top-level
    ``wikidata_enrichment`` key is deleted, a note describing the removal is
    stored under ``provenance.notes`` (appended to a list, joined onto an
    existing string, or created), and the parsed data is re-serialized with
    ``OrderedDumper`` and written back to the same path.

    The file is only rewritten after every check has passed and the new
    content has been fully serialized in memory.

    Args:
        file_path: Path of the YAML file to modify.

    Returns:
        tuple of (success: bool, message: str). ``success`` is False, and the
        file is left untouched, when the document is empty, is not a mapping,
        has no ``wikidata_enrichment`` key, has a ``provenance.notes`` value of
        an unsupported type, or when any exception is raised while reading,
        parsing or serializing (the message is then ``"Error: <exception>"``).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f.read())

        if data is None:
            return False, "Empty or invalid YAML"
        # A list or scalar at the top level cannot carry the keys edited below;
        # report it explicitly instead of relying on `in` / `del` misbehaving.
        if not isinstance(data, dict):
            return False, f"Error: top-level YAML is {type(data).__name__}, expected a mapping"

        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Remove the section, keeping the old ID (if any) for the note.
        enrichment = data.pop('wikidata_enrichment')
        old_wikidata_id = None
        if isinstance(enrichment, dict):
            old_wikidata_id = enrichment.get('wikidata_id')

        # Build the provenance note.
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). "
        note += "Re-enrichment required with proper matching."

        # A missing key and an empty `provenance:` entry (parsed as None) are
        # both treated as "no provenance yet", so the note is never dropped.
        provenance = data.get('provenance')
        if provenance is None:
            provenance = {}
            data['provenance'] = provenance

        note_warning = ""
        if isinstance(provenance, dict):
            existing_notes = provenance.get('notes')
            if isinstance(existing_notes, list):
                existing_notes.append(note)
            elif not existing_notes:
                provenance['notes'] = note
            elif isinstance(existing_notes, str):
                provenance['notes'] = existing_notes + '\n\n' + note
            else:
                # Nothing has been written yet, so bailing out leaves the file intact.
                return False, (
                    "Error: provenance.notes has unsupported type "
                    f"{type(existing_notes).__name__}"
                )
        else:
            # The removal still goes ahead, but the missing note is reported
            # to the caller instead of being dropped silently.
            note_warning = (
                "; provenance note NOT added (provenance is "
                f"{type(provenance).__name__}, not a mapping)"
            )

        # Serialize completely before opening the file for writing: mode 'w'
        # truncates immediately, so a failure while dumping must not be able
        # to leave an empty or half-written file behind.
        serialized = yaml.dump(data, Dumper=OrderedDumper, allow_unicode=True,
                               default_flow_style=False, sort_keys=False, width=120)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id}){note_warning}"
    except Exception as e:
        # Per-file boundary for a batch run: report the failure to the caller
        # rather than aborting the whole run.
        return False, f"Error: {e}"
# Default locations used when main() is called without arguments.
_DEFAULT_LIST_FILE = Path('/tmp/wikidata_duplicates_to_clean.txt')
_DEFAULT_BASE_DIR = Path('/Users/kempersc/apps/glam/data/custodian')


def main(list_file=_DEFAULT_LIST_FILE, base_dir=_DEFAULT_BASE_DIR):
    """
    Clean every file named in the list file and print a summary.

    Each non-blank line of ``list_file`` is treated as a filename relative to
    ``base_dir``. Files that do not exist, or that have no
    ``wikidata_enrichment`` section, are counted as skipped; any other failure
    reported by ``remove_wikidata_enrichment`` is counted as an error.

    Args:
        list_file: Text file with one filename per line (str or Path).
        base_dir: Directory the listed filenames are resolved against
            (str or Path).

    Exits with status 1 (via ``sys.exit``) if ``list_file`` does not exist.
    """
    list_file = Path(list_file)
    base_dir = Path(base_dir)

    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    # Read with an explicit encoding, matching how the YAML files themselves
    # are opened, instead of depending on the platform default.
    with open(list_file, 'r', encoding='utf-8') as f:
        filenames = [line.strip() for line in f if line.strip()]

    total = len(filenames)
    print(f"Found {total} files to clean")

    success_count = 0
    skip_count = 0
    error_count = 0

    for position, filename in enumerate(filenames, start=1):
        progress = f"[{position}/{total}]"
        file_path = base_dir / filename

        if not file_path.exists():
            print(f"{progress} SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)
        if success:
            print(f"{progress} OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            # Nothing to remove is a skip, not a failure.
            print(f"{progress} SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"{progress} ERROR: {filename} - {message}")
            error_count += 1

    print("\n=== Summary ===")
    print(f"Total files: {total}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()