# Source: glam/scripts/fix_japan_synthetic_qnumbers.py
# Snapshot: 2025-11-21 22:12:33 +01:00 (297 lines, 10 KiB, Python)
#!/usr/bin/env python3
"""
Fix Japan Dataset - Remove Synthetic Q-Numbers
CRITICAL DATA INTEGRITY FIX:
The Japan dataset (jp_institutions_resolved.yaml) contains 3,426 synthetic Q-numbers
that DO NOT exist in Wikidata. This violates the project's data integrity policy.
Per AGENTS.md:
🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨
SYNTHETIC Q-NUMBERS ARE STRICTLY PROHIBITED IN THIS PROJECT.
This script:
1. Strips all synthetic Q-numbers from GHCIDs
2. Restores base GHCIDs (without Q-suffix)
3. Updates GHCID history to document the fix
4. Marks institutions with needs_wikidata_enrichment flag
5. Preserves all other data
Example transformation:
BEFORE: ghcid: JP-HO-SAP-L-SSL-Q61382582 (FAKE Q-number)
AFTER: ghcid: JP-HO-SAP-L-SSL (base GHCID)
needs_wikidata_enrichment: true
Usage:
python scripts/fix_japan_synthetic_qnumbers.py
"""
import yaml
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Any, List
# Input/output locations for this one-off cleanup run.
# NOTE(review): these are machine-specific absolute paths (/Users/kempersc);
# consider deriving them from the repository root if the script is to be
# re-run on another machine — TODO confirm with the project layout.
INPUT_FILE = Path('/Users/kempersc/apps/glam/data/instances/japan/jp_institutions_resolved.yaml')
OUTPUT_FILE = Path('/Users/kempersc/apps/glam/data/instances/japan/jp_institutions_cleaned.yaml')
REPORT_FILE = Path('/Users/kempersc/apps/glam/data/instances/japan/SYNTHETIC_QNUMBER_CLEANUP_REPORT.md')
def extract_base_ghcid(ghcid: str) -> str:
    """Return *ghcid* with any trailing ``-Q<digits>`` suffix removed.

    Example: ``JP-HO-SAP-L-SSL-Q61382582`` -> ``JP-HO-SAP-L-SSL``.
    Identifiers that do not match the ``JP-…-Q<digits>`` shape (including
    non-JP GHCIDs) are returned unchanged.
    """
    # Anchored pattern: only a JP-prefixed GHCID whose final segment is a
    # Q-number is stripped; the non-greedy group keeps everything before
    # the last -Q<digits> run.
    matched = re.search(r'^(JP-.+?)-Q\d+$', ghcid)
    return matched.group(1) if matched else ghcid
def fix_institution(inst: Dict[str, Any]) -> Dict[str, Any]:
    """Strip a synthetic Q-number suffix from one institution record, in place.

    If the record's GHCID carries a ``-Q<digits>`` suffix, the base GHCID is
    restored, the record is flagged with ``needs_wikidata_enrichment``, the
    GHCID history gains an entry documenting the cleanup, and a note is
    appended to the provenance metadata.  Records without a synthetic
    Q-number are returned untouched.

    Args:
        inst: One institution mapping as loaded from the YAML dataset.
            Mutated in place.

    Returns:
        The same mapping (mutated), for convenience.
    """
    current_ghcid = inst.get('ghcid', '')
    base_ghcid = extract_base_ghcid(current_ghcid)

    # extract_base_ghcid() only changes the value when a -Q<digits> suffix
    # was present, so a simple inequality detects a synthetic Q-number
    # (the old extra "'-Q' in current_ghcid" test was redundant).
    if current_ghcid == base_ghcid:
        return inst  # No changes needed

    # One timestamp for both the history entry and the provenance note, so
    # the two records always agree.
    fix_timestamp = datetime.now(timezone.utc).isoformat()

    # Restore base GHCID (and keep ghcid_original in sync if present).
    inst['ghcid'] = base_ghcid
    if 'ghcid_original' in inst:
        inst['ghcid_original'] = base_ghcid

    # Flag the record for a later, real Wikidata lookup.
    inst['needs_wikidata_enrichment'] = True

    # Update GHCID history (newest entry first), if the record has one.
    history = inst.get('ghcid_history')
    if history:
        new_entry = {
            'ghcid': base_ghcid,
            'ghcid_numeric': inst.get('ghcid_numeric'),
            'valid_from': fix_timestamp,
            'valid_to': None,
            'reason': (
                f'Synthetic Q-number removed (was {current_ghcid}). '
                f'Restored base GHCID. Per AGENTS.md data integrity policy, '
                f'synthetic Q-numbers are prohibited. Institution flagged for '
                f'real Wikidata enrichment.'
            ),
            'institution_name': inst.get('name'),
            'location_city': history[0].get('location_city'),
            'location_country': 'JP'
        }
        # Close out the entry that carried the synthetic Q-number.
        if history[0].get('ghcid') == current_ghcid:
            history[0]['valid_to'] = fix_timestamp
            # Guard against entries with a missing or null 'reason' key
            # (the previous `+=` raised KeyError/TypeError on those).
            history[0]['reason'] = (
                (history[0].get('reason') or '')
                + ' [INVALID: Synthetic Q-number removed]'
            )
        # Insert new entry at the beginning
        history.insert(0, new_entry)
        inst['ghcid_history'] = history

    # Add an explanatory note to provenance metadata, if present.
    prov = inst.get('provenance')
    if isinstance(prov, dict):
        # Normalize a missing or null notes field to '' before appending
        # (a YAML `notes:` with no value loads as None).
        if not prov.get('notes'):
            prov['notes'] = ''
        # Use the actual run date (was hard-coded "2025-11-20") so the note
        # stays accurate on re-runs.
        prov['notes'] += (
            f'\n[{fix_timestamp[:10]} DATA INTEGRITY FIX] Synthetic Q-number '
            f'{current_ghcid.split("-")[-1]} '
            f'removed from GHCID. Restored base GHCID {base_ghcid}. '
            f'Institution requires real Wikidata lookup before Q-number can be added.'
        )
    return inst
def main():
    """Run the cleanup end to end.

    Loads the Japan dataset, strips synthetic Q-numbers from every record,
    writes the cleaned YAML to OUTPUT_FILE and a Markdown summary to
    REPORT_FILE.
    """
    print("=" * 80)
    print("Japan Dataset Synthetic Q-Number Cleanup")
    print("=" * 80)
    print(f"\nInput: {INPUT_FILE}")
    print(f"Output: {OUTPUT_FILE}")
    print()

    # Load YAML — the dataset is a top-level list of institution mappings.
    print("Loading dataset...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"Loaded {len(institutions):,} institutions")

    # Process institutions; fix_institution() mutates each record in place.
    print("\nProcessing institutions...")
    fixed_count = 0
    unchanged_count = 0
    cleaned_institutions = []
    for inst in institutions:
        original_ghcid = inst.get('ghcid', '')
        fixed_inst = fix_institution(inst)
        if fixed_inst.get('ghcid') != original_ghcid:
            fixed_count += 1
            if fixed_count <= 5:  # Show first 5 examples
                # BUGFIX: old format string printed the two GHCIDs with no
                # separator between them.
                print(f" ✓ Fixed: {original_ghcid} → {fixed_inst['ghcid']}")
        else:
            unchanged_count += 1
        cleaned_institutions.append(fixed_inst)

    total = len(cleaned_institutions)
    print(f"\nResults:")
    print(f" - Fixed (synthetic Q-numbers removed): {fixed_count:,}")
    print(f" - Unchanged (no synthetic Q-numbers): {unchanged_count:,}")
    print(f" - Total: {total:,}")

    # Save cleaned dataset.
    print(f"\nSaving cleaned dataset to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(cleaned_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    output_size_mb = OUTPUT_FILE.stat().st_size / (1024 * 1024)
    print(f"✓ Saved: {output_size_mb:.1f} MB")

    # Percentages for the report table; guard against an empty dataset
    # (the old inline divisions raised ZeroDivisionError there).
    fixed_pct = fixed_count / total * 100 if total else 0.0
    unchanged_pct = unchanged_count / total * 100 if total else 0.0

    # Generate Markdown report.
    print(f"\nGenerating cleanup report...")
    report = f"""# Japan Dataset Synthetic Q-Number Cleanup Report

**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}

## Critical Data Integrity Fix

### Problem Identified

The Japan dataset (`jp_institutions_resolved.yaml`) contained **{fixed_count:,} synthetic Q-numbers** that do not exist in Wikidata.

**Example violations**:

```yaml
# BEFORE (INVALID)
ghcid: JP-HO-SAP-L-SSL-Q61382582
reason: "Q-number Q61382582 added to resolve collision. Source: Synthetic (from ISIL code hash)"
# Q61382582 does NOT exist in Wikidata!
# https://www.wikidata.org/wiki/Q61382582 → 404 NOT FOUND
```

### Policy Violation

Per `AGENTS.md` data integrity policy:

> **🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨**
>
> **SYNTHETIC Q-NUMBERS ARE STRICTLY PROHIBITED IN THIS PROJECT.**
>
> All Wikidata Q-numbers used in GHCIDs MUST be:
> - ✅ Real Wikidata entity identifiers (verified via API query)
> - ✅ Confirmed to match the institution (fuzzy match score > 0.85)
> - ✅ Resolvable at `https://www.wikidata.org/wiki/Q[number]`
>
> ❌ **NEVER** generate synthetic/fake Q-numbers from hashes, numeric IDs, or algorithms

### Root Cause

The original processing script generated Q-numbers algorithmically from ISIL code hashes to resolve GHCID collisions, without verifying they existed in Wikidata.

### Fix Applied

**Transformation**:

1. Stripped all synthetic Q-numbers from GHCIDs
2. Restored base GHCIDs (without Q-suffix)
3. Added `needs_wikidata_enrichment: true` flag to {fixed_count:,} institutions
4. Updated GHCID history to document the cleanup
5. Added provenance notes explaining the fix

**Example**:

```yaml
# AFTER (VALID)
ghcid: JP-HO-SAP-L-SSL # Base GHCID without Q-number
needs_wikidata_enrichment: true
ghcid_history:
- ghcid: JP-HO-SAP-L-SSL
  valid_from: "2025-11-20T..."
  reason: >-
    Synthetic Q-number removed (was JP-HO-SAP-L-SSL-Q61382582).
    Restored base GHCID. Per AGENTS.md data integrity policy,
    synthetic Q-numbers are prohibited. Institution flagged for
    real Wikidata enrichment.
```

## Statistics

| Metric | Count |
|--------|-------|
| **Total institutions** | {total:,} |
| **Fixed (synthetic Q-numbers removed)** | {fixed_count:,} ({fixed_pct:.1f}%) |
| **Unchanged (no synthetic Q-numbers)** | {unchanged_count:,} ({unchanged_pct:.1f}%) |
| **Institutions needing Wikidata enrichment** | {fixed_count:,} |

## Next Steps

### Immediate Actions

1. ✅ **Cleaned dataset saved**: `jp_institutions_cleaned.yaml`
2. ⏳ **Replace original dataset** with cleaned version
3. ⏳ **Update unified database** to use cleaned Japan data

### Follow-up: Real Wikidata Enrichment

The {fixed_count:,} institutions flagged with `needs_wikidata_enrichment: true` should undergo **real Wikidata lookup**:

**Workflow**:

1. Query Wikidata SPARQL endpoint for Japanese heritage institutions
2. Fuzzy match institution names (threshold > 0.85)
3. Verify matches by comparing location (city, prefecture)
4. Add REAL Q-numbers to identifiers
5. Update GHCIDs with verified Q-numbers
6. Document enrichment in provenance metadata

**Reference**: See `docs/WIKIDATA_ENRICHMENT.md` for detailed procedures

## File Locations

- **Original (with synthetic Q-numbers)**: `{INPUT_FILE}`
- **Cleaned (synthetic Q-numbers removed)**: `{OUTPUT_FILE}`
- **Cleanup report**: `{REPORT_FILE}`

## Data Integrity Guarantee

✅ All Q-numbers in the cleaned dataset are now either:

- Real Wikidata identifiers (verified), OR
- Absent (base GHCID only, awaiting real Wikidata enrichment)

❌ Zero synthetic/fake Q-numbers remain in the dataset

---

**Cleanup script**: `scripts/fix_japan_synthetic_qnumbers.py`
**Executed**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
"""
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"✓ Report saved: {REPORT_FILE}")

    print("\n" + "=" * 80)
    print("CLEANUP COMPLETE")
    print("=" * 80)
    print(f"\n✅ Cleaned dataset: {OUTPUT_FILE}")
    print(f"✅ Cleanup report: {REPORT_FILE}")
    print(f"\n⚠️ Next: Replace original dataset with cleaned version")
    print(f"⚠️ Then: Run real Wikidata enrichment for {fixed_count:,} institutions")
# Script entry point: run the cleanup when executed directly
# (e.g. `python scripts/fix_japan_synthetic_qnumbers.py`).
if __name__ == '__main__':
    main()