# glam/scripts/migrate_claims_to_canonical.py
# Snapshot metadata: 2025-12-02 14:36:01 +01:00, 462 lines, 15 KiB, Python
#!/usr/bin/env python3
"""
Migrate web claims from 1,456 ad-hoc types to canonical types.
This script:
1. Reads existing web_enrichment claims from NDE entry YAML files
2. Maps claim types to canonical types (from CanonicalClaimTypes.yaml)
3. Drops metadata fields that aren't institution claims
4. Converts nested claims (branches_0_name) to structured arrays
5. Validates that TIER 3 claims have XPath provenance
6. Writes updated entries with standardized claims
Usage:
# Analyze without writing changes
python scripts/migrate_claims_to_canonical.py --dry-run
# Migrate all entries
python scripts/migrate_claims_to_canonical.py
# Migrate single entry
python scripts/migrate_claims_to_canonical.py --entry 0001
"""
import argparse
import logging
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
# Module-wide logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
# ========================================
# CANONICAL TYPE MAPPINGS
# ========================================
# Direct mappings from old types to canonical types.
# Keys are looked up after lowercasing/stripping the incoming type; a second
# pass also compares with all underscores removed (see
# ClaimMigrator.map_claim_type), so e.g. "fullname" resolves via 'full_name'.
CANONICAL_MAPPINGS = {
# Identity claims -> full_name or short_name
'org_name': 'full_name',
'custodian_name': 'full_name',
'full_name': 'full_name',
'organization_name': 'full_name',
'museum_name': 'full_name',
'operating_name': 'full_name',
'legal_name': 'full_name',
'name': 'full_name',
'brand_name': 'short_name',
'short_name': 'short_name',
'abbreviation': 'short_name',
# Description claims
'description': 'description',
'description_short': 'description',
'mission': 'description',
'slogan': 'description',
'tagline': 'description',
'about': 'description',
'history': 'description',
# Contact claims
'email': 'email',
'phone': 'phone',
'telephone': 'phone',
'address': 'address',
'postal_code': 'postal_code',
'city': 'city',
'street_address': 'address',
# Social media normalization
'facebook': 'social_facebook',
'social_facebook': 'social_facebook',
'social_media_facebook': 'social_facebook',
'instagram': 'social_instagram',
'social_instagram': 'social_instagram',
'social_media_instagram': 'social_instagram',
'twitter': 'social_twitter',
'social_twitter': 'social_twitter',
'social_media_twitter': 'social_twitter',
'x': 'social_twitter',
'linkedin': 'social_linkedin',
'social_linkedin': 'social_linkedin',
'social_media_linkedin': 'social_linkedin',
'youtube': 'social_youtube',
'social_youtube': 'social_youtube',
'social_media_youtube': 'social_youtube',
# Website
'website': 'website',
'social_media_website': 'website',
'source_urls': 'website',
'url': 'website',
# Identifiers
'isil_code': 'isil_code',
'isil': 'isil_code',
'kvk_number': 'kvk_number',
'kvk': 'kvk_number',
'wikidata_id': 'wikidata_id',
'wikidata': 'wikidata_id',
# NOTE(review): RSIN is a distinct Dutch tax identifier, not a KvK number;
# folding it into kvk_number loses that distinction -- confirm intended.
'rsin': 'kvk_number',
# Organization metadata
'organization_type': 'organization_type',
'institution_type': 'organization_type',
'type': 'organization_type',
'legal_form': 'legal_form',
'rechtsvorm': 'legal_form',
'founding_date': 'founding_date',
'founded': 'founding_date',
'founding_year': 'founding_date',
'established': 'founding_date',
'opening_hours': 'opening_hours',
'hours': 'opening_hours',
# Structural (TIER 1) - from docling
'page_title': 'page_title',
'page_count': 'page_count',
'image_count': 'image_count',
'table_count': 'table_count',
'markdown_length': 'markdown_length',
# Pattern-based (TIER 2)
'main_h1': 'main_h1',
'nav_items': 'nav_items',
'has_contact_section': 'has_contact_section',
'has_footer': 'has_footer',
'language_detected': 'language_detected',
}
# Types to DROP (metadata, not institution claims).
# Claims with these types are discarded entirely during migration; the
# "Keep as claim metadata" comments describe where the information should
# live instead (as fields on other claims, not as standalone claim types).
DROP_TYPES = {
'extraction_timestamp',
'extraction_method',
'confidence_score',
'enrichment_method',
'name_verified',
'needs_verification',
'verification_notes',
'note',
'notes',
'claim_notes',
'source',
'xpath_match_score', # Keep as claim metadata, not claim type
'retrieved_on', # Keep as claim metadata
'html_file', # Keep as claim metadata
}
# Generic UI text to filter out (nav labels, language switchers, share links).
# Applied case-insensitively via re.search against the stripped claim value
# in ClaimMigrator.is_invalid_claim_value.
INVALID_CLAIMS_PATTERNS = [
r'^Home$',
r'^Contact$',
r'^Over ons$',
r'^Collectie$',
r'^Bezoek$',
r'^Menu$',
r'^Search$',
r'^Zoeken$',
r'^Nederlands$',
r'^English$',
r'^Skip to',
r'^Cookie',
r'share.*facebook',
r'share.*twitter',
r'intent/tweet',
r'sharer\.php',
]
# Pre-compiled once at import time so per-claim matching stays cheap.
INVALID_CLAIMS_RE = [re.compile(p, re.IGNORECASE) for p in INVALID_CLAIMS_PATTERNS]
# Nested claim patterns: (compiled regex, parent group name).
# Matched against the ORIGINAL claim type spelling (case-sensitive,
# index-bearing, e.g. "branches_0_name"); a match marks the claim for a
# later pass that converts it into a structured array under the parent.
NESTED_PATTERNS = [
(re.compile(r'^branches_(\d+)_(.+)$'), 'branches'),
(re.compile(r'^programs_(\d+)_(.+)$'), 'programs'),
(re.compile(r'^collections_(\d+)_(.+)$'), 'collections'),
(re.compile(r'^digital_platforms_(\d+)_(.+)$'), 'digital_platforms'),
(re.compile(r'^organization_details_(.+)$'), 'organization_details'),
(re.compile(r'^location_details_(.+)$'), 'location'),
(re.compile(r'^contact_(.+)$'), 'contact'),
]
# TIER 3 claims that MUST have XPath provenance.
# Canonical types in this set are flagged (claims_missing_xpath counter and
# an internal marker) by ClaimMigrator.migrate_claim when 'xpath' is absent.
TIER_3_CLAIMS = {
'full_name', 'short_name', 'description', 'email', 'phone',
'address', 'postal_code', 'city', 'organization_type',
'legal_form', 'founding_date', 'opening_hours'
}
class ClaimMigrator:
    """Migrate web-enrichment claims in entry YAML files to canonical types.

    For each entry this migrator:
      * maps ad-hoc claim types onto canonical ones (CANONICAL_MAPPINGS),
      * drops pure-metadata claim types (DROP_TYPES),
      * filters out generic UI-text values (INVALID_CLAIMS_RE),
      * tags nested claims (e.g. ``branches_0_name``) for a later
        structuring pass, and
      * flags TIER 3 claims that lack XPath provenance.

    Counters for all of the above accumulate in ``self.stats`` and are
    printed by :meth:`report`.
    """

    def __init__(self, entries_dir: Path, dry_run: bool = False):
        """
        Args:
            entries_dir: Directory holding one ``*.yaml`` file per entry.
            dry_run: When True, analyze and report only; never write files.
        """
        self.entries_dir = entries_dir
        self.dry_run = dry_run
        # Aggregate counters surfaced by report().
        self.stats = {
            'entries_processed': 0,
            'claims_migrated': 0,
            'claims_dropped': 0,
            'claims_invalid': 0,
            'claims_nested': 0,
            'claims_unmapped': 0,
            'tier3_missing_xpath': 0,
        }
        self.type_counts = Counter()     # canonical type -> number of claims
        self.unmapped_types = Counter()  # unknown original type -> occurrences

    def find_entry_files(self) -> list[Path]:
        """Return all entry YAML files in ``entries_dir``, sorted by name."""
        return sorted(self.entries_dir.glob('*.yaml'))

    def is_invalid_claim_value(self, value: str) -> bool:
        """Return True if *value* is generic UI text rather than a real claim.

        Non-string and empty values return False: "no value" is not the same
        as "UI noise", and missing values are handled by the caller.
        """
        if not value or not isinstance(value, str):
            return False
        value = value.strip()
        if len(value) < 3:
            return True  # Too short to be a meaningful claim value
        return any(pattern.search(value) for pattern in INVALID_CLAIMS_RE)

    def map_claim_type(self, claim_type: str) -> tuple[str | None, str]:
        """Map an ad-hoc claim type to its canonical type.

        Returns:
            (canonical_type, action) where action is:
            - 'map': Direct (or underscore-insensitive) mapping found
            - 'drop': Metadata type that should be dropped
            - 'nest': Nested claim; canonical_type is the parent group
            - 'unknown': Unknown type (canonical_type is None)
        """
        # Normalize type name for the drop/map lookups.
        claim_type_lower = claim_type.lower().strip()
        if claim_type_lower in DROP_TYPES:
            return None, 'drop'
        # Nested patterns match the ORIGINAL spelling (index-bearing names
        # such as branches_0_name are case-sensitive by construction).
        for pattern, parent in NESTED_PATTERNS:
            if pattern.match(claim_type):
                return parent, 'nest'
        if claim_type_lower in CANONICAL_MAPPINGS:
            return CANONICAL_MAPPINGS[claim_type_lower], 'map'
        # Fuzzy fallback: compare with all underscores removed so that
        # e.g. "fullname" still maps to 'full_name'.
        claim_type_clean = claim_type_lower.replace('_', '')
        for old_type, new_type in CANONICAL_MAPPINGS.items():
            if old_type.replace('_', '') == claim_type_clean:
                return new_type, 'map'
        return None, 'unknown'

    def migrate_claim(self, claim: dict) -> dict | None:
        """Migrate a single claim to its canonical type.

        Returns a NEW dict (the input claim is never mutated) or None when
        the claim should be dropped.  Internal marker keys (``_``-prefixed)
        are added for later passes and stripped again in migrate_entry().
        """
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')
        # Filter out generic UI text masquerading as a claim.
        if self.is_invalid_claim_value(claim_value):
            self.stats['claims_invalid'] += 1
            return None
        canonical_type, action = self.map_claim_type(claim_type)
        if action == 'drop':
            self.stats['claims_dropped'] += 1
            return None
        if action == 'nest':
            self.stats['claims_nested'] += 1
            # Keep the original type; a separate pass converts nested
            # claims into structured arrays under _nested_parent.
            return {
                **claim,
                'claim_type': claim_type,
                '_nested_parent': canonical_type,
            }
        if action == 'unknown':
            self.stats['claims_unmapped'] += 1
            self.unmapped_types[claim_type] += 1
            # Keep unknown claims but mark them for review.
            return {
                **claim,
                '_unmapped': True,
            }
        # action == 'map': successfully mapped to a canonical type.
        self.stats['claims_migrated'] += 1
        self.type_counts[canonical_type] += 1
        migrated = {
            **claim,
            'claim_type': canonical_type,
            '_original_type': claim_type if claim_type != canonical_type else None,
        }
        # TIER 3 claims must carry XPath provenance.
        # BUGFIX: mark the migrated copy instead of mutating the caller's dict.
        if canonical_type in TIER_3_CLAIMS and not claim.get('xpath'):
            self.stats['tier3_missing_xpath'] += 1
            migrated['_missing_xpath'] = True  # needs verification downstream
        return migrated

    def migrate_entry(self, entry: dict) -> dict:
        """Migrate all claims in an entry dict, in place, and return it."""
        # Claims may live under web_claims (newer) or web_enrichment (older).
        # `or {}` also guards against a section that is present but null in
        # the YAML, which would otherwise raise AttributeError on .get().
        web_claims = entry.get('web_claims') or {}
        web_enrichment = entry.get('web_enrichment') or {}
        claims = web_claims.get('claims', []) or web_enrichment.get('claims', [])
        if not claims:
            return entry
        migrated_claims = []
        for claim in claims:
            migrated = self.migrate_claim(claim)
            if migrated:
                if not self.dry_run:
                    # Strip internal markers before writing to disk, keeping
                    # the original type for audit when it actually changed.
                    migrated.pop('_unmapped', None)
                    migrated.pop('_nested_parent', None)
                    orig_type = migrated.pop('_original_type', None)
                    if orig_type:
                        migrated['original_claim_type'] = orig_type
                migrated_claims.append(migrated)
        # Write back to web_claims when it is a dict (preferred, newer
        # structure), else to web_enrichment.  BUGFIX: the original code had
        # a dead self-assignment branch here and would crash on null
        # sections; isinstance() guards against both.
        if isinstance(entry.get('web_claims'), dict):
            section = entry['web_claims']
        elif isinstance(entry.get('web_enrichment'), dict):
            section = entry['web_enrichment']
        else:
            return entry
        section['claims'] = migrated_claims
        section['claims_migrated'] = True
        section['migration_timestamp'] = datetime.now(timezone.utc).isoformat()
        return entry

    def process_entry_file(self, path: Path) -> bool:
        """Migrate one entry file.  Returns True if the entry was processed."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not entry:
                return False
            # Skip entries already migrated so reruns are idempotent.
            # (`or {}` guards sections that exist but are null.)
            web_claims = entry.get('web_claims') or {}
            web_enrichment = entry.get('web_enrichment') or {}
            if web_claims.get('claims_migrated') or web_enrichment.get('claims_migrated'):
                logger.debug(f"Skipping {path.name} - already migrated")
                return False
            migrated = self.migrate_entry(entry)
            self.stats['entries_processed'] += 1
            if not self.dry_run:
                with open(path, 'w', encoding='utf-8') as f:
                    yaml.dump(migrated, f, default_flow_style=False,
                              allow_unicode=True, sort_keys=False)
            return True
        except Exception as e:
            # Best-effort batch job: log the failure and continue with the
            # remaining files rather than aborting the whole migration.
            logger.error(f"Error processing {path}: {e}")
            return False

    def run(self, entry_filter: str | None = None):
        """Run the migration over all (optionally filtered) entry files.

        Args:
            entry_filter: When given, only files whose name contains this
                substring (e.g. an entry ID like "0001") are processed.
        """
        files = self.find_entry_files()
        if entry_filter:
            files = [f for f in files if entry_filter in f.name]
        logger.info(f"Found {len(files)} entry files")
        for path in files:
            processed = self.process_entry_file(path)
            # Progress heartbeat every 100 processed entries.
            # BUGFIX: only log when this file advanced the counter, so a
            # skipped/failed file cannot repeat the same milestone message.
            if processed and self.stats['entries_processed'] % 100 == 0:
                logger.info(f"Processed {self.stats['entries_processed']} entries...")
        self.report()

    def report(self):
        """Print a human-readable migration summary to stdout."""
        print("\n" + "=" * 60)
        print("CLAIM MIGRATION REPORT")
        print("=" * 60)
        print(f"\nEntries processed: {self.stats['entries_processed']}")
        print(f"\nClaims:")
        print(f" - Migrated to canonical: {self.stats['claims_migrated']}")
        print(f" - Dropped (metadata): {self.stats['claims_dropped']}")
        print(f" - Invalid (UI text): {self.stats['claims_invalid']}")
        print(f" - Nested (to convert): {self.stats['claims_nested']}")
        print(f" - Unmapped (unknown): {self.stats['claims_unmapped']}")
        print(f"\nTIER 3 claims missing XPath: {self.stats['tier3_missing_xpath']}")
        if self.type_counts:
            print("\nCanonical type distribution:")
            for claim_type, count in self.type_counts.most_common(20):
                print(f" {claim_type}: {count}")
        if self.unmapped_types:
            print(f"\nUnmapped types ({len(self.unmapped_types)} unique):")
            for claim_type, count in self.unmapped_types.most_common(30):
                print(f" {claim_type}: {count}")
def main():
    """Command-line entry point: parse arguments and run the migration."""
    arg_parser = argparse.ArgumentParser(description='Migrate web claims to canonical types')
    arg_parser.add_argument(
        '--entries-dir',
        type=Path,
        default=Path('data/nde/enriched/entries'),
        help='Path to entries directory',
    )
    arg_parser.add_argument(
        '--entry',
        type=str,
        help='Filter to specific entry ID (e.g., 0001)',
    )
    arg_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Analyze without writing changes',
    )
    options = arg_parser.parse_args()

    # Refuse to start against a missing directory.
    if not options.entries_dir.exists():
        logger.error(f"Entries directory not found: {options.entries_dir}")
        sys.exit(1)

    logger.info(f"Starting {'DRY RUN' if options.dry_run else 'MIGRATION'}...")
    ClaimMigrator(options.entries_dir, dry_run=options.dry_run).run(entry_filter=options.entry)


if __name__ == '__main__':
    main()