#!/usr/bin/env python3
"""
Migrate web claims from 1,456 ad-hoc types to canonical types.

This script:
1. Reads existing web_enrichment claims from NDE entry YAML files
2. Maps claim types to canonical types (from CanonicalClaimTypes.yaml)
3. Drops metadata fields that aren't institution claims
4. Converts nested claims (branches_0_name) to structured arrays
5. Validates that TIER 3 claims have XPath provenance
6. Writes updated entries with standardized claims

Usage:
    # Analyze without writing changes
    python scripts/migrate_claims_to_canonical.py --dry-run

    # Migrate all entries
    python scripts/migrate_claims_to_canonical.py

    # Migrate single entry
    python scripts/migrate_claims_to_canonical.py --entry 0001
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import re
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# Configure root logging once at import time: timestamped INFO-level
# messages, shared by every logger in this script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ========================================
# CANONICAL TYPE MAPPINGS
# ========================================

# Direct mappings from old ad-hoc claim types to canonical types.
# Keys are compared lowercased (see ClaimMigrator.map_claim_type), and a
# second fuzzy pass matches with underscores stripped.
CANONICAL_MAPPINGS = {
    # Identity claims -> full_name or short_name
    'org_name': 'full_name',
    'custodian_name': 'full_name',
    'full_name': 'full_name',
    'organization_name': 'full_name',
    'museum_name': 'full_name',
    'operating_name': 'full_name',
    'legal_name': 'full_name',
    'name': 'full_name',
    'brand_name': 'short_name',
    'short_name': 'short_name',
    'abbreviation': 'short_name',

    # Description claims (mission/slogan/tagline/etc. all collapse into one)
    'description': 'description',
    'description_short': 'description',
    'mission': 'description',
    'slogan': 'description',
    'tagline': 'description',
    'about': 'description',
    'history': 'description',

    # Contact claims
    'email': 'email',
    'phone': 'phone',
    'telephone': 'phone',
    'address': 'address',
    'postal_code': 'postal_code',
    'city': 'city',
    'street_address': 'address',

    # Social media normalization (bare name, social_*, social_media_* variants)
    'facebook': 'social_facebook',
    'social_facebook': 'social_facebook',
    'social_media_facebook': 'social_facebook',
    'instagram': 'social_instagram',
    'social_instagram': 'social_instagram',
    'social_media_instagram': 'social_instagram',
    'twitter': 'social_twitter',
    'social_twitter': 'social_twitter',
    'social_media_twitter': 'social_twitter',
    'x': 'social_twitter',
    'linkedin': 'social_linkedin',
    'social_linkedin': 'social_linkedin',
    'social_media_linkedin': 'social_linkedin',
    'youtube': 'social_youtube',
    'social_youtube': 'social_youtube',
    'social_media_youtube': 'social_youtube',

    # Website
    'website': 'website',
    'social_media_website': 'website',
    'source_urls': 'website',
    'url': 'website',

    # Identifiers (Dutch registry / library identifiers)
    'isil_code': 'isil_code',
    'isil': 'isil_code',
    'kvk_number': 'kvk_number',
    'kvk': 'kvk_number',
    'wikidata_id': 'wikidata_id',
    'wikidata': 'wikidata_id',
    'rsin': 'kvk_number',

    # Organization metadata
    'organization_type': 'organization_type',
    'institution_type': 'organization_type',
    'type': 'organization_type',
    'legal_form': 'legal_form',
    'rechtsvorm': 'legal_form',
    'founding_date': 'founding_date',
    'founded': 'founding_date',
    'founding_year': 'founding_date',
    'established': 'founding_date',
    'opening_hours': 'opening_hours',
    'hours': 'opening_hours',

    # Structural (TIER 1) - from docling; already canonical, map to themselves
    'page_title': 'page_title',
    'page_count': 'page_count',
    'image_count': 'image_count',
    'table_count': 'table_count',
    'markdown_length': 'markdown_length',

    # Pattern-based (TIER 2); already canonical, map to themselves
    'main_h1': 'main_h1',
    'nav_items': 'nav_items',
    'has_contact_section': 'has_contact_section',
    'has_footer': 'has_footer',
    'language_detected': 'language_detected',
}
|
|
|
|
# Claim types to DROP entirely: these describe the extraction process or
# reviewer workflow, not the institution itself.  Claims with these types
# are discarded by ClaimMigrator.migrate_claim.
DROP_TYPES = {
    'extraction_timestamp',
    'extraction_method',
    'confidence_score',
    'enrichment_method',
    'name_verified',
    'needs_verification',
    'verification_notes',
    'note',
    'notes',
    'claim_notes',
    'source',
    'xpath_match_score',  # belongs on a claim as metadata, never as its own claim
    'retrieved_on',  # belongs on a claim as metadata, never as its own claim
    'html_file',  # belongs on a claim as metadata, never as its own claim
}
|
|
|
|
# Generic UI text to filter out: navigation labels (Dutch and English),
# cookie banners, and social-share link fragments that were scraped as if
# they were institution data.  Matched case-insensitively via search(),
# so unanchored patterns match anywhere in the value.
INVALID_CLAIMS_PATTERNS = [
    r'^Home$',
    r'^Contact$',
    r'^Over ons$',
    r'^Collectie$',
    r'^Bezoek$',
    r'^Menu$',
    r'^Search$',
    r'^Zoeken$',
    r'^Nederlands$',
    r'^English$',
    r'^Skip to',
    r'^Cookie',
    r'share.*facebook',
    r'share.*twitter',
    r'intent/tweet',
    r'sharer\.php',
]
# Pre-compiled once at import time; used on every claim value.
INVALID_CLAIMS_RE = [re.compile(p, re.IGNORECASE) for p in INVALID_CLAIMS_PATTERNS]
|
|
|
|
# Nested claim patterns: (compiled regex, parent collection name).
# Flattened claim types such as "branches_0_name" are detected here and
# tagged with their parent so a later pass can rebuild structured arrays.
NESTED_PATTERNS = [
    (re.compile(r'^branches_(\d+)_(.+)$'), 'branches'),
    (re.compile(r'^programs_(\d+)_(.+)$'), 'programs'),
    (re.compile(r'^collections_(\d+)_(.+)$'), 'collections'),
    (re.compile(r'^digital_platforms_(\d+)_(.+)$'), 'digital_platforms'),
    (re.compile(r'^organization_details_(.+)$'), 'organization_details'),
    (re.compile(r'^location_details_(.+)$'), 'location'),
    (re.compile(r'^contact_(.+)$'), 'contact'),
]
|
|
|
|
# TIER 3 (content-extracted) canonical claim types that MUST carry XPath
# provenance.  Claims of these types lacking an 'xpath' field are counted
# and flagged for verification during migration.
TIER_3_CLAIMS = {
    'full_name', 'short_name', 'description', 'email', 'phone',
    'address', 'postal_code', 'city', 'organization_type',
    'legal_form', 'founding_date', 'opening_hours'
}
|
|
|
|
|
|
class ClaimMigrator:
    """Migrate web-enrichment claims to canonical claim types.

    Walks per-entry YAML files, rewrites each claim's ``claim_type`` to
    its canonical equivalent (``CANONICAL_MAPPINGS``), drops pure-metadata
    types (``DROP_TYPES``) and generic UI-text values
    (``INVALID_CLAIMS_RE``), and tags nested or unknown types for later
    passes.  Accumulates statistics for the final report.
    """

    def __init__(self, entries_dir: Path, dry_run: bool = False):
        """
        Args:
            entries_dir: Directory containing per-entry ``*.yaml`` files.
            dry_run: When True, analyze and report but never write files.
        """
        self.entries_dir = entries_dir
        self.dry_run = dry_run
        # Counters for the migration report; keys are printed verbatim
        # by report().
        self.stats = {
            'entries_processed': 0,
            'claims_migrated': 0,
            'claims_dropped': 0,
            'claims_invalid': 0,
            'claims_nested': 0,
            'claims_unmapped': 0,
            'tier3_missing_xpath': 0,
        }
        self.type_counts = Counter()     # canonical type -> occurrences
        self.unmapped_types = Counter()  # unknown original type -> occurrences

    def find_entry_files(self) -> list[Path]:
        """Return all entry YAML files, sorted for deterministic order."""
        return sorted(self.entries_dir.glob('*.yaml'))

    def is_invalid_claim_value(self, value: str) -> bool:
        """Return True if *value* is generic UI text or too short.

        Empty or non-string values return False: they are not filtered
        here (type mapping decides their fate instead).
        """
        if not value or not isinstance(value, str):
            return False
        value = value.strip()
        if len(value) < 3:
            return True  # Too short to be meaningful
        return any(pattern.search(value) for pattern in INVALID_CLAIMS_RE)

    def map_claim_type(self, claim_type: str) -> tuple[str | None, str]:
        """
        Map a claim type to its canonical type.

        Returns:
            (canonical_type, action) where action is:
            - 'map': Direct mapping; canonical_type is the new type
            - 'drop': Should be dropped; canonical_type is None
            - 'nest': Should be converted to a nested structure;
              canonical_type is the parent collection name
            - 'unknown': Unknown type; canonical_type is None
        """
        # Normalize type name for case-insensitive lookup.
        claim_type_lower = claim_type.lower().strip()

        # Pure-metadata types are dropped outright.
        if claim_type_lower in DROP_TYPES:
            return None, 'drop'

        # Flattened nested claims (e.g. branches_0_name) are matched on
        # the ORIGINAL spelling, since the patterns encode exact casing.
        for pattern, parent in NESTED_PATTERNS:
            if pattern.match(claim_type):
                return parent, 'nest'

        # Exact (lowercased) mapping.
        if claim_type_lower in CANONICAL_MAPPINGS:
            return CANONICAL_MAPPINGS[claim_type_lower], 'map'

        # Fuzzy fallback: compare with underscores removed, so e.g.
        # "fullname" still maps to full_name.
        claim_type_clean = claim_type_lower.replace('_', '')
        for old_type, new_type in CANONICAL_MAPPINGS.items():
            if old_type.replace('_', '') == claim_type_clean:
                return new_type, 'map'

        return None, 'unknown'

    def migrate_claim(self, claim: dict) -> dict | None:
        """
        Migrate a single claim to its canonical type.

        Returns the migrated claim (a new dict; the input is never
        mutated), or None when the claim should be dropped because its
        type is metadata-only or its value is generic UI text.
        """
        claim_type = claim.get('claim_type', '')
        claim_value = claim.get('claim_value', '')

        # Filter out generic UI text ("Home", share links, ...).
        if self.is_invalid_claim_value(claim_value):
            self.stats['claims_invalid'] += 1
            return None

        canonical_type, action = self.map_claim_type(claim_type)

        if action == 'drop':
            self.stats['claims_dropped'] += 1
            return None

        if action == 'nest':
            self.stats['claims_nested'] += 1
            # Keep nested claims under their original type but mark them;
            # a separate pass converts them to structured arrays.
            return {**claim, '_nested_parent': canonical_type}

        if action == 'unknown':
            self.stats['claims_unmapped'] += 1
            self.unmapped_types[claim_type] += 1
            # Keep unknown claims (marked) so nothing is silently lost.
            return {**claim, '_unmapped': True}

        # action == 'map': successfully mapped to a canonical type.
        self.stats['claims_migrated'] += 1
        self.type_counts[canonical_type] += 1

        migrated = {**claim, 'claim_type': canonical_type}
        if claim_type != canonical_type:
            migrated['_original_type'] = claim_type

        # TIER 3 claims require XPath provenance; mark the COPY (not the
        # caller's dict) when it is missing so the entry can be reviewed.
        if canonical_type in TIER_3_CLAIMS and not claim.get('xpath'):
            self.stats['tier3_missing_xpath'] += 1
            migrated['_missing_xpath'] = True

        return migrated

    def migrate_entry(self, entry: dict) -> dict:
        """Migrate all claims in an entry, in place, and return it.

        Claims are read from ``web_claims`` (newer structure) when
        present, otherwise from ``web_enrichment``; the migrated claims
        are written back to the SAME section they came from, so the other
        section is never left with stale, unmigrated claims.
        """
        web_claims = entry.get('web_claims', {})
        web_enrichment = entry.get('web_enrichment', {})

        # Prefer web_claims, fall back to web_enrichment; remember which
        # section actually supplied the claims.
        source_key = 'web_claims'
        claims = web_claims.get('claims', [])
        if not claims:
            source_key = 'web_enrichment'
            claims = web_enrichment.get('claims', [])

        if not claims:
            return entry

        migrated_claims = []
        for claim in claims:
            migrated = self.migrate_claim(claim)
            if not migrated:
                continue
            if not self.dry_run:
                # Strip ALL internal underscore markers before writing;
                # preserve an audit trail under stable public keys.
                migrated.pop('_unmapped', None)
                migrated.pop('_nested_parent', None)
                if migrated.pop('_missing_xpath', None):
                    migrated['needs_xpath_verification'] = True
                orig_type = migrated.pop('_original_type', None)
                if orig_type:
                    migrated['original_claim_type'] = orig_type
            migrated_claims.append(migrated)

        # Write back to the section the claims came from, marking it so
        # a re-run of the script skips this entry.
        section = entry.setdefault(source_key, {})
        section['claims'] = migrated_claims
        section['claims_migrated'] = True
        section['migration_timestamp'] = datetime.now(timezone.utc).isoformat()

        return entry

    def process_entry_file(self, path: Path) -> bool:
        """Process a single entry file; return True if it was migrated.

        Returns False for empty files, already-migrated entries, and on
        any error (logged, never raised — this is a batch boundary).
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                return False

            # Skip entries a previous run already migrated.
            web_claims = entry.get('web_claims', {})
            web_enrichment = entry.get('web_enrichment', {})
            if web_claims.get('claims_migrated') or web_enrichment.get('claims_migrated'):
                logger.debug(f"Skipping {path.name} - already migrated")
                return False

            migrated = self.migrate_entry(entry)
            self.stats['entries_processed'] += 1

            # Persist only outside dry-run mode.
            if not self.dry_run:
                with open(path, 'w', encoding='utf-8') as f:
                    yaml.dump(migrated, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

            return True

        except Exception as e:
            logger.error(f"Error processing {path}: {e}")
            return False

    def run(self, entry_filter: str | None = None):
        """Run the migration over all entries and print the report.

        Args:
            entry_filter: When given, only files whose name contains this
                substring (e.g. an entry ID like '0001') are processed.
        """
        files = self.find_entry_files()

        if entry_filter:
            files = [f for f in files if entry_filter in f.name]

        logger.info(f"Found {len(files)} entry files")

        for path in files:
            self.process_entry_file(path)
            # Progress heartbeat every 100 migrated entries.
            if self.stats['entries_processed'] % 100 == 0 and self.stats['entries_processed'] > 0:
                logger.info(f"Processed {self.stats['entries_processed']} entries...")

        self.report()

    def report(self):
        """Print the migration report to stdout."""
        print("\n" + "=" * 60)
        print("CLAIM MIGRATION REPORT")
        print("=" * 60)

        print(f"\nEntries processed: {self.stats['entries_processed']}")
        print("\nClaims:")
        print(f"  - Migrated to canonical: {self.stats['claims_migrated']}")
        print(f"  - Dropped (metadata): {self.stats['claims_dropped']}")
        print(f"  - Invalid (UI text): {self.stats['claims_invalid']}")
        print(f"  - Nested (to convert): {self.stats['claims_nested']}")
        print(f"  - Unmapped (unknown): {self.stats['claims_unmapped']}")

        print(f"\nTIER 3 claims missing XPath: {self.stats['tier3_missing_xpath']}")

        if self.type_counts:
            print("\nCanonical type distribution:")
            for claim_type, count in self.type_counts.most_common(20):
                print(f"  {claim_type}: {count}")

        if self.unmapped_types:
            print(f"\nUnmapped types ({len(self.unmapped_types)} unique):")
            for claim_type, count in self.unmapped_types.most_common(30):
                print(f"  {claim_type}: {count}")
|
|
def main():
    """CLI entry point: parse arguments, validate paths, run migration."""
    cli = argparse.ArgumentParser(description='Migrate web claims to canonical types')
    cli.add_argument('--entries-dir', type=Path,
                     default=Path('data/nde/enriched/entries'),
                     help='Path to entries directory')
    cli.add_argument('--entry', type=str,
                     help='Filter to specific entry ID (e.g., 0001)')
    cli.add_argument('--dry-run', action='store_true',
                     help='Analyze without writing changes')
    options = cli.parse_args()

    # Bail out early when the target directory is missing.
    if not options.entries_dir.exists():
        logger.error(f"Entries directory not found: {options.entries_dir}")
        sys.exit(1)

    if options.dry_run:
        label = "DRY RUN"
    else:
        label = "MIGRATION"
    logger.info(f"Starting {label}...")

    worker = ClaimMigrator(options.entries_dir, dry_run=options.dry_run)
    worker.run(entry_filter=options.entry)
|
|
|
|
|
|
# Run the migration only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|