260 lines
10 KiB
Python
260 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""Analyze duplicate entities across custodian files for deduplication."""
|
|
|
|
import os
|
|
import re
|
|
import glob
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
import json
|
|
|
|
def extract_entities_fast(filepath):
    """Extract entity claims from a custodian YAML file using regex.

    Deliberately avoids a full YAML parse for speed: it locates the
    ``validated_entity_claims`` block and pulls out each claim's
    ``entity`` / ``entity_type`` pair with regular expressions.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        List of dicts with ``'name'`` and ``'type'`` keys; an empty list
        when the file is unreadable, badly encoded, or contains no
        entity claims.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Only the file read can realistically fail; catch just those
        # errors (the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit) so one bad file cannot hide
        # genuine bugs while still not aborting the whole scan.
        return []

    # Cheap substring pre-check before running the heavier regexes.
    if 'validated_entity_claims:' not in content:
        return []

    # Capture everything between the claims key and the next top-level
    # key (or end of file).
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return []

    claims_section = claims_match.group(1)
    # Each claim looks like "- entity: <name>\n  entity_type: <code>".
    entity_blocks = re.findall(
        r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)

    return [
        {'name': entity_name.strip(), 'type': entity_type.strip()}
        for entity_name, entity_type in entity_blocks
    ]
|
|
|
|
def normalize_entity(name):
    """Return a canonical form of *name* for duplicate comparison.

    Lowercases, trims surrounding whitespace and quote characters, and
    collapses internal runs of whitespace to single spaces.
    """
    cleaned = name.strip().lower().strip("'\"")
    return ' '.join(cleaned.split())
|
|
|
|
def main():
    """Analyze entity duplicates across custodian YAML files.

    Scans ``data/custodian/NL-*.yaml``, groups entity claims by
    normalized name, classifies data-quality issues (language codes,
    generic UI labels, numeric-only names, type mismatches), and
    writes two outputs:

      * ``reports/entity_duplicate_analysis.md`` -- human-readable report
      * ``reports/entity_linking_candidates.json`` -- top candidates for
        Wikidata/VIAF linking, for automated follow-up
    """
    print("Analyzing entity duplicates...")

    files = glob.glob('data/custodian/NL-*.yaml')

    # Track entities: normalized_name -> [{'original', 'type', 'file'}, ...]
    entity_occurrences = defaultdict(list)

    for i, filepath in enumerate(files):
        # Progress heartbeat for long scans.
        if i % 200 == 0:
            print(f" Processing {i}/{len(files)}...")

        basename = os.path.basename(filepath)
        entities = extract_entities_fast(filepath)

        for ent in entities:
            norm_name = normalize_entity(ent['name'])
            entity_occurrences[norm_name].append({
                'original': ent['name'],
                'type': ent['type'],
                'file': basename
            })

    print(f"\nTotal unique normalized entities: {len(entity_occurrences):,}")

    # Identify quality issues
    issues = {
        'language_codes': [],  # nl-NL, nl_NL, etc.
        'generic_labels': [],  # Home, Menu, etc.
        'numeric_only': [],  # '2025', '1200', etc.
        'single_char': [],  # Single characters
        'type_mismatches': [],  # Same entity with different types
        'variant_spellings': [],  # Same entity with slight variations (not populated yet)
    }

    # Language code patterns -- these come from HTML `lang` attributes,
    # not from real heritage entities.
    lang_patterns = [
        r'^[a-z]{2}[-_][A-Z]{2}$',  # nl-NL, en-US
        r'^[a-z]{2}$',  # nl, en
        r'^[a-z]{2}_[a-z]{2}$',  # nl_nl
    ]

    # Generic navigation/UI labels (Dutch and English).
    generic_labels = {'home', 'menu', 'contact', 'over', 'about', 'search', 'zoeken',
                      'nieuws', 'news', 'agenda', 'events', 'login', 'logout',
                      'inloggen', 'uitloggen', 'cookie', 'cookies', 'privacy',
                      'collectie', 'collection', 'archief', 'archive'}

    for norm_name, occurrences in entity_occurrences.items():
        # Use the first-seen original spelling as the representative.
        orig = occurrences[0]['original']

        # Check for language codes
        for pat in lang_patterns:
            if re.match(pat, orig, re.IGNORECASE):
                issues['language_codes'].append((orig, len(occurrences)))
                break

        # Check for generic labels
        if norm_name in generic_labels:
            issues['generic_labels'].append((orig, len(occurrences)))

        # Check for numeric-only (years, image dimensions, ...)
        if re.match(r"^'?\d+'?$", orig):
            issues['numeric_only'].append((orig, len(occurrences)))

        # Check for single/double-character names
        if len(norm_name) <= 2:
            issues['single_char'].append((orig, len(occurrences)))

        # Check for type mismatches (same name classified with several types)
        types = set(o['type'] for o in occurrences)
        if len(types) > 1:
            issues['type_mismatches'].append((orig, types, len(occurrences)))

    # Generate report
    report = []
    report.append("# Entity Duplicate Analysis Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    report.append(f"\n## Data Quality Issues Summary")
    report.append(f"\n| Issue Type | Count | Total Occurrences |")
    report.append(f"|------------|-------|-------------------|")

    total_issues = 0
    total_occurrences = 0

    for issue_type, items in issues.items():
        count = len(items)
        # type_mismatches rows carry (name, types, count); the rest (name, count).
        if issue_type == 'type_mismatches':
            occ = sum(c for _, _, c in items)
        else:
            occ = sum(c for _, c in items)
        total_issues += count
        total_occurrences += occ
        report.append(f"| {issue_type.replace('_', ' ').title()} | {count} | {occ} |")

    report.append(f"| **TOTAL** | **{total_issues}** | **{total_occurrences}** |")

    report.append(f"\n## Language Code Entities (Should Be Filtered)")
    report.append(f"\nThese are HTML `lang` attribute values, not real entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['language_codes'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Generic Navigation Labels (Low Value)")
    report.append(f"\nThese are website navigation labels, not heritage entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['generic_labels'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Numeric-Only Entities (Often Dimensions)")
    report.append(f"\nThese are often image dimensions or years without context:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['numeric_only'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Review context |")

    report.append(f"\n## Type Mismatches (Need Resolution)")
    report.append(f"\nSame entity classified with different types:")
    report.append(f"\n| Entity | Types | Occurrences | Action |")
    report.append(f"|--------|-------|-------------|--------|")
    for entity, types, count in sorted(issues['type_mismatches'], key=lambda x: -x[2])[:50]:
        types_str = ', '.join(sorted(types))
        report.append(f"| {entity[:40]} | `{types_str}` | {count} | Resolve type |")

    # Calculate cleanup impact
    report.append(f"\n## Cleanup Impact Analysis")

    cleanup_candidates = set()
    for entity, _ in issues['language_codes']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['generic_labels']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['numeric_only']:
        cleanup_candidates.add(normalize_entity(entity))

    cleanup_occurrences = sum(
        len(entity_occurrences[norm])
        for norm in cleanup_candidates
        if norm in entity_occurrences
    )

    total_entities = sum(len(v) for v in entity_occurrences.values())
    # Guard against ZeroDivisionError when no files/entities were found
    # (e.g. run from the wrong working directory).
    cleanup_pct = 100 * cleanup_occurrences / total_entities if total_entities else 0.0

    report.append(f"\n| Metric | Value |")
    report.append(f"|--------|-------|")
    report.append(f"| Total entity occurrences | {total_entities:,} |")
    report.append(f"| Candidate cleanup occurrences | {cleanup_occurrences:,} |")
    report.append(f"| Cleanup percentage | {cleanup_pct:.1f}% |")
    report.append(f"| Remaining after cleanup | {total_entities - cleanup_occurrences:,} |")

    # High-value entities for linking
    report.append(f"\n## High-Value Entities for Linking")
    report.append(f"\nEntities that appear frequently and are good candidates for Wikidata/VIAF linking:")

    # Filter to heritage-relevant types
    heritage_types = {'GRP.HER', 'GRP.ASS', 'GRP.GOV', 'GRP.COR', 'GRP.EDU',
                      'AGT.PER', 'TOP.SET', 'TOP.REG', 'TOP.CTY', 'TOP.BLD'}

    linking_candidates = []
    for norm_name, occurrences in entity_occurrences.items():
        # Skip cleanup candidates
        if norm_name in cleanup_candidates:
            continue

        # Skip if too short
        if len(norm_name) < 3:
            continue

        # Check if has heritage-relevant types
        types = set(o['type'] for o in occurrences)
        if types & heritage_types:
            # Most common type wins when occurrences disagree.
            primary_type = Counter(o['type'] for o in occurrences).most_common(1)[0][0]
            linking_candidates.append({
                'name': occurrences[0]['original'],
                'norm': norm_name,
                'type': primary_type,
                'count': len(occurrences)
            })

    linking_candidates.sort(key=lambda x: -x['count'])

    report.append(f"\n| Entity | Type | Occurrences | Wikidata? |")
    report.append(f"|--------|------|-------------|-----------|")
    for cand in linking_candidates[:75]:
        report.append(f"| {cand['name'][:40]} | `{cand['type']}` | {cand['count']} | 🔍 Search |")

    # Write report; create the output directory first so a fresh checkout
    # does not crash with FileNotFoundError.
    os.makedirs('reports', exist_ok=True)
    report_path = 'reports/entity_duplicate_analysis.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")

    # Also save linking candidates as JSON for automated processing
    json_path = 'reports/entity_linking_candidates.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(linking_candidates[:200], f, indent=2, ensure_ascii=False)

    print(f"Linking candidates JSON: {json_path}")

    print(f"\nSummary:")
    print(f" - {len(issues['language_codes'])} language code entities to filter")
    print(f" - {len(issues['generic_labels'])} generic labels to filter")
    print(f" - {len(issues['numeric_only'])} numeric-only entities to review")
    print(f" - {len(issues['type_mismatches'])} type mismatches to resolve")
    print(f" - {len(linking_candidates)} high-value linking candidates identified")
    print(f" - Potential cleanup: {cleanup_occurrences:,} occurrences ({cleanup_pct:.1f}%)")
|
|
|
|
# Run the analysis only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|