# glam/scripts/generate_entity_stats.py
# Last modified: 2025-12-14 17:09:55 +01:00 (230 lines, 8.8 KiB, Python)
#!/usr/bin/env python3
"""Generate comprehensive entity extraction statistics report."""
import os
import re
import glob
from collections import Counter, defaultdict
from datetime import datetime
# Entity type descriptions from CH-Annotator convention
# Entity type descriptions from CH-Annotator convention.
# Maps a "CATEGORY.SUBTYPE" code (e.g. "GRP.HER") to the human-readable
# label used in the report tables; unknown codes fall back to the raw code.
ENTITY_TYPE_LABELS = {
    # Appellations
    "APP.URL": "URL/Website",
    "APP.TIT": "Title (document)",
    "APP.TTL": "Title (general)",
    "APP.EXH": "Exhibition name",
    "APP.COL": "Collection name",
    # Groups
    "GRP.HER": "Heritage institution",
    "GRP.ASS": "Association/Society",
    "GRP.GOV": "Government body",
    "GRP.COR": "Corporation",
    "GRP.EDU": "Educational institution",
    # Agents
    "AGT.PER": "Person",
    # Places
    "TOP.SET": "Settlement/City",
    "TOP.ADR": "Address",
    "TOP.REG": "Region/Province",
    "TOP.CTY": "Country",
    "TOP.BLD": "Building",
    # Temporal
    "TMP.DAB": "Date (absolute)",
    "TMP.OPH": "Opening hours",
    "TMP.DRL": "Date range/Duration",
    # Works
    "WRK.WEB": "Website/Web resource",
    "WRK.COL": "Collection (as work)",
    "WRK.TXT": "Text/Document",
    # Things
    "THG.LNG": "Language",
    "THG.CON": "Contact info",
    "THG.ART": "Artifact/Object",
    "THG.EVT": "Event",
    "THG.PHO": "Photograph",
    # Quantities
    "QTY.CNT": "Count/Number",
    "QTY.MSR": "Measurement",
    "QTY.CUR": "Currency/Price",
}
def extract_entities_fast(filepath):
    """Extract entity info from a custodian YAML file using regex.

    Much faster than full YAML parsing for this bulk-scan use case; the
    patterns rely on the layout emitted by the annotation pipeline.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        Tuple of (avg_conf, entities):
        - avg_conf: mean of the section's ``final_confidence`` scores,
          0 when the section has no scores, or None when the file has no
          ``validated_entity_claims`` section or could not be read.
        - entities: list of {'name': ..., 'type': ...} dicts (may be empty).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable or undecodable file: report "no claims" instead of
        # aborting the whole scan.  (Was a blanket `except Exception`,
        # which also hid genuine coding errors in the parsing below.)
        return None, []
    # Cheap substring test before running the heavier regexes.
    if 'validated_entity_claims:' not in content:
        return None, []
    # Grab everything after 'claims:' up to the next top-level key (or EOF).
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return None, []
    claims_section = claims_match.group(1)
    # Each claim looks like "- entity: NAME\n  entity_type: TYPE".
    entities = [
        {'name': name.strip(), 'type': etype.strip()}
        for name, etype in re.findall(
            r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)
    ]
    # Average the final_confidence scores found in the claims section.
    confidences = [float(c) for c in
                   re.findall(r'final_confidence: ([\d.]+)', claims_section)]
    avg_conf = sum(confidences) / len(confidences) if confidences else 0
    return avg_conf, entities
def main():
    """Scan custodian YAML files and write an entity-statistics report.

    Reads every ``data/custodian/NL-*.yaml`` file, aggregates entity counts
    by type and by category prefix, detects entities shared across files,
    and writes a markdown report to ``reports/entity_extraction_stats.md``.
    """
    print("Scanning custodian files...")
    files = glob.glob('data/custodian/NL-*.yaml')
    if not files:
        # Guard: the summary percentages below divide by len(files);
        # without this an empty match raised ZeroDivisionError.
        print("No custodian files found matching data/custodian/NL-*.yaml")
        return

    type_counts = Counter()            # entity_type -> occurrence count
    entity_names = defaultdict(list)   # entity_name -> list of (file, type)
    files_with_claims = 0
    total_entities = 0
    confidence_sum = 0
    confidence_count = 0

    for i, filepath in enumerate(files):
        if i % 200 == 0:
            # Progress indicator for long scans.
            print(f" Processing {i}/{len(files)}...")
        avg_conf, entities = extract_entities_fast(filepath)
        if not entities:
            continue
        files_with_claims += 1
        total_entities += len(entities)
        # A falsy avg_conf (None, or 0 when the file had claims but no
        # confidence scores) is deliberately excluded from the average.
        if avg_conf:
            confidence_sum += avg_conf
            confidence_count += 1
        basename = os.path.basename(filepath)
        for ent in entities:
            type_counts[ent['type']] += 1
            entity_names[ent['name']].append((basename, ent['type']))

    # Entities appearing in more than one file (entity-linking candidates).
    duplicates = {name: locs for name, locs in entity_names.items() if len(locs) > 1}

    # --- Build the markdown report ------------------------------------
    report = []
    report.append("# Entity Extraction Statistics Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\n## Summary")
    report.append("\n| Metric | Value |")
    report.append("|--------|-------|")
    report.append(f"| Total custodian files scanned | {len(files):,} |")
    report.append(f"| Files with validated_entity_claims | {files_with_claims:,} ({100*files_with_claims/len(files):.1f}%) |")
    report.append(f"| Total entities extracted | {total_entities:,} |")
    report.append(f"| Unique entity names | {len(entity_names):,} |")
    report.append(f"| Entities appearing in multiple files | {len(duplicates):,} |")
    if confidence_count:
        report.append(f"| Average confidence score | {confidence_sum/confidence_count:.3f} |")
    report.append(f"| Unique entity types | {len(type_counts)} |")

    report.append("\n## Entity Types Breakdown")
    report.append("\n| Type | Label | Count | % |")
    report.append("|------|-------|-------|---|")
    for etype, count in type_counts.most_common():
        label = ENTITY_TYPE_LABELS.get(etype, etype)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{etype}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Entity Type Categories")
    # Aggregate by the category prefix (the part before the first '.').
    prefix_counts = Counter()
    for etype, count in type_counts.items():
        prefix = etype.split('.')[0] if '.' in etype else etype
        prefix_counts[prefix] += count
    prefix_labels = {
        'APP': 'Appellations (names, titles)',
        'GRP': 'Groups (organizations)',
        'AGT': 'Agents (people)',
        'TOP': 'Toponyms (places)',
        'TMP': 'Temporal (dates, times)',
        'WRK': 'Works (documents, collections)',
        'THG': 'Things (objects, concepts)',
        'QTY': 'Quantities (numbers, measurements)',
    }
    report.append("\n| Category | Label | Count | % |")
    report.append("|----------|-------|-------|---|")
    for prefix, count in prefix_counts.most_common():
        label = prefix_labels.get(prefix, prefix)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{prefix}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Top 50 Most Common Entities")
    report.append("\nEntities appearing across multiple custodian files:")
    report.append("\n| Entity | Type | Occurrences |")
    report.append("|--------|------|-------------|")
    sorted_by_freq = sorted(entity_names.items(), key=lambda x: len(x[1]), reverse=True)[:50]
    for name, locs in sorted_by_freq:
        if len(locs) >= 2:
            type_str = ', '.join(sorted(set(t for _, t in locs)))
            report.append(f"| {name[:60]} | `{type_str}` | {len(locs)} |")

    report.append("\n## Duplicate Entity Analysis")
    report.append("\nEntities that appear in 5+ different custodian files (candidates for entity linking):")
    report.append("\n| Entity | Files | Primary Type |")
    report.append("|--------|-------|--------------|")
    high_freq_dups = [(name, locs) for name, locs in duplicates.items() if len(locs) >= 5]
    high_freq_dups.sort(key=lambda x: len(x[1]), reverse=True)
    for name, locs in high_freq_dups[:100]:
        # The most frequently assigned type is reported as "primary".
        primary_type = Counter(t for _, t in locs).most_common(1)[0][0]
        report.append(f"| {name[:60]} | {len(locs)} | `{primary_type}` |")

    # Heritage institutions specifically.
    report.append("\n## Heritage Institution Entities (GRP.HER)")
    report.append("\nThese are extracted mentions of heritage institutions:")
    her_entities = [(name, locs) for name, locs in entity_names.items()
                    if any(t == 'GRP.HER' for _, t in locs)]
    her_entities.sort(key=lambda x: len(x[1]), reverse=True)
    report.append("\n| Institution Name | Mentions | Files |")
    report.append("|------------------|----------|-------|")
    for name, locs in her_entities[:50]:
        her_locs = [(f, t) for f, t in locs if t == 'GRP.HER']
        # Show up to three source files, then a "+N more" suffix.
        files_list = ', '.join(set(f.replace('.yaml', '') for f, _ in her_locs[:3]))
        if len(her_locs) > 3:
            files_list += f" (+{len(her_locs)-3} more)"
        report.append(f"| {name[:50]} | {len(her_locs)} | {files_list[:60]} |")

    # --- Write the report and print a console summary -----------------
    report_path = 'reports/entity_extraction_stats.md'
    os.makedirs('reports', exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")
    print("\nQuick Summary:")
    print(f" - {files_with_claims:,} files with validated entities")
    print(f" - {total_entities:,} total entities")
    print(f" - {len(entity_names):,} unique entity names")
    print(f" - {len(duplicates):,} entities appear in multiple files")
    print(f" - {len(high_freq_dups)} entities appear in 5+ files (linking candidates)")
# Allow importing this module (e.g. for testing) without running the scan.
if __name__ == '__main__':
    main()