#!/usr/bin/env python3
"""Generate comprehensive entity extraction statistics report.

Scans custodian YAML files (``data/custodian/NL-*.yaml``) for
``validated_entity_claims`` sections using fast regex extraction,
tallies entity types, cross-file duplicates, and confidence scores,
and writes a Markdown report to ``reports/entity_extraction_stats.md``.
"""
import os
import re
import glob
from collections import Counter, defaultdict
from datetime import datetime

# Entity type descriptions from CH-Annotator convention
ENTITY_TYPE_LABELS = {
    # Appellations
    "APP.URL": "URL/Website",
    "APP.TIT": "Title (document)",
    "APP.TTL": "Title (general)",
    "APP.EXH": "Exhibition name",
    "APP.COL": "Collection name",
    # Groups
    "GRP.HER": "Heritage institution",
    "GRP.ASS": "Association/Society",
    "GRP.GOV": "Government body",
    "GRP.COR": "Corporation",
    "GRP.EDU": "Educational institution",
    # Agents
    "AGT.PER": "Person",
    # Places
    "TOP.SET": "Settlement/City",
    "TOP.ADR": "Address",
    "TOP.REG": "Region/Province",
    "TOP.CTY": "Country",
    "TOP.BLD": "Building",
    # Temporal
    "TMP.DAB": "Date (absolute)",
    "TMP.OPH": "Opening hours",
    "TMP.DRL": "Date range/Duration",
    # Works
    "WRK.WEB": "Website/Web resource",
    "WRK.COL": "Collection (as work)",
    "WRK.TXT": "Text/Document",
    # Things
    "THG.LNG": "Language",
    "THG.CON": "Contact info",
    "THG.ART": "Artifact/Object",
    "THG.EVT": "Event",
    "THG.PHO": "Photograph",
    # Quantities
    "QTY.CNT": "Count/Number",
    "QTY.MSR": "Measurement",
    "QTY.CUR": "Currency/Price",
}

# Compiled once at import time: these patterns run on every scanned file,
# so recompiling them per call inside the scan loop would be wasted work.
_CLAIMS_SECTION_RE = re.compile(
    r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)', re.DOTALL)
_ENTITY_RE = re.compile(r'- entity: (.+?)\n\s+entity_type: (\S+)')
_CONFIDENCE_RE = re.compile(r'final_confidence: ([\d.]+)')


def extract_entities_fast(filepath):
    """Extract entity info using regex (much faster than YAML parsing).

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        Tuple ``(avg_confidence, entities)``.  ``avg_confidence`` is the
        mean of the file's ``final_confidence`` values, or ``None`` when
        the file is unreadable, has no claims section, or carries no
        confidence scores (BUGFIX: previously returned 0 in the last
        case, indistinguishable from a genuine 0.0 average).
        ``entities`` is a list of ``{'name': ..., 'type': ...}`` dicts.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best-effort scan: an unreadable or mis-encoded file simply
        # contributes nothing (narrowed from a bare Exception catch).
        return None, []

    # Cheap substring pre-check before running the multiline regex.
    if 'validated_entity_claims:' not in content:
        return None, []

    claims_match = _CLAIMS_SECTION_RE.search(content)
    if not claims_match:
        return None, []
    claims_section = claims_match.group(1)

    entities = [
        {'name': name.strip(), 'type': etype.strip()}
        for name, etype in _ENTITY_RE.findall(claims_section)
    ]

    confidences = [float(c) for c in _CONFIDENCE_RE.findall(claims_section)]
    avg_conf = sum(confidences) / len(confidences) if confidences else None
    return avg_conf, entities


def main():
    """Scan custodian files, aggregate entity statistics, write the report."""
    print("Scanning custodian files...")
    files = glob.glob('data/custodian/NL-*.yaml')

    type_counts = Counter()           # entity_type -> occurrence count
    entity_names = defaultdict(list)  # entity_name -> list of (file, type)
    files_with_claims = 0
    total_entities = 0
    confidence_sum = 0
    confidence_count = 0

    for i, filepath in enumerate(files):
        if i % 200 == 0:
            print(f" Processing {i}/{len(files)}...")
        avg_conf, entities = extract_entities_fast(filepath)
        if entities:
            files_with_claims += 1
            total_entities += len(entities)
            # BUGFIX: explicit None check — the old truthiness test
            # (`if avg_conf:`) silently dropped genuine 0.0 averages.
            if avg_conf is not None:
                confidence_sum += avg_conf
                confidence_count += 1
            basename = os.path.basename(filepath)
            for ent in entities:
                type_counts[ent['type']] += 1
                entity_names[ent['name']].append((basename, ent['type']))

    # Find duplicates (entities appearing in multiple files)
    duplicates = {name: locs for name, locs in entity_names.items() if len(locs) > 1}

    # Generate report
    report = []
    report.append("# Entity Extraction Statistics Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"\n## Summary")
    report.append(f"\n| Metric | Value |")
    report.append(f"|--------|-------|")
    report.append(f"| Total custodian files scanned | {len(files):,} |")
    # BUGFIX: guard against an empty glob (ZeroDivisionError).
    pct_with_claims = 100 * files_with_claims / len(files) if files else 0.0
    report.append(f"| Files with validated_entity_claims | {files_with_claims:,} ({pct_with_claims:.1f}%) |")
    report.append(f"| Total entities extracted | {total_entities:,} |")
    report.append(f"| Unique entity names | {len(entity_names):,} |")
    report.append(f"| Entities appearing in multiple files | {len(duplicates):,} |")
    if confidence_count:
        report.append(f"| Average confidence score | {confidence_sum/confidence_count:.3f} |")
    report.append(f"| Unique entity types | {len(type_counts)} |")

    report.append(f"\n## Entity Types Breakdown")
    report.append(f"\n| Type | Label | Count | % |")
    report.append(f"|------|-------|-------|---|")
    for etype, count in type_counts.most_common():
        label = ENTITY_TYPE_LABELS.get(etype, etype)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{etype}` | {label} | {count:,} | {pct:.1f}% |")

    report.append(f"\n## Entity Type Categories")
    # Group counts by the type's category prefix (text before the dot).
    prefix_counts = Counter()
    for etype, count in type_counts.items():
        prefix = etype.split('.')[0] if '.' in etype else etype
        prefix_counts[prefix] += count
    prefix_labels = {
        'APP': 'Appellations (names, titles)',
        'GRP': 'Groups (organizations)',
        'AGT': 'Agents (people)',
        'TOP': 'Toponyms (places)',
        'TMP': 'Temporal (dates, times)',
        'WRK': 'Works (documents, collections)',
        'THG': 'Things (objects, concepts)',
        'QTY': 'Quantities (numbers, measurements)',
    }
    report.append(f"\n| Category | Label | Count | % |")
    report.append(f"|----------|-------|-------|---|")
    for prefix, count in prefix_counts.most_common():
        label = prefix_labels.get(prefix, prefix)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{prefix}` | {label} | {count:,} | {pct:.1f}% |")

    report.append(f"\n## Top 50 Most Common Entities")
    report.append(f"\nEntities appearing across multiple custodian files:")
    report.append(f"\n| Entity | Type | Occurrences |")
    report.append(f"|--------|------|-------------|")
    sorted_by_freq = sorted(entity_names.items(), key=lambda x: len(x[1]), reverse=True)[:50]
    for name, locs in sorted_by_freq:
        if len(locs) >= 2:
            types = {t for _, t in locs}
            type_str = ', '.join(sorted(types))
            report.append(f"| {name[:60]} | `{type_str}` | {len(locs)} |")

    report.append(f"\n## Duplicate Entity Analysis")
    report.append(f"\nEntities that appear in 5+ different custodian files (candidates for entity linking):")
    report.append(f"\n| Entity | Files | Primary Type |")
    report.append(f"|--------|-------|--------------|")
    high_freq_dups = [(name, locs) for name, locs in duplicates.items() if len(locs) >= 5]
    high_freq_dups.sort(key=lambda x: len(x[1]), reverse=True)
    for name, locs in high_freq_dups[:100]:
        type_counter = Counter(t for _, t in locs)
        primary_type = type_counter.most_common(1)[0][0]
        report.append(f"| {name[:60]} | {len(locs)} | `{primary_type}` |")

    # Heritage institutions specifically
    report.append(f"\n## Heritage Institution Entities (GRP.HER)")
    report.append(f"\nThese are extracted mentions of heritage institutions:")
    her_entities = [(name, locs) for name, locs in entity_names.items()
                    if any(t == 'GRP.HER' for _, t in locs)]
    her_entities.sort(key=lambda x: len(x[1]), reverse=True)
    report.append(f"\n| Institution Name | Mentions | Files |")
    report.append(f"|------------------|----------|-------|")
    for name, locs in her_entities[:50]:
        her_locs = [(f, t) for f, t in locs if t == 'GRP.HER']
        # sorted() keeps report output deterministic across runs
        # (a bare set join varied with hash ordering).
        files_list = ', '.join(sorted(set(f.replace('.yaml', '') for f, _ in her_locs[:3])))
        if len(her_locs) > 3:
            files_list += f" (+{len(her_locs)-3} more)"
        report.append(f"| {name[:50]} | {len(her_locs)} | {files_list[:60]} |")

    # Write report
    report_path = 'reports/entity_extraction_stats.md'
    os.makedirs('reports', exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")
    print(f"\nQuick Summary:")
    print(f" - {files_with_claims:,} files with validated entities")
    print(f" - {total_entities:,} total entities")
    print(f" - {len(entity_names):,} unique entity names")
    print(f" - {len(duplicates):,} entities appear in multiple files")
    print(f" - {len(high_freq_dups)} entities appear in 5+ files (linking candidates)")


if __name__ == '__main__':
    main()