# glam/scripts/generate_entity_stats.py
# Last modified: 2025-12-14 17:09:55 +01:00 (230 lines, 8.8 KiB, Python)
#!/usr/bin/env python3
"""Generate comprehensive entity extraction statistics report."""
import os
import re
import glob
from collections import Counter, defaultdict
from datetime import datetime
# Entity type descriptions from CH-Annotator convention
# Entity type descriptions from CH-Annotator convention.
# Maps a "CATEGORY.SUBTYPE" code (e.g. "GRP.HER") to the human-readable
# label used in the report tables; unknown codes fall back to the raw code.
ENTITY_TYPE_LABELS = {
    # Appellations
    "APP.URL": "URL/Website",
    "APP.TIT": "Title (document)",
    "APP.TTL": "Title (general)",
    "APP.EXH": "Exhibition name",
    "APP.COL": "Collection name",
    # Groups
    "GRP.HER": "Heritage institution",
    "GRP.ASS": "Association/Society",
    "GRP.GOV": "Government body",
    "GRP.COR": "Corporation",
    "GRP.EDU": "Educational institution",
    # Agents
    "AGT.PER": "Person",
    # Places
    "TOP.SET": "Settlement/City",
    "TOP.ADR": "Address",
    "TOP.REG": "Region/Province",
    "TOP.CTY": "Country",
    "TOP.BLD": "Building",
    # Temporal
    "TMP.DAB": "Date (absolute)",
    "TMP.OPH": "Opening hours",
    "TMP.DRL": "Date range/Duration",
    # Works
    "WRK.WEB": "Website/Web resource",
    "WRK.COL": "Collection (as work)",
    "WRK.TXT": "Text/Document",
    # Things
    "THG.LNG": "Language",
    "THG.CON": "Contact info",
    "THG.ART": "Artifact/Object",
    "THG.EVT": "Event",
    "THG.PHO": "Photograph",
    # Quantities
    "QTY.CNT": "Count/Number",
    "QTY.MSR": "Measurement",
    "QTY.CUR": "Currency/Price",
}
def extract_entities_fast(filepath):
    """Extract entity info from a custodian YAML file using regex.

    Much faster than full YAML parsing for this bulk-scan use case; the
    patterns rely on the layout emitted by the annotation pipeline.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        Tuple of (avg_conf, entities):
        - avg_conf: mean of the section's ``final_confidence`` scores,
          0 when the section has no scores, or None when the file has no
          ``validated_entity_claims`` section or could not be read.
        - entities: list of {'name': ..., 'type': ...} dicts (may be empty).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable or undecodable file: report "no claims" instead of
        # aborting the whole scan.  (Was a blanket `except Exception`,
        # which also hid genuine coding errors in the parsing below.)
        return None, []
    # Cheap substring test before running the heavier regexes.
    if 'validated_entity_claims:' not in content:
        return None, []
    # Grab everything after 'claims:' up to the next top-level key (or EOF).
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return None, []
    claims_section = claims_match.group(1)
    # Each claim looks like "- entity: NAME\n  entity_type: TYPE".
    entities = [
        {'name': name.strip(), 'type': etype.strip()}
        for name, etype in re.findall(
            r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)
    ]
    # Average the final_confidence scores found in the claims section.
    confidences = [float(c) for c in
                   re.findall(r'final_confidence: ([\d.]+)', claims_section)]
    avg_conf = sum(confidences) / len(confidences) if confidences else 0
    return avg_conf, entities
def main():
    """Scan custodian YAML files and write an entity-statistics report.

    Reads every ``data/custodian/NL-*.yaml`` file, aggregates entity counts
    by type and by category prefix, detects entities shared across files,
    and writes a markdown report to ``reports/entity_extraction_stats.md``.
    """
    print("Scanning custodian files...")
    files = glob.glob('data/custodian/NL-*.yaml')
    if not files:
        # Guard: the summary percentages below divide by len(files);
        # without this an empty match raised ZeroDivisionError.
        print("No custodian files found matching data/custodian/NL-*.yaml")
        return

    type_counts = Counter()            # entity_type -> occurrence count
    entity_names = defaultdict(list)   # entity_name -> list of (file, type)
    files_with_claims = 0
    total_entities = 0
    confidence_sum = 0
    confidence_count = 0

    for i, filepath in enumerate(files):
        if i % 200 == 0:
            # Progress indicator for long scans.
            print(f" Processing {i}/{len(files)}...")
        avg_conf, entities = extract_entities_fast(filepath)
        if not entities:
            continue
        files_with_claims += 1
        total_entities += len(entities)
        # A falsy avg_conf (None, or 0 when the file had claims but no
        # confidence scores) is deliberately excluded from the average.
        if avg_conf:
            confidence_sum += avg_conf
            confidence_count += 1
        basename = os.path.basename(filepath)
        for ent in entities:
            type_counts[ent['type']] += 1
            entity_names[ent['name']].append((basename, ent['type']))

    # Entities appearing in more than one file (entity-linking candidates).
    duplicates = {name: locs for name, locs in entity_names.items() if len(locs) > 1}

    # --- Build the markdown report ------------------------------------
    report = []
    report.append("# Entity Extraction Statistics Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\n## Summary")
    report.append("\n| Metric | Value |")
    report.append("|--------|-------|")
    report.append(f"| Total custodian files scanned | {len(files):,} |")
    report.append(f"| Files with validated_entity_claims | {files_with_claims:,} ({100*files_with_claims/len(files):.1f}%) |")
    report.append(f"| Total entities extracted | {total_entities:,} |")
    report.append(f"| Unique entity names | {len(entity_names):,} |")
    report.append(f"| Entities appearing in multiple files | {len(duplicates):,} |")
    if confidence_count:
        report.append(f"| Average confidence score | {confidence_sum/confidence_count:.3f} |")
    report.append(f"| Unique entity types | {len(type_counts)} |")

    report.append("\n## Entity Types Breakdown")
    report.append("\n| Type | Label | Count | % |")
    report.append("|------|-------|-------|---|")
    for etype, count in type_counts.most_common():
        label = ENTITY_TYPE_LABELS.get(etype, etype)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{etype}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Entity Type Categories")
    # Aggregate by the category prefix (the part before the first '.').
    prefix_counts = Counter()
    for etype, count in type_counts.items():
        prefix = etype.split('.')[0] if '.' in etype else etype
        prefix_counts[prefix] += count
    prefix_labels = {
        'APP': 'Appellations (names, titles)',
        'GRP': 'Groups (organizations)',
        'AGT': 'Agents (people)',
        'TOP': 'Toponyms (places)',
        'TMP': 'Temporal (dates, times)',
        'WRK': 'Works (documents, collections)',
        'THG': 'Things (objects, concepts)',
        'QTY': 'Quantities (numbers, measurements)',
    }
    report.append("\n| Category | Label | Count | % |")
    report.append("|----------|-------|-------|---|")
    for prefix, count in prefix_counts.most_common():
        label = prefix_labels.get(prefix, prefix)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{prefix}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Top 50 Most Common Entities")
    report.append("\nEntities appearing across multiple custodian files:")
    report.append("\n| Entity | Type | Occurrences |")
    report.append("|--------|------|-------------|")
    sorted_by_freq = sorted(entity_names.items(), key=lambda x: len(x[1]), reverse=True)[:50]
    for name, locs in sorted_by_freq:
        if len(locs) >= 2:
            type_str = ', '.join(sorted(set(t for _, t in locs)))
            report.append(f"| {name[:60]} | `{type_str}` | {len(locs)} |")

    report.append("\n## Duplicate Entity Analysis")
    report.append("\nEntities that appear in 5+ different custodian files (candidates for entity linking):")
    report.append("\n| Entity | Files | Primary Type |")
    report.append("|--------|-------|--------------|")
    high_freq_dups = [(name, locs) for name, locs in duplicates.items() if len(locs) >= 5]
    high_freq_dups.sort(key=lambda x: len(x[1]), reverse=True)
    for name, locs in high_freq_dups[:100]:
        # The most frequently assigned type is reported as "primary".
        primary_type = Counter(t for _, t in locs).most_common(1)[0][0]
        report.append(f"| {name[:60]} | {len(locs)} | `{primary_type}` |")

    # Heritage institutions specifically.
    report.append("\n## Heritage Institution Entities (GRP.HER)")
    report.append("\nThese are extracted mentions of heritage institutions:")
    her_entities = [(name, locs) for name, locs in entity_names.items()
                    if any(t == 'GRP.HER' for _, t in locs)]
    her_entities.sort(key=lambda x: len(x[1]), reverse=True)
    report.append("\n| Institution Name | Mentions | Files |")
    report.append("|------------------|----------|-------|")
    for name, locs in her_entities[:50]:
        her_locs = [(f, t) for f, t in locs if t == 'GRP.HER']
        # Show up to three source files, then a "+N more" suffix.
        files_list = ', '.join(set(f.replace('.yaml', '') for f, _ in her_locs[:3]))
        if len(her_locs) > 3:
            files_list += f" (+{len(her_locs)-3} more)"
        report.append(f"| {name[:50]} | {len(her_locs)} | {files_list[:60]} |")

    # --- Write the report and print a console summary -----------------
    report_path = 'reports/entity_extraction_stats.md'
    os.makedirs('reports', exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")
    print("\nQuick Summary:")
    print(f" - {files_with_claims:,} files with validated entities")
    print(f" - {total_entities:,} total entities")
    print(f" - {len(entity_names):,} unique entity names")
    print(f" - {len(duplicates):,} entities appear in multiple files")
    print(f" - {len(high_freq_dups)} entities appear in 5+ files (linking candidates)")
# Allow importing this module (e.g. for testing) without running the scan.
if __name__ == '__main__':
    main()