#!/usr/bin/env python3
|
|
"""Generate comprehensive entity extraction statistics report."""
|
|
|
|
import os
|
|
import re
|
|
import glob
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
|
|
# Entity type descriptions from CH-Annotator convention.
# Keys follow a "<CATEGORY>.<SUBTYPE>" scheme (e.g. "GRP.HER"); values are the
# human-readable labels used in the generated Markdown report. Types not
# listed here fall back to the raw code (see ENTITY_TYPE_LABELS.get usage).
ENTITY_TYPE_LABELS = {
    # Appellations
    "APP.URL": "URL/Website",
    "APP.TIT": "Title (document)",
    "APP.TTL": "Title (general)",
    "APP.EXH": "Exhibition name",
    "APP.COL": "Collection name",
    # Groups
    "GRP.HER": "Heritage institution",
    "GRP.ASS": "Association/Society",
    "GRP.GOV": "Government body",
    "GRP.COR": "Corporation",
    "GRP.EDU": "Educational institution",
    # Agents
    "AGT.PER": "Person",
    # Places
    "TOP.SET": "Settlement/City",
    "TOP.ADR": "Address",
    "TOP.REG": "Region/Province",
    "TOP.CTY": "Country",
    "TOP.BLD": "Building",
    # Temporal
    "TMP.DAB": "Date (absolute)",
    "TMP.OPH": "Opening hours",
    "TMP.DRL": "Date range/Duration",
    # Works
    "WRK.WEB": "Website/Web resource",
    "WRK.COL": "Collection (as work)",
    "WRK.TXT": "Text/Document",
    # Things
    "THG.LNG": "Language",
    "THG.CON": "Contact info",
    "THG.ART": "Artifact/Object",
    "THG.EVT": "Event",
    "THG.PHO": "Photograph",
    # Quantities
    "QTY.CNT": "Count/Number",
    "QTY.MSR": "Measurement",
    "QTY.CUR": "Currency/Price",
}
|
|
|
|
def extract_entities_fast(filepath):
    """Extract entity names/types and mean confidence from one custodian file.

    Scans the raw text with regexes instead of a full YAML parse for speed.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        Tuple ``(avg_conf, entities)`` where ``entities`` is a list of
        ``{'name': ..., 'type': ...}`` dicts and ``avg_conf`` is the mean of
        the claims' ``final_confidence`` scores (0 when no scores are present,
        ``None`` when the file is unreadable or has no
        ``validated_entity_claims`` section).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable or undecodable file: treat the same as "no claims".
        # (Previously a bare `except Exception` here also hid programming
        # errors in the parsing code below; the try now covers I/O only.)
        return None, []

    # Cheap substring pre-check before running the heavier regexes.
    if 'validated_entity_claims:' not in content:
        return None, []

    # Capture everything after "claims:" up to the next top-level key or EOF.
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return None, []

    claims_section = claims_match.group(1)

    # Each claim looks like "- entity: <name>" followed by an indented
    # "entity_type: <TYPE>" line.
    entity_blocks = re.findall(
        r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)
    entities = [
        {'name': entity_name.strip(), 'type': entity_type.strip()}
        for entity_name, entity_type in entity_blocks
    ]

    # Average the per-claim confidence scores, if any were recorded.
    confidences = [
        float(c)
        for c in re.findall(r'final_confidence: ([\d.]+)', claims_section)
    ]
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    return avg_conf, entities
|
|
|
|
def main():
    """Scan custodian YAML files and write a Markdown statistics report.

    Reads ``data/custodian/NL-*.yaml`` relative to the current working
    directory and writes ``reports/entity_extraction_stats.md`` (creating
    the ``reports`` directory if needed). Also prints a short summary.
    """
    print("Scanning custodian files...")

    files = glob.glob('data/custodian/NL-*.yaml')

    # Aggregation state for the whole corpus.
    type_counts = Counter()
    entity_names = defaultdict(list)  # entity_name -> list of (file, type)
    files_with_claims = 0
    total_entities = 0
    confidence_sum = 0
    confidence_count = 0

    for i, filepath in enumerate(files):
        if i % 200 == 0:
            print(f"  Processing {i}/{len(files)}...")

        avg_conf, entities = extract_entities_fast(filepath)

        if entities:
            files_with_claims += 1
            total_entities += len(entities)
            # NOTE: avg_conf == 0 (claims without scores) is skipped here,
            # so the corpus average covers only files with real scores.
            if avg_conf:
                confidence_sum += avg_conf
                confidence_count += 1

            basename = os.path.basename(filepath)
            for ent in entities:
                type_counts[ent['type']] += 1
                entity_names[ent['name']].append((basename, ent['type']))

    # Entities appearing in more than one file.
    duplicates = {name: locs for name, locs in entity_names.items() if len(locs) > 1}

    # --- Build the Markdown report -------------------------------------
    report = []
    report.append("# Entity Extraction Statistics Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\n## Summary")
    report.append("\n| Metric | Value |")
    report.append("|--------|-------|")
    report.append(f"| Total custodian files scanned | {len(files):,} |")
    # Guard against an empty glob result (previously a ZeroDivisionError).
    claims_pct = 100 * files_with_claims / len(files) if files else 0
    report.append(f"| Files with validated_entity_claims | {files_with_claims:,} ({claims_pct:.1f}%) |")
    report.append(f"| Total entities extracted | {total_entities:,} |")
    report.append(f"| Unique entity names | {len(entity_names):,} |")
    report.append(f"| Entities appearing in multiple files | {len(duplicates):,} |")
    if confidence_count:
        report.append(f"| Average confidence score | {confidence_sum/confidence_count:.3f} |")
    report.append(f"| Unique entity types | {len(type_counts)} |")

    report.append("\n## Entity Types Breakdown")
    report.append("\n| Type | Label | Count | % |")
    report.append("|------|-------|-------|---|")
    for etype, count in type_counts.most_common():
        label = ENTITY_TYPE_LABELS.get(etype, etype)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{etype}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Entity Type Categories")

    # Aggregate counts by the type-code prefix (the category).
    prefix_counts = Counter()
    for etype, count in type_counts.items():
        prefix = etype.split('.')[0] if '.' in etype else etype
        prefix_counts[prefix] += count

    prefix_labels = {
        'APP': 'Appellations (names, titles)',
        'GRP': 'Groups (organizations)',
        'AGT': 'Agents (people)',
        'TOP': 'Toponyms (places)',
        'TMP': 'Temporal (dates, times)',
        'WRK': 'Works (documents, collections)',
        'THG': 'Things (objects, concepts)',
        'QTY': 'Quantities (numbers, measurements)',
    }

    report.append("\n| Category | Label | Count | % |")
    report.append("|----------|-------|-------|---|")
    for prefix, count in prefix_counts.most_common():
        label = prefix_labels.get(prefix, prefix)
        pct = 100 * count / total_entities if total_entities else 0
        report.append(f"| `{prefix}` | {label} | {count:,} | {pct:.1f}% |")

    report.append("\n## Top 50 Most Common Entities")
    report.append("\nEntities appearing across multiple custodian files:")
    report.append("\n| Entity | Type | Occurrences |")
    report.append("|--------|------|-------------|")

    sorted_by_freq = sorted(entity_names.items(), key=lambda x: len(x[1]), reverse=True)[:50]
    for name, locs in sorted_by_freq:
        if len(locs) >= 2:
            types = set(t for _, t in locs)
            type_str = ', '.join(sorted(types))
            report.append(f"| {name[:60]} | `{type_str}` | {len(locs)} |")

    report.append("\n## Duplicate Entity Analysis")
    report.append("\nEntities that appear in 5+ different custodian files (candidates for entity linking):")
    report.append("\n| Entity | Files | Primary Type |")
    report.append("|--------|-------|--------------|")

    high_freq_dups = [(name, locs) for name, locs in duplicates.items() if len(locs) >= 5]
    high_freq_dups.sort(key=lambda x: len(x[1]), reverse=True)

    for name, locs in high_freq_dups[:100]:
        # Most frequent type among this entity's mentions.
        type_counter = Counter(t for _, t in locs)
        primary_type = type_counter.most_common(1)[0][0]
        report.append(f"| {name[:60]} | {len(locs)} | `{primary_type}` |")

    # Heritage institutions specifically
    report.append("\n## Heritage Institution Entities (GRP.HER)")
    report.append("\nThese are extracted mentions of heritage institutions:")

    her_entities = [(name, locs) for name, locs in entity_names.items()
                    if any(t == 'GRP.HER' for _, t in locs)]
    her_entities.sort(key=lambda x: len(x[1]), reverse=True)

    report.append("\n| Institution Name | Mentions | Files |")
    report.append("|------------------|----------|-------|")
    for name, locs in her_entities[:50]:
        her_locs = [(f, t) for f, t in locs if t == 'GRP.HER']
        # Show at most three source files, then an overflow marker.
        files_list = ', '.join(set(f.replace('.yaml', '') for f, _ in her_locs[:3]))
        if len(her_locs) > 3:
            files_list += f" (+{len(her_locs)-3} more)"
        report.append(f"| {name[:50]} | {len(her_locs)} | {files_list[:60]} |")

    # --- Write report and print the console summary --------------------
    report_path = 'reports/entity_extraction_stats.md'
    os.makedirs('reports', exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")
    print("\nQuick Summary:")
    print(f"  - {files_with_claims:,} files with validated entities")
    print(f"  - {total_entities:,} total entities")
    print(f"  - {len(entity_names):,} unique entity names")
    print(f"  - {len(duplicates):,} entities appear in multiple files")
    print(f"  - {len(high_freq_dups)} entities appear in 5+ files (linking candidates)")
|
|
# Run the report generator only when executed as a script, so the module
# can be imported (e.g. for testing) without side effects.
if __name__ == '__main__':
    main()
|