glam/scripts/analyze_entity_duplicates.py
2025-12-14 17:09:55 +01:00

260 lines
10 KiB
Python

#!/usr/bin/env python3
"""Analyze duplicate entities across custodian files for deduplication."""
import os
import re
import glob
from collections import Counter, defaultdict
from datetime import datetime
import json
def extract_entities_fast(filepath):
    """Extract entity info using regex.

    Parses a custodian YAML file without a YAML library: locates the
    ``validated_entity_claims:`` section and pulls ``entity`` /
    ``entity_type`` pairs out of its ``claims:`` list.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        A list of ``{'name': str, 'type': str}`` dicts; empty when the
        file has no claims section or cannot be read/decoded.
    """
    entities = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    # Narrow handling: only the failures we expect from reading a text file.
    # A bare `except:` here previously hid real bugs (and even Ctrl-C).
    except (OSError, UnicodeDecodeError):
        return []
    if 'validated_entity_claims:' not in content:
        return []
    # Capture everything between `claims:` and the next top-level key
    # (a newline followed by a column-0 lowercase identifier) or EOF.
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return []
    claims_section = claims_match.group(1)
    # Each claim looks like: `- entity: <name>` followed by an indented
    # `entity_type: <code>` line.
    entity_blocks = re.findall(
        r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)
    for entity_name, entity_type in entity_blocks:
        entities.append({
            'name': entity_name.strip(),
            'type': entity_type.strip()
        })
    return entities
def normalize_entity(name):
    """Normalize an entity name for case/whitespace-insensitive comparison.

    Lowercases, trims surrounding whitespace and quote characters, and
    collapses internal runs of whitespace to single spaces.
    """
    cleaned = name.lower().strip().strip("'\"")
    return " ".join(cleaned.split())
def main():
    """Analyze entity duplicates across custodian files and emit reports.

    Scans ``data/custodian/NL-*.yaml``, aggregates entity occurrences by
    normalized name, flags quality issues (language codes, generic
    navigation labels, numeric-only names, very short names, type
    mismatches), and writes:

      - ``reports/entity_duplicate_analysis.md`` — human-readable report
      - ``reports/entity_linking_candidates.json`` — top candidates for
        Wikidata/VIAF linking, for automated processing

    Fixes over the previous version: the ``reports/`` directory is created
    if missing, and percentage math is guarded against a zero entity count
    (no input files) instead of raising ``ZeroDivisionError``.
    """
    print("Analyzing entity duplicates...")
    files = glob.glob('data/custodian/NL-*.yaml')
    # Track entities: normalized_name -> [{'original', 'type', 'file'}, ...]
    entity_occurrences = defaultdict(list)
    for i, filepath in enumerate(files):
        if i % 200 == 0:
            print(f" Processing {i}/{len(files)}...")
        basename = os.path.basename(filepath)
        entities = extract_entities_fast(filepath)
        for ent in entities:
            norm_name = normalize_entity(ent['name'])
            entity_occurrences[norm_name].append({
                'original': ent['name'],
                'type': ent['type'],
                'file': basename
            })
    print(f"\nTotal unique normalized entities: {len(entity_occurrences):,}")

    # Identify quality issues. Each bucket holds (original_name, count)
    # tuples except type_mismatches, which holds (name, types, count).
    issues = {
        'language_codes': [],     # nl-NL, nl_NL, etc.
        'generic_labels': [],     # Home, Menu, etc.
        'numeric_only': [],       # '2025', '1200', etc.
        'single_char': [],        # One- or two-character names
        'type_mismatches': [],    # Same entity with different types
        'variant_spellings': [],  # Same entity with slight variations
    }
    # Language code patterns (HTML `lang` attribute values).
    lang_patterns = [
        r'^[a-z]{2}[-_][A-Z]{2}$',  # nl-NL, en-US
        r'^[a-z]{2}$',              # nl, en
        r'^[a-z]{2}_[a-z]{2}$',     # nl_nl
    ]
    # Generic navigation/UI labels (not real heritage entities).
    generic_labels = {'home', 'menu', 'contact', 'over', 'about', 'search', 'zoeken',
                      'nieuws', 'news', 'agenda', 'events', 'login', 'logout',
                      'inloggen', 'uitloggen', 'cookie', 'cookies', 'privacy',
                      'collectie', 'collection', 'archief', 'archive'}

    for norm_name, occurrences in entity_occurrences.items():
        orig = occurrences[0]['original']
        # Check for language codes
        for pat in lang_patterns:
            if re.match(pat, orig, re.IGNORECASE):
                issues['language_codes'].append((orig, len(occurrences)))
                break
        # Check for generic labels
        if norm_name in generic_labels:
            issues['generic_labels'].append((orig, len(occurrences)))
        # Check for numeric-only (optionally wrapped in single quotes)
        if re.match(r"^'?\d+'?$", orig):
            issues['numeric_only'].append((orig, len(occurrences)))
        # Check for single character (or two-char) names
        if len(norm_name) <= 2:
            issues['single_char'].append((orig, len(occurrences)))
        # Check for type mismatches: same normalized name, multiple types
        types = set(o['type'] for o in occurrences)
        if len(types) > 1:
            issues['type_mismatches'].append((orig, types, len(occurrences)))

    # Generate report
    report = []
    report.append("# Entity Duplicate Analysis Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"\n## Data Quality Issues Summary")
    report.append(f"\n| Issue Type | Count | Total Occurrences |")
    report.append(f"|------------|-------|-------------------|")
    total_issues = 0
    total_occurrences = 0
    for issue_type, items in issues.items():
        count = len(items)
        # type_mismatches tuples are (name, types, count); others are (name, count)
        if issue_type == 'type_mismatches':
            occ = sum(c for _, _, c in items)
        else:
            occ = sum(c for _, c in items)
        total_issues += count
        total_occurrences += occ
        report.append(f"| {issue_type.replace('_', ' ').title()} | {count} | {occ} |")
    report.append(f"| **TOTAL** | **{total_issues}** | **{total_occurrences}** |")

    report.append(f"\n## Language Code Entities (Should Be Filtered)")
    report.append(f"\nThese are HTML `lang` attribute values, not real entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['language_codes'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Generic Navigation Labels (Low Value)")
    report.append(f"\nThese are website navigation labels, not heritage entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['generic_labels'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Numeric-Only Entities (Often Dimensions)")
    report.append(f"\nThese are often image dimensions or years without context:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['numeric_only'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Review context |")

    report.append(f"\n## Type Mismatches (Need Resolution)")
    report.append(f"\nSame entity classified with different types:")
    report.append(f"\n| Entity | Types | Occurrences | Action |")
    report.append(f"|--------|-------|-------------|--------|")
    for entity, types, count in sorted(issues['type_mismatches'], key=lambda x: -x[2])[:50]:
        types_str = ', '.join(sorted(types))
        report.append(f"| {entity[:40]} | `{types_str}` | {count} | Resolve type |")

    # Calculate cleanup impact
    report.append(f"\n## Cleanup Impact Analysis")
    cleanup_candidates = set()
    for entity, _ in issues['language_codes']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['generic_labels']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['numeric_only']:
        cleanup_candidates.add(normalize_entity(entity))
    cleanup_occurrences = sum(
        len(entity_occurrences[norm])
        for norm in cleanup_candidates
        if norm in entity_occurrences
    )
    total_entities = sum(len(v) for v in entity_occurrences.values())
    # Guard against empty input: no files means total_entities == 0 and a
    # naive percentage would raise ZeroDivisionError.
    cleanup_pct = (100 * cleanup_occurrences / total_entities) if total_entities else 0.0
    report.append(f"\n| Metric | Value |")
    report.append(f"|--------|-------|")
    report.append(f"| Total entity occurrences | {total_entities:,} |")
    report.append(f"| Candidate cleanup occurrences | {cleanup_occurrences:,} |")
    report.append(f"| Cleanup percentage | {cleanup_pct:.1f}% |")
    report.append(f"| Remaining after cleanup | {total_entities - cleanup_occurrences:,} |")

    # High-value entities for linking
    report.append(f"\n## High-Value Entities for Linking")
    report.append(f"\nEntities that appear frequently and are good candidates for Wikidata/VIAF linking:")
    # Filter to heritage-relevant types
    heritage_types = {'GRP.HER', 'GRP.ASS', 'GRP.GOV', 'GRP.COR', 'GRP.EDU',
                      'AGT.PER', 'TOP.SET', 'TOP.REG', 'TOP.CTY', 'TOP.BLD'}
    linking_candidates = []
    for norm_name, occurrences in entity_occurrences.items():
        # Skip cleanup candidates
        if norm_name in cleanup_candidates:
            continue
        # Skip if too short
        if len(norm_name) < 3:
            continue
        # Check if has heritage-relevant types
        types = set(o['type'] for o in occurrences)
        if types & heritage_types:
            # Resolve type conflicts by majority vote across occurrences.
            primary_type = Counter(o['type'] for o in occurrences).most_common(1)[0][0]
            linking_candidates.append({
                'name': occurrences[0]['original'],
                'norm': norm_name,
                'type': primary_type,
                'count': len(occurrences)
            })
    linking_candidates.sort(key=lambda x: -x['count'])
    report.append(f"\n| Entity | Type | Occurrences | Wikidata? |")
    report.append(f"|--------|------|-------------|-----------|")
    for cand in linking_candidates[:75]:
        report.append(f"| {cand['name'][:40]} | `{cand['type']}` | {cand['count']} | 🔍 Search |")

    # Write report — ensure the output directory exists first (fresh
    # checkouts previously crashed with FileNotFoundError here).
    os.makedirs('reports', exist_ok=True)
    report_path = 'reports/entity_duplicate_analysis.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))
    print(f"\nReport written to: {report_path}")

    # Also save linking candidates as JSON for automated processing
    json_path = 'reports/entity_linking_candidates.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(linking_candidates[:200], f, indent=2, ensure_ascii=False)
    print(f"Linking candidates JSON: {json_path}")

    print(f"\nSummary:")
    print(f" - {len(issues['language_codes'])} language code entities to filter")
    print(f" - {len(issues['generic_labels'])} generic labels to filter")
    print(f" - {len(issues['numeric_only'])} numeric-only entities to review")
    print(f" - {len(issues['type_mismatches'])} type mismatches to resolve")
    print(f" - {len(linking_candidates)} high-value linking candidates identified")
    print(f" - Potential cleanup: {cleanup_occurrences:,} occurrences ({cleanup_pct:.1f}%)")
# Script entry point: run the full duplicate analysis when executed directly.
if __name__ == '__main__':
    main()