260 lines
10 KiB
Python
260 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""Analyze duplicate entities across custodian files for deduplication."""
|
|
|
|
import os
|
|
import re
|
|
import glob
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
import json
|
|
|
|
def extract_entities_fast(filepath):
    """Extract entity claims from a custodian YAML file using regex.

    Deliberately avoids a full YAML parse for speed: it locates the
    ``validated_entity_claims`` block and pulls out each claim's
    ``entity`` / ``entity_type`` pair with regular expressions.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        List of dicts with ``'name'`` and ``'type'`` keys; an empty list
        when the file is unreadable, badly encoded, or contains no
        entity claims.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Only the file read can realistically fail; catch just those
        # errors (the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit) so one bad file cannot hide
        # genuine bugs while still not aborting the whole scan.
        return []

    # Cheap substring pre-check before running the heavier regexes.
    if 'validated_entity_claims:' not in content:
        return []

    # Capture everything between the claims key and the next top-level
    # key (or end of file).
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content, re.DOTALL)
    if not claims_match:
        return []

    claims_section = claims_match.group(1)
    # Each claim looks like "- entity: <name>\n  entity_type: <code>".
    entity_blocks = re.findall(
        r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)

    return [
        {'name': entity_name.strip(), 'type': entity_type.strip()}
        for entity_name, entity_type in entity_blocks
    ]
|
|
|
|
def normalize_entity(name):
    """Return a canonical form of *name* for duplicate comparison.

    Lowercases, trims surrounding whitespace and quote characters, and
    collapses internal runs of whitespace to single spaces.
    """
    cleaned = name.strip().lower().strip("'\"")
    return ' '.join(cleaned.split())
|
|
|
|
def main():
    """Analyze entity duplicates across custodian YAML files.

    Scans ``data/custodian/NL-*.yaml``, groups entity claims by
    normalized name, classifies data-quality issues (language codes,
    generic UI labels, numeric-only names, type mismatches), and
    writes two outputs:

      * ``reports/entity_duplicate_analysis.md`` -- human-readable report
      * ``reports/entity_linking_candidates.json`` -- top candidates for
        Wikidata/VIAF linking, for automated follow-up
    """
    print("Analyzing entity duplicates...")

    files = glob.glob('data/custodian/NL-*.yaml')

    # Track entities: normalized_name -> [{'original', 'type', 'file'}, ...]
    entity_occurrences = defaultdict(list)

    for i, filepath in enumerate(files):
        # Progress heartbeat for long scans.
        if i % 200 == 0:
            print(f" Processing {i}/{len(files)}...")

        basename = os.path.basename(filepath)
        entities = extract_entities_fast(filepath)

        for ent in entities:
            norm_name = normalize_entity(ent['name'])
            entity_occurrences[norm_name].append({
                'original': ent['name'],
                'type': ent['type'],
                'file': basename
            })

    print(f"\nTotal unique normalized entities: {len(entity_occurrences):,}")

    # Identify quality issues
    issues = {
        'language_codes': [],  # nl-NL, nl_NL, etc.
        'generic_labels': [],  # Home, Menu, etc.
        'numeric_only': [],  # '2025', '1200', etc.
        'single_char': [],  # Single characters
        'type_mismatches': [],  # Same entity with different types
        'variant_spellings': [],  # Same entity with slight variations (not populated yet)
    }

    # Language code patterns -- these come from HTML `lang` attributes,
    # not from real heritage entities.
    lang_patterns = [
        r'^[a-z]{2}[-_][A-Z]{2}$',  # nl-NL, en-US
        r'^[a-z]{2}$',  # nl, en
        r'^[a-z]{2}_[a-z]{2}$',  # nl_nl
    ]

    # Generic navigation/UI labels (Dutch and English).
    generic_labels = {'home', 'menu', 'contact', 'over', 'about', 'search', 'zoeken',
                      'nieuws', 'news', 'agenda', 'events', 'login', 'logout',
                      'inloggen', 'uitloggen', 'cookie', 'cookies', 'privacy',
                      'collectie', 'collection', 'archief', 'archive'}

    for norm_name, occurrences in entity_occurrences.items():
        # Use the first-seen original spelling as the representative.
        orig = occurrences[0]['original']

        # Check for language codes
        for pat in lang_patterns:
            if re.match(pat, orig, re.IGNORECASE):
                issues['language_codes'].append((orig, len(occurrences)))
                break

        # Check for generic labels
        if norm_name in generic_labels:
            issues['generic_labels'].append((orig, len(occurrences)))

        # Check for numeric-only (years, image dimensions, ...)
        if re.match(r"^'?\d+'?$", orig):
            issues['numeric_only'].append((orig, len(occurrences)))

        # Check for single/double-character names
        if len(norm_name) <= 2:
            issues['single_char'].append((orig, len(occurrences)))

        # Check for type mismatches (same name classified with several types)
        types = set(o['type'] for o in occurrences)
        if len(types) > 1:
            issues['type_mismatches'].append((orig, types, len(occurrences)))

    # Generate report
    report = []
    report.append("# Entity Duplicate Analysis Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    report.append(f"\n## Data Quality Issues Summary")
    report.append(f"\n| Issue Type | Count | Total Occurrences |")
    report.append(f"|------------|-------|-------------------|")

    total_issues = 0
    total_occurrences = 0

    for issue_type, items in issues.items():
        count = len(items)
        # type_mismatches rows carry (name, types, count); the rest (name, count).
        if issue_type == 'type_mismatches':
            occ = sum(c for _, _, c in items)
        else:
            occ = sum(c for _, c in items)
        total_issues += count
        total_occurrences += occ
        report.append(f"| {issue_type.replace('_', ' ').title()} | {count} | {occ} |")

    report.append(f"| **TOTAL** | **{total_issues}** | **{total_occurrences}** |")

    report.append(f"\n## Language Code Entities (Should Be Filtered)")
    report.append(f"\nThese are HTML `lang` attribute values, not real entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['language_codes'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Generic Navigation Labels (Low Value)")
    report.append(f"\nThese are website navigation labels, not heritage entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['generic_labels'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Numeric-Only Entities (Often Dimensions)")
    report.append(f"\nThese are often image dimensions or years without context:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['numeric_only'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Review context |")

    report.append(f"\n## Type Mismatches (Need Resolution)")
    report.append(f"\nSame entity classified with different types:")
    report.append(f"\n| Entity | Types | Occurrences | Action |")
    report.append(f"|--------|-------|-------------|--------|")
    for entity, types, count in sorted(issues['type_mismatches'], key=lambda x: -x[2])[:50]:
        types_str = ', '.join(sorted(types))
        report.append(f"| {entity[:40]} | `{types_str}` | {count} | Resolve type |")

    # Calculate cleanup impact
    report.append(f"\n## Cleanup Impact Analysis")

    cleanup_candidates = set()
    for entity, _ in issues['language_codes']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['generic_labels']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['numeric_only']:
        cleanup_candidates.add(normalize_entity(entity))

    cleanup_occurrences = sum(
        len(entity_occurrences[norm])
        for norm in cleanup_candidates
        if norm in entity_occurrences
    )

    total_entities = sum(len(v) for v in entity_occurrences.values())
    # Guard against ZeroDivisionError when no files/entities were found
    # (e.g. run from the wrong working directory).
    cleanup_pct = 100 * cleanup_occurrences / total_entities if total_entities else 0.0

    report.append(f"\n| Metric | Value |")
    report.append(f"|--------|-------|")
    report.append(f"| Total entity occurrences | {total_entities:,} |")
    report.append(f"| Candidate cleanup occurrences | {cleanup_occurrences:,} |")
    report.append(f"| Cleanup percentage | {cleanup_pct:.1f}% |")
    report.append(f"| Remaining after cleanup | {total_entities - cleanup_occurrences:,} |")

    # High-value entities for linking
    report.append(f"\n## High-Value Entities for Linking")
    report.append(f"\nEntities that appear frequently and are good candidates for Wikidata/VIAF linking:")

    # Filter to heritage-relevant types
    heritage_types = {'GRP.HER', 'GRP.ASS', 'GRP.GOV', 'GRP.COR', 'GRP.EDU',
                      'AGT.PER', 'TOP.SET', 'TOP.REG', 'TOP.CTY', 'TOP.BLD'}

    linking_candidates = []
    for norm_name, occurrences in entity_occurrences.items():
        # Skip cleanup candidates
        if norm_name in cleanup_candidates:
            continue

        # Skip if too short
        if len(norm_name) < 3:
            continue

        # Check if has heritage-relevant types
        types = set(o['type'] for o in occurrences)
        if types & heritage_types:
            # Most common type wins when occurrences disagree.
            primary_type = Counter(o['type'] for o in occurrences).most_common(1)[0][0]
            linking_candidates.append({
                'name': occurrences[0]['original'],
                'norm': norm_name,
                'type': primary_type,
                'count': len(occurrences)
            })

    linking_candidates.sort(key=lambda x: -x['count'])

    report.append(f"\n| Entity | Type | Occurrences | Wikidata? |")
    report.append(f"|--------|------|-------------|-----------|")
    for cand in linking_candidates[:75]:
        report.append(f"| {cand['name'][:40]} | `{cand['type']}` | {cand['count']} | 🔍 Search |")

    # Write report; create the output directory first so a fresh checkout
    # does not crash with FileNotFoundError.
    os.makedirs('reports', exist_ok=True)
    report_path = 'reports/entity_duplicate_analysis.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nReport written to: {report_path}")

    # Also save linking candidates as JSON for automated processing
    json_path = 'reports/entity_linking_candidates.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(linking_candidates[:200], f, indent=2, ensure_ascii=False)

    print(f"Linking candidates JSON: {json_path}")

    print(f"\nSummary:")
    print(f" - {len(issues['language_codes'])} language code entities to filter")
    print(f" - {len(issues['generic_labels'])} generic labels to filter")
    print(f" - {len(issues['numeric_only'])} numeric-only entities to review")
    print(f" - {len(issues['type_mismatches'])} type mismatches to resolve")
    print(f" - {len(linking_candidates)} high-value linking candidates identified")
    print(f" - Potential cleanup: {cleanup_occurrences:,} occurrences ({cleanup_pct:.1f}%)")
|
|
|
|
# Run the analysis only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|