#!/usr/bin/env python3
"""Analyze duplicate entities across custodian files for deduplication."""
import os
import re
import glob
from collections import Counter, defaultdict
from datetime import datetime
import json


def extract_entities_fast(filepath):
    """Extract entity claims from a custodian YAML file using regex.

    Deliberately avoids a full YAML parse for speed: only the
    ``validated_entity_claims.claims`` section is scanned with regexes.

    Args:
        filepath: Path to a custodian YAML file.

    Returns:
        List of ``{'name': str, 'type': str}`` dicts; an empty list when
        the file is unreadable, mis-encoded, or has no validated claims.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError):
        # Skip unreadable or mis-encoded files instead of aborting the scan.
        # (Was a bare `except:`, which also hid real bugs.)
        return []

    if 'validated_entity_claims:' not in content:
        return []

    # Capture everything under "claims:" up to the next top-level key
    # (a newline followed by an unindented lowercase key) or end of file.
    claims_match = re.search(
        r'validated_entity_claims:.*?claims:(.*?)(?=\n[a-z_]+:|$)',
        content,
        re.DOTALL,
    )
    if not claims_match:
        return []

    claims_section = claims_match.group(1)
    # Each claim is "- entity: <name>" followed by an indented "entity_type:".
    entity_blocks = re.findall(
        r'- entity: (.+?)\n\s+entity_type: (\S+)', claims_section)
    return [
        {'name': entity_name.strip(), 'type': entity_type.strip()}
        for entity_name, entity_type in entity_blocks
    ]


def normalize_entity(name):
    """Normalize an entity name for duplicate comparison.

    Lowercases, strips surrounding quotes, and collapses runs of
    whitespace to single spaces.
    """
    # Lowercase
    norm = name.lower().strip()
    # Remove quotes
    norm = norm.strip("'\"")
    # Normalize whitespace
    norm = ' '.join(norm.split())
    return norm


def main():
    """Scan custodian files, classify entity-quality issues, and write a
    markdown report plus a JSON list of entity-linking candidates."""
    print("Analyzing entity duplicates...")
    files = glob.glob('data/custodian/NL-*.yaml')

    # Track entities: normalized_name -> [{'original', 'type', 'file'}, ...]
    entity_occurrences = defaultdict(list)

    for i, filepath in enumerate(files):
        if i % 200 == 0:
            print(f"  Processing {i}/{len(files)}...")
        basename = os.path.basename(filepath)
        entities = extract_entities_fast(filepath)
        for ent in entities:
            norm_name = normalize_entity(ent['name'])
            entity_occurrences[norm_name].append({
                'original': ent['name'],
                'type': ent['type'],
                'file': basename
            })

    print(f"\nTotal unique normalized entities: {len(entity_occurrences):,}")

    # Identify quality issues, bucketed by category.
    issues = {
        'language_codes': [],     # nl-NL, nl_NL, etc.
        'generic_labels': [],     # Home, Menu, etc.
        'numeric_only': [],       # '2025', '1200', etc.
        'single_char': [],        # Very short names (1-2 characters)
        'type_mismatches': [],    # Same entity with different types
        'variant_spellings': [],  # Same entity with slight variations
                                  # NOTE(review): never populated yet —
                                  # reserved for a future fuzzy-match pass.
    }

    # HTML `lang` attribute shapes that leak in as "entities".
    lang_patterns = [
        r'^[a-z]{2}[-_][A-Z]{2}$',  # nl-NL, en-US
        r'^[a-z]{2}$',              # nl, en
        r'^[a-z]{2}_[a-z]{2}$',     # nl_nl
    ]

    # Generic navigation/UI labels with no heritage value.
    generic_labels = {'home', 'menu', 'contact', 'over', 'about', 'search',
                      'zoeken', 'nieuws', 'news', 'agenda', 'events', 'login',
                      'logout', 'inloggen', 'uitloggen', 'cookie', 'cookies',
                      'privacy', 'collectie', 'collection', 'archief',
                      'archive'}

    for norm_name, occurrences in entity_occurrences.items():
        orig = occurrences[0]['original']

        # Check for language codes
        for pat in lang_patterns:
            if re.match(pat, orig, re.IGNORECASE):
                issues['language_codes'].append((orig, len(occurrences)))
                break

        # Check for generic labels
        if norm_name in generic_labels:
            issues['generic_labels'].append((orig, len(occurrences)))

        # Check for numeric-only (optionally wrapped in single quotes)
        if re.match(r"^'?\d+'?$", orig):
            issues['numeric_only'].append((orig, len(occurrences)))

        # Check for very short names
        if len(norm_name) <= 2:
            issues['single_char'].append((orig, len(occurrences)))

        # Check for type mismatches
        types = set(o['type'] for o in occurrences)
        if len(types) > 1:
            issues['type_mismatches'].append((orig, types, len(occurrences)))

    # --- Generate report ---------------------------------------------------
    report = []
    report.append("# Entity Duplicate Analysis Report")
    report.append(f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"\n## Data Quality Issues Summary")
    report.append(f"\n| Issue Type | Count | Total Occurrences |")
    report.append(f"|------------|-------|-------------------|")

    total_issues = 0
    total_occurrences = 0
    for issue_type, items in issues.items():
        count = len(items)
        # type_mismatches tuples are (name, types, count); others (name, count)
        if issue_type == 'type_mismatches':
            occ = sum(c for _, _, c in items)
        else:
            occ = sum(c for _, c in items)
        total_issues += count
        total_occurrences += occ
        report.append(f"| {issue_type.replace('_', ' ').title()} | {count} | {occ} |")
    report.append(f"| **TOTAL** | **{total_issues}** | **{total_occurrences}** |")

    report.append(f"\n## Language Code Entities (Should Be Filtered)")
    report.append(f"\nThese are HTML `lang` attribute values, not real entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['language_codes'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Generic Navigation Labels (Low Value)")
    report.append(f"\nThese are website navigation labels, not heritage entities:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['generic_labels'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Filter out |")

    report.append(f"\n## Numeric-Only Entities (Often Dimensions)")
    report.append(f"\nThese are often image dimensions or years without context:")
    report.append(f"\n| Entity | Occurrences | Action |")
    report.append(f"|--------|-------------|--------|")
    for entity, count in sorted(issues['numeric_only'], key=lambda x: -x[1])[:20]:
        report.append(f"| `{entity}` | {count} | Review context |")

    report.append(f"\n## Type Mismatches (Need Resolution)")
    report.append(f"\nSame entity classified with different types:")
    report.append(f"\n| Entity | Types | Occurrences | Action |")
    report.append(f"|--------|-------|-------------|--------|")
    for entity, types, count in sorted(issues['type_mismatches'], key=lambda x: -x[2])[:50]:
        types_str = ', '.join(sorted(types))
        report.append(f"| {entity[:40]} | `{types_str}` | {count} | Resolve type |")

    # --- Calculate cleanup impact ------------------------------------------
    report.append(f"\n## Cleanup Impact Analysis")

    cleanup_candidates = set()
    for entity, _ in issues['language_codes']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['generic_labels']:
        cleanup_candidates.add(normalize_entity(entity))
    for entity, _ in issues['numeric_only']:
        cleanup_candidates.add(normalize_entity(entity))

    cleanup_occurrences = sum(
        len(entity_occurrences[norm])
        for norm in cleanup_candidates
        if norm in entity_occurrences
    )
    total_entities = sum(len(v) for v in entity_occurrences.values())
    # Guard against an empty data directory (total_entities == 0), which
    # previously raised ZeroDivisionError here and in the summary below.
    cleanup_pct = (100 * cleanup_occurrences / total_entities) if total_entities else 0.0

    report.append(f"\n| Metric | Value |")
    report.append(f"|--------|-------|")
    report.append(f"| Total entity occurrences | {total_entities:,} |")
    report.append(f"| Candidate cleanup occurrences | {cleanup_occurrences:,} |")
    report.append(f"| Cleanup percentage | {cleanup_pct:.1f}% |")
    report.append(f"| Remaining after cleanup | {total_entities - cleanup_occurrences:,} |")

    # --- High-value entities for linking -----------------------------------
    report.append(f"\n## High-Value Entities for Linking")
    report.append(f"\nEntities that appear frequently and are good candidates for Wikidata/VIAF linking:")

    # Filter to heritage-relevant types
    heritage_types = {'GRP.HER', 'GRP.ASS', 'GRP.GOV', 'GRP.COR', 'GRP.EDU',
                      'AGT.PER', 'TOP.SET', 'TOP.REG', 'TOP.CTY', 'TOP.BLD'}

    linking_candidates = []
    for norm_name, occurrences in entity_occurrences.items():
        # Skip cleanup candidates
        if norm_name in cleanup_candidates:
            continue
        # Skip if too short
        if len(norm_name) < 3:
            continue
        # Check if has heritage-relevant types
        types = set(o['type'] for o in occurrences)
        if types & heritage_types:
            # Most frequent type wins as the "primary" type.
            primary_type = Counter(o['type'] for o in occurrences).most_common(1)[0][0]
            linking_candidates.append({
                'name': occurrences[0]['original'],
                'norm': norm_name,
                'type': primary_type,
                'count': len(occurrences)
            })

    linking_candidates.sort(key=lambda x: -x['count'])

    report.append(f"\n| Entity | Type | Occurrences | Wikidata? |")
    report.append(f"|--------|------|-------------|-----------|")
    for cand in linking_candidates[:75]:
        report.append(f"| {cand['name'][:40]} | `{cand['type']}` | {cand['count']} | 🔍 Search |")

    # --- Write outputs -----------------------------------------------------
    # Ensure the output directory exists; open() does not create parents.
    os.makedirs('reports', exist_ok=True)

    report_path = 'reports/entity_duplicate_analysis.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))
    print(f"\nReport written to: {report_path}")

    # Also save linking candidates as JSON for automated processing
    json_path = 'reports/entity_linking_candidates.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(linking_candidates[:200], f, indent=2, ensure_ascii=False)
    print(f"Linking candidates JSON: {json_path}")

    print(f"\nSummary:")
    print(f"  - {len(issues['language_codes'])} language code entities to filter")
    print(f"  - {len(issues['generic_labels'])} generic labels to filter")
    print(f"  - {len(issues['numeric_only'])} numeric-only entities to review")
    print(f"  - {len(issues['type_mismatches'])} type mismatches to resolve")
    print(f"  - {len(linking_candidates)} high-value linking candidates identified")
    print(f"  - Potential cleanup: {cleanup_occurrences:,} occurrences ({cleanup_pct:.1f}%)")


if __name__ == '__main__':
    main()