# glam/data/entity_annotation/modules/processing/exclusions.yaml
# 2025-12-05 15:30:23 +01:00
#
# 127 lines
# 3 KiB
# YAML

# =============================================================================
# Universal Exclusions Module
# =============================================================================
# Version: 1.7.0
# Module: processing/exclusions.yaml
#
# These exclusion rules apply to ALL entity types across all text sources.
# =============================================================================
# Module identity.
id: universal_exclusions
name: "Universal Exclusion Rules"
description: |
  Exclusion rules that apply to ALL entity types across all text sources.
  These patterns should NEVER be tagged as named entities.

# Exclusion rules, keyed by category. Every rule carries the same four fields:
#   id          - rule identifier (UNI_EXC###)
#   description - what kind of text the rule covers
#   patterns    - literal strings that must not be tagged as entities
#   note        - rationale for the exclusion
# NOTE(review): how patterns are matched (case sensitivity, whole-span vs.
# substring) is decided by the consuming code, not by this file - confirm there.
exclusion_rules:
  navigation:
    id: "UNI_EXC001"
    description: "Navigation elements and menu items"
    patterns:
      - "Home"
      - "About Us"
      - "Contact"
      - "Menu"
      - "Search"
      - "Login"
      - "Sign Up"
      - "Back to top"
      - "Next"
      - "Previous"
      - "Read more"
      - "Learn more"
      - "Click here"
      - "See all"
      - "View more"
      - "Show less"
    note: "These are UI elements, not named entities"

  calls_to_action:
    id: "UNI_EXC002"
    description: "Marketing and call-to-action phrases"
    patterns:
      - "Buy now"
      - "Subscribe"
      - "Book tickets"
      - "Plan your visit"
      - "Become a member"
      - "Donate"
      - "Shop"
      - "Get tickets"
      - "Reserve"
      - "Join us"
    note: "Action prompts, not entity references"

  social_boilerplate:
    id: "UNI_EXC003"
    description: "Generic social media text"
    patterns:
      - "Follow us"
      - "Share"
      - "Like"
      - "Tweet"
      - "Pin it"
      - "Share on Facebook"
      - "Follow on Instagram"
    note: "Tag actual handles (WRK.SOC), not these generic phrases"

  technical_artifacts:
    id: "UNI_EXC004"
    description: "Content management system artifacts"
    patterns:
      - "Posted by"
      - "Last updated"
      - "Tags:"
      - "Categories:"
      - "Comments"
      - "Leave a reply"
      - "Related posts"
      - "Powered by"
      - "Cookie settings"
      - "Privacy policy"
      - "Terms of use"
      - "All rights reserved"
    note: "Technical/legal boilerplate, not content entities"

  generic_words:
    id: "UNI_EXC005"
    description: "Common words that should not be tagged"
    patterns:
      - "Welcome"
      - "Information"
      - "Details"
      - "Overview"
      - "Introduction"
      - "Summary"
      - "Description"
      - "Features"
      - "Highlights"
      - "News"
      - "Events"
      - "Updates"
    note: "Section headers and generic labels, not named entities"

  pronouns:
    id: "UNI_EXC006"
    description: "Pronouns and demonstrative words"
    # Keep these quoted: short bare words are the usual victims of YAML
    # implicit typing, and quoting guarantees they always load as strings.
    patterns:
      - "he"
      - "she"
      - "it"
      - "they"
      - "we"
      - "you"
      - "I"
      - "him"
      - "her"
      - "them"
      - "us"
      - "this"
      - "that"
      - "these"
      - "those"
      - "here"
      - "there"
      - "where"
    note: "Pronouns require coreference resolution, not NER"