# glam/data/entity_annotation/modules/processing/dutch_web_patterns.yaml
# 2025-12-14 17:09:55 +01:00
#
# 2095 lines
# 73 KiB
# YAML

# Dutch Web Patterns for Entity Annotation
# =============================================================================
# Converted from cleanup_contact_false_positives_v2.py
#
# Purpose: Define patterns for extracting typed entities from Dutch heritage
# website content with relationship predicates to the custodian being processed.
#
# Each pattern can have:
# - entity_type: CH-Annotator hypernym code (GRP.*, TOP.*, WRK.*, ROL.*, AGT.*, null)
# - capture_groups: Named groups that capture sub-entities
# - relationships: Predicates connecting extracted entity to custodian or other entities
# - discard_reason: For patterns that identify non-entities (UI elements, etc.)
#
# Version: 1.1.0
# Date: 2025-12-13
# Source: scripts/cleanup_contact_false_positives_v2.py lines 28-1166
# =============================================================================
# Identification and provenance of this pattern set.
metadata:
  id: dutch_web_patterns_v1
  name: Dutch Web Content Entity Patterns
  version: "1.1.0"
  language: nl
  # Folded block scalar (>-): the lines are joined with spaces and the
  # trailing newline is stripped.
  description: >-
    Patterns for extracting and classifying entities from Dutch heritage institution
    websites. Patterns are derived from false positive cleanup analysis of 168 custodian
    web archives. Version 1.1.0 adds layout_hints based on analysis of 1,525 annotated
    web archives showing XPath → entity type correlations.
  source_script: scripts/cleanup_contact_false_positives_v2.py
  ch_annotator_version: "1.7.0"
  pattern_count: 646
# Layout hints: XPath locations that are predictive for an entity type.
# Derived from an analysis of 1,525 web archives.
# NOTE(review): source indentation was lost; the three *_locations keys are
# assumed to be children of layout_hints — confirm.
layout_hints:
  description: >-
    XPath location hints derived from analyzing 15,252 entity claims across 1,343
    unique websites. Patterns found at expected locations receive confidence boost.

  # High-confidence XPath → entity type mappings (>80% correlation).
  # Keyed by entity type code; each entry lists the XPaths and the boost value.
  high_confidence_locations:
    GRP.HER:
      description: "Heritage institutions (museums, archives, libraries)"
      primary_xpaths:
        - "head/title"  # 41.8% of GRP.HER found here
        - "body/*/h1"  # Primary heading
        - "head/meta[@name='description']/@content"
      confidence_boost: 0.2
    GRP.ASS:
      description: "Associations and societies"
      primary_xpaths:
        - "head/title"  # 39.3% of GRP.ASS found here
        - "body/*/header/h1"
      confidence_boost: 0.15
    GRP.GOV:
      description: "Government bodies"
      primary_xpaths:
        - "head/title"
        - "body/*/h1"
        - "body/*/header"
      confidence_boost: 0.15
    TOP.ADR:
      description: "Addresses"
      primary_xpaths:
        - "body/footer/*"  # 23.8% of addresses in footer
        - "body/*/footer/*/p"
        - "body/*/p"  # Paragraphs
      confidence_boost: 0.2
    TMP.OPH:
      description: "Opening hours"
      primary_xpaths:
        - "body/*/footer"
        - "body/*/table"  # Often in tables
      confidence_boost: 0.15
    AGT.PER:
      description: "Person names"
      primary_xpaths:
        - "body/*/p"  # 36.4% in paragraphs
        - "body/*/ul/li"  # Staff lists
      confidence_boost: 0.1

  # Locations to deprioritize (often noise).
  low_confidence_locations:
    - "body/*/nav"  # Navigation (menu items, not entities)
    - "body/*/script"  # JavaScript
    - "body/*/style"  # CSS

  # Discard locations (content found here is always ignored).
  discard_locations:
    - "head/script"  # JS in head
    - "body/*/noscript"  # Noscript fallbacks
# =============================================================================
# ENTITY PATTERNS - Patterns that identify extractable entities with types
# =============================================================================
entity_patterns:
# ---------------------------------------------------------------------------
# ORGANIZATION PATTERNS (GRP.*)
# Heritage organizations, associations, societies, government bodies
# ---------------------------------------------------------------------------
organizations:
description: "Patterns identifying heritage organizations and related bodies"
# Local history and heritage societies.
# Keys under capture_groups are the regex group numbers of the entry's pattern.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
heritage_associations:
  description: "Historical/heritage associations and societies"
  patterns:
    - pattern: '^historische\s+vereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER  # Heritage association
      label_template: "Historische Vereniging {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
          description: "Settlement/place name"
      relationships:
        - predicate: schema:location
          subject: $0
          object: $1
          confidence: 0.9
      examples:
        - text: "Historische Vereniging Aalten"
          entity: "Historische Vereniging Aalten"
          captures:
            1: "Aalten"

    - pattern: '^heemkundige\s+kring\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      label_template: "Heemkundige Kring {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
      relationships:
        - predicate: schema:location
          subject: $0
          object: $1
      examples:
        - text: "Heemkundige Kring Halle"
          entity: "Heemkundige Kring Halle"

    - pattern: '^heemkunde\s*kring\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^heemkunde\s+werkgroep\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^historische\s+werkgroep\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^oudheidkundige?\s+(kring|vereniging)\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: null
          role: organization_type
        2:
          type: TOP.SET
          role: location_name

    - pattern: '^heemkundevereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    # Group 2 (the optional "historische " part) is deliberately left untyped.
    - pattern: '^(\w+)se?\s+(historische\s+)?(vereniging|kring|werkgroep|stichting|genootschap)$'
      entity_type: GRP.ASS
      description: "Geographic modifier + organization type"
      capture_groups:
        1:
          type: TOP.SET
          role: location_adjective
        3:
          type: null
          role: organization_type
      examples:
        - text: "Nijmeegse Historische Vereniging"
        - text: "Leidse Kring"

    # Literal name; the location is given explicitly instead of captured.
    - pattern: '^puttens\s+historisch\s+genootschap$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      relationships:
        - predicate: schema:location
          subject: $0
          object: "Putten"
          object_type: TOP.SET
# Dutch municipal governments and closely related entities.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
municipalities:
  description: "Dutch municipal governments"
  patterns:
    # The negative lookahead keeps the phrase "gemeente archieven" out of this
    # generic rule: it has its own GRP.HER.ARC entry at the end of this list
    # and would otherwise also be typed as a municipality named "archieven".
    - pattern: '^gemeente\s+(?!archieven$)([\w-]+)$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.MUN  # Municipality
      label_template: "Gemeente {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name
      relationships:
        - predicate: org:subOrganizationOf
          subject: $0
          object: "https://www.wikidata.org/entity/Q29999"  # Kingdom of Netherlands
          object_type: GRP.GOV
          confidence: 1.0
      examples:
        - text: "Gemeente Borger-Odoorn"
          entity: "Gemeente Borger-Odoorn"
          captures:
            1: "Borger-Odoorn"

    # Town hall: a building whose owning municipality is derived from group 1.
    - pattern: '^gemeentehuis\s+([\w-]+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GOV  # Government building
      label_template: "Gemeentehuis {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name
      relationships:
        - predicate: org:hasSite
          subject: "Gemeente {1}"
          subject_type: GRP.GOV
          object: $0

    # Municipal archives: a heritage unit of the custodian, not a municipality.
    - pattern: '^gemeente\s+archieven$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      relationships:
        - predicate: org:subOrganizationOf
          subject: $0
          object: CUSTODIAN
# Museums, archives and libraries.
# Keys under capture_groups are the regex group numbers of the entry's pattern.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
heritage_institutions:
  description: "Museums, archives, libraries"
  patterns:
    # Article + optional name prefix + institution keyword.
    # Group 2 uses \w* (not \w+) so that a bare keyword such as
    # "De Bibliotheek" matches; group 2 is then the empty string.
    - pattern: '^(het|de)\s+(\w*)\s*(museum|archief|bibliotheek)$'
      entity_type: GRP.HER
      capture_groups:
        2:
          type: APP.NAM
          role: institution_name
        3:
          type: null
          role: institution_type_keyword
      relationships:
        - predicate: org:linkedTo
          subject: $0
          object: CUSTODIAN
          confidence: 0.7
      examples:
        - text: "Het Rijksmuseum"
        - text: "De Bibliotheek"

    # Two separate name words followed by a separate word "museum".
    - pattern: '^(\w+)\s+(\w+)\s+museum$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.MUS
      description: "Two-word museum names"
      capture_groups:
        1:
          type: APP.NAM
          role: name_part_1
        2:
          type: APP.NAM
          role: name_part_2
      examples:
        - text: "Pieter Vermeulen Museum"

    # Scope modifier + compound ending in museum/archief.
    # "Nederlands Graanmuseum" is listed here because this is the pattern
    # that matches it; the two-word pattern above requires a separate
    # "museum" word and does not.
    - pattern: '^(nationaal|nederlands|nederlandse|oudheidkundig|virtueel)\s+(\w*)(museum|archief)$'
      entity_type: GRP.HER
      capture_groups:
        1:
          type: null
          role: scope_modifier
        2:
          type: APP.NAM
          role: subject_area
        3:
          type: null
          role: institution_type_keyword
      examples:
        - text: "Nationaal Glasmuseum"
        - text: "Nederlands Openluchtmuseum"
        - text: "Nederlands Graanmuseum"

    - pattern: '^regionaal\s+archief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.REG
          role: region_name

    - pattern: '^stadsarchief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.SET
          role: city_name

    - pattern: '^gemeentearchief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name

    # (.+) lets the name after "streekmuseum" span several words.
    - pattern: '^streekmuseum\s+(.+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.MUS
      label_template: "Streekmuseum {1}"
      capture_groups:
        1:
          type: APP.NAM
          role: eponymous_name
      relationships:
        - predicate: org:linkedTo
          subject: $0
          object: CUSTODIAN
          confidence: 0.8
      examples:
        - text: "Streekmuseum Jan Anderson"

    - pattern: '^streekhistorisch\s+centrum\s+(\w+)$'
      entity_type: GRP.HER
      capture_groups:
        1:
          type: TOP.SET
          role: region_name
# Heritage organizations that operate at province level.
provincial_heritage:
  description: "Provincial heritage organizations"
  patterns:
    # "Erfgoed <province>": the province name is captured as group 1.
    - pattern: '^erfgoed\s+(brabant|gelderland|zeeland|limburg|utrecht|friesland|drenthe|overijssel|flevoland|groningen)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.OFF  # Official heritage organization
      capture_groups:
        1:
          type: TOP.REG
          role: province_name
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: $1
          object_type: TOP.REG

    # Literal names; the served region is given explicitly.
    - pattern: '^gelderse\s+kerken$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.REL  # Religious association
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: "Gelderland"
          object_type: TOP.REG

    - pattern: '^groninger\s+waddenmusea$'
      entity_type: GRP.ASS
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: "Groningen"
          object_type: TOP.REG

    - pattern: '^flevolands\s+geheugen$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.DIG  # Digital heritage platform

    - pattern: '^fryske\s+akademy$'
      entity_type: GRP.RES
      entity_subtype: GRP.RES.ACA  # Academic research
# Governance bodies and generic self-references ("de stichting").
# All relationships here point at the CUSTODIAN placeholder.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
organizational_units:
  description: "Internal organizational units and governance bodies"
  patterns:
    - pattern: '^raad\s+van\s+toezicht$'
      entity_type: GRP.UNT
      entity_subtype: GRP.UNT.GOV  # Governance unit
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN
          confidence: 0.95
      examples:
        - text: "Raad van Toezicht"

    - pattern: '^het\s+bestuur$'
      entity_type: GRP.UNT
      entity_subtype: GRP.UNT.GOV
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN

    # Related to the custodian via owl:sameAs rather than org:unitOf.
    - pattern: '^de\s+stichting$'
      entity_type: GRP.ORG
      entity_subtype: GRP.ORG.FND  # Foundation
      relationships:
        - predicate: owl:sameAs
          subject: $0
          object: CUSTODIAN
          confidence: 0.8

    - pattern: '^de\s+vereniging$'
      entity_type: GRP.ASS
      relationships:
        - predicate: owl:sameAs
          subject: $0
          object: CUSTODIAN
          confidence: 0.8

    - pattern: '^management\s+team$'
      entity_type: GRP.UNT
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN
# Fixed names of Dutch government bodies and offices.
# Every pattern is a literal phrase; none has capture groups.
government_bodies:
  description: "Government bodies and positions"
  patterns:
    - pattern: '^gedeputeerde\s+staten$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.PRO  # Provincial government

    - pattern: '^provinciale\s+staten$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.PRO

    - pattern: '^burgemeester\s+en\s+wethouders$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.MUN

    # Typed as a position (ROL.POS), not as a group.
    - pattern: '^commissaris\s+van\s+de\s+koning$'
      entity_type: ROL.POS
      entity_subtype: ROL.POS.GOV  # Government position

    - pattern: '^raad\s+van\s+state$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.NAT  # National government

    - pattern: '^nationale\s+ombudsman$'
      entity_type: ROL.POS
      entity_subtype: ROL.POS.GOV
# Commercial entities (trade associations, builders, bank branches).
businesses:
  description: "Commercial entities"
  patterns:
    # Group 1 is the text before the "-er" suffix.
    - pattern: '^(\w+)er\s+handelsvereniging$'
      entity_type: GRP.COR
      entity_subtype: GRP.COR.ASS  # Trade association
      capture_groups:
        1:
          type: TOP.SET
          role: place_adjective
      examples:
        - text: "Meppeler Handelsvereniging"

    - pattern: '^bouwbedrijf\s+(\w+)$'
      entity_type: GRP.COR
      capture_groups:
        1:
          type: APP.NAM
          role: company_name

    - pattern: '^rabobank\s+(\w+)$'
      entity_type: GRP.COR
      entity_subtype: GRP.COR.BNK  # Bank
      capture_groups:
        1:
          type: TOP.SET
          role: branch_location
# Cultural and arts organizations, including traditional shooting guilds.
cultural_organizations:
  description: "Cultural and arts organizations"
  patterns:
    - pattern: '^dansstudio\s+(\w+)$'
      entity_type: GRP.CUL
      capture_groups:
        1:
          type: APP.NAM
          role: studio_name

    - pattern: '^toneelvereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.CUL  # Cultural association
      capture_groups:
        1:
          type: APP.NAM
          role: association_name

    # Group 1 is the optional article "de "; only group 2 is typed.
    - pattern: '^schutterij\s+(de\s+)?(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD  # Traditional association
      capture_groups:
        2:
          type: APP.NAM
          role: guild_name

    # [\w-]+ admits hyphenated names.
    - pattern: '^schuttersgilde\s+([\w-]+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD
      capture_groups:
        1:
          type: APP.NAM
          role: guild_name

    - pattern: '^schuttersvereniging\s+([\w-]+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD
      capture_groups:
        1:
          type: APP.NAM
          role: association_name
# ---------------------------------------------------------------------------
# BUILDING/PLACE PATTERNS (TOP.*)
# Physical structures, estates, monuments, religious buildings
# ---------------------------------------------------------------------------
buildings_places:
description: "Patterns identifying physical locations and structures"
# Castles, estates, manor houses, farms and inns.
# Keys under capture_groups are the regex group numbers of the entry's pattern.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
castles_estates:
  description: "Castles, estates, and manor houses"
  patterns:
    # Single-word castle names only: (\w+) cannot span a space, so
    # multi-word names need their own entry (see the next pattern).
    - pattern: '^kasteel\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CAS  # Castle
      label_template: "Kasteel {1}"
      capture_groups:
        1:
          type: APP.NAM
          role: castle_name
      relationships:
        - predicate: org:hasSite
          subject: CUSTODIAN
          object: $0
          confidence: 0.7

    # Dedicated entry for the two-word name. The example is listed here
    # because it matches this pattern and not the single-word one above.
    - pattern: '^kasteel\s+oud\s+haarlem$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CAS
      examples:
        - text: "Kasteel Oud Haarlem"

    - pattern: '^landgoed\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST  # Estate
      capture_groups:
        1:
          type: APP.NAM
          role: estate_name
      relationships:
        - predicate: org:hasSite
          subject: CUSTODIAN
          object: $0
          confidence: 0.6

    # Group 1 is the "borg"/"de" prefix; only group 2 is typed.
    - pattern: '^landgoed\s+(borg|de)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST
      capture_groups:
        2:
          type: APP.NAM
          role: estate_name

    - pattern: '^huize\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.MAN  # Manor house
      capture_groups:
        1:
          type: APP.NAM
          role: house_name

    - pattern: '^huis\s+(ten|van|de)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.MAN
      capture_groups:
        2:
          type: APP.NAM
          role: house_name
      examples:
        - text: "Huis ten Bosch"
        - text: "Huis van Oud"

    - pattern: '^hoeve\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FRM  # Farm/farmhouse
      capture_groups:
        2:
          type: APP.NAM
          role: farm_name

    - pattern: '^herberg\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.INN  # Historic inn
      capture_groups:
        2:
          type: APP.NAM
          role: inn_name
# Forts, bunkers and other defensive or wartime sites.
fortifications:
  description: "Forts, bunkers, defensive structures"
  patterns:
    - pattern: '^fort\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FOR  # Fortification
      capture_groups:
        1:
          type: APP.NAM
          role: fort_name
      relationships:
        - predicate: crm:P53_has_former_or_current_location
          subject: $0
          object: CUSTODIAN_LOCATION

    - pattern: '^de\s+atlantikwall$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FOR

    # Literal name; the location is given explicitly.
    - pattern: '^kamp\s+vught$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.HIS  # Historic site
      relationships:
        - predicate: schema:location
          subject: $0
          object: "Vught"
          object_type: TOP.SET
# Churches, chapels and related religious buildings.
# Keys under capture_groups are the regex group numbers of the entry's pattern.
religious_buildings:
  description: "Churches, chapels, monasteries"
  patterns:
    - pattern: '^sint\s+(\w+)(kerk|gebouw)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL  # Religious building
      capture_groups:
        1:
          type: APP.NAM
          role: saint_name
        2:
          type: null
          role: building_type

    # The place name is optional. The whitespace in front of it sits inside
    # the optional non-capturing group, so a bare "protestantse kerk" (no
    # trailing space) matches too. Group numbers are unchanged, and strings
    # with trailing whitespace but no place name still match.
    - pattern: '^protestantse\s+(kerk|pastorie)(?:\s+(\w+)?)?$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: null
          role: building_type
        2:
          type: TOP.SET
          role: location_name

    - pattern: '^kapel\s+van\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: APP.NAM
          role: dedication

    # [\w-]+ admits hyphenated place names such as the example below.
    - pattern: '^mariakapel\s+([\w-]+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
      examples:
        - text: "Mariakapel Nieuw-Dijk"
# Monuments, memorials and archaeological features.
monuments:
  description: "Monuments, memorials, historic markers"
  patterns:
    # Prefix match: ends in \b, not $, so the phrase may continue.
    - pattern: '^monument(en)?\s+(in|didam|loil|nieuw-dijk|oud-dijk|buurtschap)\b'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.MON  # Monument

    - pattern: '^grafheuvel\s+(\w+)$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.ARC  # Archaeological feature
      capture_groups:
        1:
          type: APP.NAM
          role: feature_name

    - pattern: '^nationaal\s+monument$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.MON

    - pattern: '^kruisbeeld\s+op\s+(\w+)$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.REL  # Religious monument
      capture_groups:
        1:
          type: TOP.SET
          role: location
# Buildings used as cultural or community venues.
cultural_venues:
  description: "Theaters, community centers, museums"
  patterns:
    # Group 1 is the required article; only group 2 is typed.
    - pattern: '^theater\s+(de|het)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CUL  # Cultural building
      capture_groups:
        2:
          type: APP.NAM
          role: theater_name

    - pattern: '^buurthuis\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.COM  # Community building
      capture_groups:
        1:
          type: APP.NAM
          role: building_name

    # Here the article "de " is optional.
    - pattern: '^poppodium\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CUL
      capture_groups:
        2:
          type: APP.NAM
          role: venue_name
      examples:
        - text: "Poppodium de Peppel"

    - pattern: '^aula\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EDU  # Educational building
      capture_groups:
        1:
          type: APP.NAM
          role: building_name
# Industrial sites; each pattern is a keyword followed by one name word.
industrial_heritage:
  description: "Mills, factories, industrial sites"
  patterns:
    - pattern: '^kalkoven\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND  # Industrial building
      capture_groups:
        1:
          type: APP.NAM
          role: site_name

    - pattern: '^scheepswerf\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND
      capture_groups:
        1:
          type: APP.NAM
          role: shipyard_name

    - pattern: '^werkplaats\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND
      capture_groups:
        1:
          type: APP.NAM
          role: workshop_name
# Parks, gardens and arboreta.
parks_gardens:
  description: "Parks, gardens, nature reserves"
  patterns:
    # The garden name is optional. The whitespace in front of it sits inside
    # the optional non-capturing group, so a bare "botanische tuin" (no
    # trailing space) matches too. Group 1 keeps its number, and strings
    # with trailing whitespace but no name still match.
    - pattern: '^botanische\s+tuin(?:\s+(\w+)?)?$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GAR  # Garden
      capture_groups:
        1:
          type: APP.NAM
          role: garden_name

    - pattern: '^pinetum\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GAR
      capture_groups:
        1:
          type: APP.NAM
          role: arboretum_name

    - pattern: '^landschapspark\s+(\w+)$'
      entity_type: TOP.GEO
      entity_subtype: TOP.GEO.PRK  # Park
      capture_groups:
        1:
          type: APP.NAM
          role: park_name
# Individually named places.
places_named:
  description: "Named places and locations"
  patterns:
    - pattern: '^dekema\s+state$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST

    # "Klein <name>": the name after "klein" is captured as group 1.
    - pattern: '^klein\s+(amerika|rome|zundert)$'
      entity_type: TOP.SET
      entity_subtype: TOP.SET.HAM  # Hamlet/small settlement
      capture_groups:
        1:
          type: APP.NAM
          role: place_reference
# ---------------------------------------------------------------------------
# PUBLICATION/WORK PATTERNS (WRK.*)
# Publications, periodicals, books, reports
# ---------------------------------------------------------------------------
publications:
description: "Patterns identifying publications and works"
# Serial publications: yearbooks, association magazines, local papers.
# NOTE(review): source indentation was lost; `confidence` is assumed to belong
# to the relationship entry it follows — confirm.
periodicals:
  description: "Magazines, newsletters, journals"
  patterns:
    # Single-word yearbook titles only: (\w+) cannot span a space, so
    # multi-word titles need their own entry (see the next pattern).
    # Group 1 is the optional plural suffix "en"; group 2 is the title.
    - pattern: '^jaarboek(en)?\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER  # Serial publication
      label_template: "Jaarboek {2}"
      capture_groups:
        2:
          type: APP.NAM
          role: publication_name
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN
          confidence: 0.85

    # Dedicated entry for the two-word title. The example is listed here
    # because it matches this pattern and not the single-word one above.
    - pattern: '^jaarboek(en)?\s+aover\s+diem$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN
      examples:
        - text: "Jaarboeken Aover Diem"

    - pattern: '^verenigingsblad\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        1:
          type: APP.NAM
          role: publication_name
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN

    - pattern: '^verenigingsorgaan\s+(de\s+)?(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        2:
          type: APP.NAM
          role: publication_name

    # Literal titles of individual periodicals.
    - pattern: '^myerlese\s+koerier$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^nijmeegs\s+katern$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^old\s+ni-js(\s+edities)?$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^roggels\s+blaadje$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^suetan\s+kwartaalbladen$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^tusken\s+de\s+marren$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^verleden\s+tijdschrift$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^dedemsvaartse\s+courant$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
# Book series and publication collections.
book_series:
  description: "Book series and monographs"
  patterns:
    # Group 1 distinguishes "bronnen" from "reeks"; it carries no entity type.
    - pattern: '^reeuwijkse\s+(bronnen|reeks)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        1:
          type: null
          role: series_type

    - pattern: '^uitgaven\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.COL  # Collection
      capture_groups:
        1:
          type: APP.NAM
          role: publisher_name

    - pattern: '^publicaties\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.COL
      capture_groups:
        1:
          type: APP.NAM
          role: collection_name
# ---------------------------------------------------------------------------
# ROLE/OCCUPATION PATTERNS (ROL.*)
# Job titles, positions, functions
# ---------------------------------------------------------------------------
roles:
description: "Patterns identifying roles, positions, and occupations"
# Professional job titles.
job_titles:
  description: "Professional job titles"
  patterns:
    # Group 1 is the specialty after "senior"; it carries no entity type.
    - pattern: '^senior\s+(applicatiebeheerder|systeembeheerder)$'
      entity_type: ROL.OCC
      entity_subtype: ROL.OCC.TEC  # Technical occupation
      capture_groups:
        1:
          type: null
          role: job_specialty
      relationships:
        - predicate: org:role
          subject: CUSTODIAN
          object: $0
      examples:
        - text: "Senior Applicatiebeheerder"

    - pattern: '^financial\s+controller$'
      entity_type: ROL.OCC
      entity_subtype: ROL.OCC.ADM  # Administrative occupation
# ---------------------------------------------------------------------------
# PERSON PATTERNS (AGT.*)
# Historical figures, references to people
# ---------------------------------------------------------------------------
persons:
description: "Patterns identifying references to persons (not contact persons)"
# Well-known painters matched by literal name, each with a Wikidata id.
# NOTE(review): source indentation was lost; in the first entry `description`
# is assumed to belong to the relationship and `wikidata_id` to the pattern
# (as in the entries that have no relationship) — confirm.
historical_figures:
  description: "Famous historical figures mentioned in content"
  patterns:
    - pattern: '^vincent\s+van\s+gogh$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART  # Artist
      relationships:
        - predicate: schema:mentions
          subject: CUSTODIAN
          object: $0
          description: "Custodian mentions this historical figure"
      wikidata_id: Q5582

    # The surname part is optional: "rembrandt" alone also matches.
    - pattern: '^rembrandt(\s+van\s+rijn)?$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q5598

    # Full name and bare surname carry the same Wikidata id.
    - pattern: '^johannes\s+vermeer$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q41264

    - pattern: '^vermeer$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q41264
# =============================================================================
# DISCARD PATTERNS - Patterns that identify non-entities to be filtered out
# =============================================================================
discard_patterns:
description: >-
Patterns matching text that should NOT be extracted as entities.
These are UI elements, navigation text, form labels, etc.
# URLs, file names and other technical strings.
# Each entry pairs a regex with the reason the matched text is discarded.
urls_technical:
  description: "URLs and technical strings"
  patterns:
    # Prefix matches
    - pattern: '^https?://'
      discard_reason: "URL - not an entity name"
    - pattern: '^www\.'
      discard_reason: "URL fragment"

    # Suffix matches
    - pattern: '\.html$'
      discard_reason: "File extension"
    - pattern: '\.php$'
      discard_reason: "File extension"
    - pattern: '\.aspx?$'
      discard_reason: "File extension"

    # Unanchored path segments
    - pattern: '/photos/'
      discard_reason: "URL path segment"
    - pattern: '/places/'
      discard_reason: "URL path segment"

    # Identifier prefixes
    - pattern: '^ChIJ'
      discard_reason: "Google Place ID"
    - pattern: '^AWn5SU'
      discard_reason: "Google photo ID"

    - pattern: 'WordPress'
      discard_reason: "CMS name"
# Website navigation text (Dutch and English).
# Patterns ending in \b are prefix matches; patterns ending in $ must match
# the whole string.
navigation:
  description: "Website navigation elements"
  patterns:
    - pattern: '^menu\s+schakelen$'
      discard_reason: "Navigation toggle"
    - pattern: '^go\s+to\s+top$'
      discard_reason: "Navigation link"
    - pattern: '^page\s+load\s+link$'
      discard_reason: "Navigation element"
    - pattern: '^skip\s+to\b'
      discard_reason: "Accessibility navigation"
    - pattern: '^jump\s+to\b'
      discard_reason: "Navigation link"
    - pattern: '^ga\s+naar\b'
      discard_reason: "Dutch navigation"
    - pattern: '^terug\s+naar\b'
      discard_reason: "Dutch navigation (back to)"
    - pattern: '^naar\s+(de|het|inhoud|menu)\b'
      discard_reason: "Dutch navigation"
    - pattern: '^back\s+to\b'
      discard_reason: "Navigation link"
    - pattern: '^footer\s+navigatie$'
      discard_reason: "Footer navigation"
    # \s* also accepts the one-word spelling "hoofdnavigatie".
    - pattern: '^hoofd\s*navigatie$'
      discard_reason: "Main navigation"
    - pattern: '^volg\s+ons\b'
      discard_reason: "Social media CTA"
    - pattern: '^follow\s+(us|this)\b'
      discard_reason: "Social media CTA"
    - pattern: '^menu\s+overslaan$'
      discard_reason: "Skip menu"
    - pattern: '^scroll\s+naar\b'
      discard_reason: "Scroll instruction"
# Form placeholders, instructions and button captions.
form_buttons:
  description: "Form labels and button text"
  patterns:
    # Instructions (prefix matches)
    - pattern: '^typ\s+hier\b'
      discard_reason: "Form placeholder"
    - pattern: '^vul\s+in\b'
      discard_reason: "Form instruction"
    - pattern: '^selecteer\b'
      discard_reason: "Form instruction"
    - pattern: '^kies\s+'
      discard_reason: "Form instruction"

    # Button captions (mostly whole-string matches)
    - pattern: '^zoek(en)?(\s+in)?$'
      discard_reason: "Search button"
    - pattern: '^aanmeld(en|ing)$'
      discard_reason: "Registration button"
    - pattern: '^afmeld(en|ing)$'
      discard_reason: "Unsubscribe button"
    - pattern: '^reserv(eren|ering)$'
      discard_reason: "Reservation button"
    - pattern: '^verzend(en)?$'
      discard_reason: "Submit button"
    - pattern: '^accepteer\b'
      discard_reason: "Accept button"
    - pattern: '^afwijzen\b'
      discard_reason: "Reject button"
    - pattern: '^akkoord$'
      discard_reason: "OK button"
    - pattern: '^instellingen\s+opslaan$'
      discard_reason: "Save settings button"
# Generic page and section headings.
# Every pattern is anchored at both ends, so only the exact heading matches.
section_headers:
  description: "Generic section headers"
  patterns:
    - pattern: '^laatste\s+nieuws$'
      discard_reason: "Section header"
    - pattern: '^over\s+(ons|deze)$'
      discard_reason: "About section"
    - pattern: '^missie\s+en\s+visie$'
      discard_reason: "Mission/vision section"

    # Contact and visit information
    - pattern: '^contact$'
      discard_reason: "Contact section"
    - pattern: '^contactgegevens$'
      discard_reason: "Contact details section"
    - pattern: '^bereikbaarheid$'
      discard_reason: "Directions section"
    - pattern: '^openingstijden$'
      discard_reason: "Opening hours section"
    - pattern: '^parkeren$'
      discard_reason: "Parking section"
    - pattern: '^bezoekadres$'
      discard_reason: "Visitor address section"
    - pattern: '^postadres$'
      discard_reason: "Postal address section"

    # Legal and site-meta pages
    - pattern: '^privacybeleid$'
      discard_reason: "Privacy policy"
    - pattern: '^disclaimer$'
      discard_reason: "Disclaimer section"
    - pattern: '^colofon$'
      discard_reason: "Colophon section"
    - pattern: '^sitemap$'
      discard_reason: "Sitemap"
    - pattern: '^veelgestelde\s+vragen$'
      discard_reason: "FAQ section"
# Recurring link and teaser phrases.
website_content:
  description: "Common website content phrases"
  patterns:
    - pattern: '^lees\s+meer\b'
      discard_reason: "Read more link"
    - pattern: '^bekijk\s+(de|het|alle|meer)\b'
      discard_reason: "View more link"
    - pattern: '^download\s+(de|het)\b'
      discard_reason: "Download link"
    # Matches both "meer info" and "meer informatie".
    - pattern: '^meer\s+info(rmatie)?$'
      discard_reason: "More info link"
    - pattern: '^handige\s+(info|links)$'
      discard_reason: "Useful links section"
    - pattern: '^gratis\s+(toegang|qr)\b'
      discard_reason: "Free access notice"
# Single-word false positives.
# Unlike the other categories this one holds plain words under `values`,
# not regex entries under `patterns`.
single_words:
  description: "Single-word false positives"
  values:
    - admin
    - contact
    - home
    - menu
    - zoeken
    - search
    - login
    - inloggen
    - registreren
    - help
    - info
    - nieuws
    - agenda
    - kalender
    - archief
    - collectie
    - beeldbank
    - bronnen
    - links
    - partners
    - sponsors
    - doneren
    - lidmaatschap
    - privacy
    - disclaimer
    - sitemap
    - colofon
    - cookies
    - vacatures
# ---------------------------------------------------------------------------
# ADDITIONAL PHRASE PATTERNS - Action verbs, instructions, content phrases
# From PHRASE_PATTERNS lines 167-1144
# ---------------------------------------------------------------------------
# Call-to-action and instruction phrases (imperative verb + object).
# Patterns ending in \b are prefix matches; patterns ending in $ must match
# the whole string.
action_instructions:
  description: "Call-to-action and instruction phrases"
  patterns:
    - pattern: '^meld\s+(je|u|een)\b'
      discard_reason: "Registration CTA"
    - pattern: '^geef\s+(je|uw)\b'
      discard_reason: "Form instruction"
    # [t]? accepts both "word" and "wordt".
    - pattern: '^word[t]?\s+(lid|vriend|abonnee)$'
      discard_reason: "Membership CTA"
    - pattern: '^steun\s+(het|de|ons)\b'
      discard_reason: "Donation CTA"
    - pattern: '^huur\s+(een|het|de|eigendom)\b'
      discard_reason: "Rental CTA"
    - pattern: '^schrijf\s+(je|ons)$'
      discard_reason: "Subscription CTA"
    - pattern: '^stuur\s+(een|foto)$'
      discard_reason: "Submission CTA"
    - pattern: '^reserveer\s+een\b'
      discard_reason: "Reservation CTA"
    - pattern: '^plan\s+uw\b'
      discard_reason: "Planning CTA"
    - pattern: '^kom\s+(in|verder)$'
      discard_reason: "Invitation CTA"
    - pattern: '^klik\s+voor\b'
      discard_reason: "Click instruction"
    - pattern: '^ontdek\s+(de|jouw|ons)$'
      discard_reason: "Discovery CTA"
    - pattern: '^vind\s+ons$'
      discard_reason: "Find us CTA"
    - pattern: '^verstuur\s+bericht$'
      discard_reason: "Send message CTA"
    - pattern: '^profiel\s+wijzigen$'
      discard_reason: "Edit profile CTA"
    - pattern: '^scans\s+aanvragen$'
      discard_reason: "Request scans CTA"
    - pattern: '^vraag\s+(en|of|stellen|afvalpas)$'
      discard_reason: "Question/request CTA"
    - pattern: '^vragen\s+(en|staat|over)\b'
      discard_reason: "Questions section"
# Headings and calls to action about membership and contributions.
membership_sections:
  description: "Membership and subscription sections"
  patterns:
    - pattern: '^leden\s+(administratie|en\s+lidmaatschap)$'
      discard_reason: "Membership admin section"
    - pattern: '^lid\s+(worden|worden\s+inloggen)$'
      discard_reason: "Join membership CTA"
    # "lidmaatschap" followed by exactly one more word.
    - pattern: '^lidmaatschap\s+\w+$'
      discard_reason: "Membership section"
    - pattern: '^soort\s+lidmaatschap$'
      discard_reason: "Membership type section"
    - pattern: '^jaarlijkse\s+bijdrage$'
      discard_reason: "Annual contribution section"
# Water board and environmental management phrases.
water_management:
  description: "Water board and environmental management content"
  patterns:
    # Operational keyword followed by whitespace (prefix match).
    - pattern: '^(legger|peilbesluit|proefsluiting|vervanging|vernieuwen|onderhoud|metingen|bediening)\s+'
      discard_reason: "Water board operations"
    - pattern: '^waterschapsbelasting\b'
      discard_reason: "Water board tax"
    - pattern: '^ons\s+gebied\b'
      discard_reason: "Our area section"
    - pattern: '^handhavingsverzoek\b'
      discard_reason: "Enforcement request"
    - pattern: '^waterbeheer\s+en\b'
      discard_reason: "Water management section"
    - pattern: '^waterkwaliteit\s+\w+$'
      discard_reason: "Water quality section"
    - pattern: '^waterschap\s+\w+$'
      discard_reason: "Water board name"
    - pattern: '^waterschapsverordening\s+en\b'
      discard_reason: "Water board regulation"
    - pattern: '^waterpeil\s+en\b'
      discard_reason: "Water level section"

    # Whole-phrase section titles
    - pattern: '^natuur\s+en\s+waterkwaliteit$'
      discard_reason: "Nature and water quality"
    - pattern: '^recreatie\s+rondom\s+water$'
      discard_reason: "Recreation around water"
    - pattern: '^landbouw\s+en\s+water(kwaliteit)?$'
      discard_reason: "Agriculture and water"
    - pattern: '^klimaat\s+en\s+veiligheid$'
      discard_reason: "Climate and safety"
    - pattern: '^kaderrichtlijn\s+water$'
      discard_reason: "Water framework directive"
    - pattern: '^meten\s+van\s+de\s+waterkwaliteit$'
      discard_reason: "Water quality measurement"
    - pattern: '^voldoende\s+water$'
      discard_reason: "Sufficient water section"
    - pattern: '^natuurvriendelijke\s+oever$'
      discard_reason: "Natural riverbank"
    - pattern: '^oevers\s+\w+$'
      discard_reason: "Riverbanks section"
# Headings about history, archaeological finds and genealogy.
heritage_content:
  description: "Heritage and historical content phrases"
  patterns:
    # History
    - pattern: '^historie\s+(van\s+)?\w+$'
      discard_reason: "History section"
    - pattern: '^historisch(e)?\s+(coevorden|spektakel|avond|fietsroute|geografie|groenten|projecten|wandeling)\b'
      discard_reason: "Historical content section"
    - pattern: '^gevelstenen\s+in\b'
      discard_reason: "Facade stones section"

    # Finds and excavations
    - pattern: '^grafvondst\s+bij\b'
      discard_reason: "Grave find section"
    - pattern: '^erfgoedcollecties\s+van\b'
      discard_reason: "Heritage collections section"
    - pattern: '^vondsten\s+in\b'
      discard_reason: "Finds section"
    - pattern: '^lokale\s+vondsten$'
      discard_reason: "Local finds section"
    - pattern: '^tijdlijn\s+vondsten$'
      discard_reason: "Finds timeline"
    - pattern: '^opgraving\s+\w+$'
      discard_reason: "Excavation section"
    - pattern: '^militaire\s+historie$'
      discard_reason: "Military history section"

    # Genealogy
    - pattern: '^genealogische\s+begrippen$'
      discard_reason: "Genealogical terms"
    # [eë] accepts the correct Dutch spelling "notariële" as well as the
    # ASCII form "notariele".
    - pattern: '^notari[eë]le\s+archieven$'
      discard_reason: "Notarial archives section"
    - pattern: '^voorouders\s+op\b'
      discard_reason: "Ancestors section"
    - pattern: '^larense\s+voorouders$'
      discard_reason: "Laren ancestors section"
    - pattern: '^personenbestand\s+\w+$'
      discard_reason: "Person database section"
    - pattern: '^namenlijst\s+\w+$'
      discard_reason: "Name list section"
# tours_visits: discard rules for guided-tour and group-visit headings.
# Each rule pairs a regex with a discard_reason label (a match is treated as
# a non-entity, per the file header).
# "$" rules require a full-string match; "\b" rules match any string that
# begins with the phrase.
tours_visits:
description: "Tour and visit related content"
patterns:
# Matches "rondleiding" followed by exactly one more word.
- pattern: '^rondleiding\s+\w+$'
discard_reason: "Tour section"
- pattern: '^rondleidingen\s+en\b'
discard_reason: "Tours section"
- pattern: '^rondwandeling\s+door\b'
discard_reason: "Walking tour"
- pattern: '^virtuele\s+tour$'
discard_reason: "Virtual tour"
- pattern: '^groepen\s+aanmelden$'
discard_reason: "Group registration"
- pattern: '^programma\s+voor\s+groepen$'
discard_reason: "Group program"
- pattern: '^wensen\s+rondleiding$'
discard_reason: "Tour wishes"
# Single compound word; "\b" lets any continuation after it match as well.
- pattern: '^landgoedrondleiding\b'
discard_reason: "Estate tour"
# location_directions: discard rules for location, branch and travel-direction
# headings. Each rule pairs a regex with a discard_reason label (a match is
# treated as a non-entity, per the file header).
location_directions:
description: "Location and directions content"
patterns:
- pattern: '^locatie\s+\w+$'
discard_reason: "Location section"
- pattern: '^locaties\s+\w+$'
discard_reason: "Locations section"
# NOTE(review): "overige locaties" is also matched by the
# '^overige\s+(...|locaties|...)$' rule in overview_sections; which
# discard_reason is reported depends on evaluation order - confirm.
- pattern: '^overige\s+locaties$'
discard_reason: "Other locations"
- pattern: '^vestiging\s+\w+$'
discard_reason: "Branch location"
# Matches only the truncated two-word strings "route en" / "route per".
- pattern: '^route\s+(en|per)$'
discard_reason: "Route section"
- pattern: '^per\s+(auto|boot)$'
discard_reason: "By car/boat directions"
- pattern: '^met\s+het\s+ov$'
discard_reason: "Public transport"
- pattern: '^naar\s+(google\s+maps|bestuurspagina|boven\s+scrollen|veelgestelde\s+vragen)$'
discard_reason: "Navigation link"
# "vanuit" followed by exactly one word.
- pattern: '^vanuit\s+\w+$'
discard_reason: "From location"
# time_schedule: discard rules for opening-hours, closure and holiday phrases.
# Each rule pairs a regex with a discard_reason label (a match is treated as
# a non-entity, per the file header).
time_schedule:
description: "Time and schedule related content"
patterns:
- pattern: '^dag\s+tijden$'
discard_reason: "Day times"
# Any single token ending in "dag" (e.g. a weekday name) followed by
# "gesloten".
- pattern: '^\w+dag\s+gesloten$'
discard_reason: "Day closed"
- pattern: '^goede\s+vrijdag$'
discard_reason: "Good Friday"
- pattern: '^tweede\s+(paasdag|pinksterdag)$'
discard_reason: "Holiday name"
- pattern: '^vandaag\s+gesloten$'
discard_reason: "Closed today"
- pattern: '^morgen\s+gesloten$'
discard_reason: "Closed tomorrow"
- pattern: '^ook\s+(aanwezig|gesloten)$'
discard_reason: "Also present/closed"
# Prefix rule: anything starting with "gesloten op" is discarded.
- pattern: '^gesloten\s+op\b'
discard_reason: "Closed on"
# events_activities: discard rules for event, course, walking/cycling and
# venue-hire headings. Each rule pairs a regex with a discard_reason label
# (a match is treated as a non-entity, per the file header).
# "$" rules require a full-string match; "\b" rules match on prefix.
events_activities:
description: "Events and activities content"
patterns:
- pattern: '^lezingen\s+en\s+\w+$'
discard_reason: "Lectures section"
- pattern: '^thema\s+avonden$'
discard_reason: "Theme evenings"
- pattern: '^komende\s+activiteiten$'
discard_reason: "Upcoming activities"
- pattern: '^cursus\s+\w+$'
discard_reason: "Course section"
- pattern: '^wandel\s+en\b'
discard_reason: "Walking section"
- pattern: '^wandelapp\s+\w+$'
discard_reason: "Walking app"
- pattern: '^wandelen\s+en\s+fietsen$'
discard_reason: "Walking and cycling"
- pattern: '^wandelkaart\s+\w+$'
discard_reason: "Walking map"
- pattern: '^struinpad\s+wandelingen$'
discard_reason: "Trail walks"
- pattern: '^trouwen\s+in\b'
discard_reason: "Weddings section"
- pattern: '^zakelijke\s+bijeenkomsten$'
discard_reason: "Business meetings"
- pattern: '^jubileum\s+fietsroute$'
discard_reason: "Anniversary bike route"
# online_services: discard rules for online-service links and references to
# third-party web platforms. Each rule pairs a regex with a discard_reason
# label (a match is treated as a non-entity, per the file header).
online_services:
description: "Online services and digital content"
patterns:
# "online" followed by exactly one of the listed service words.
- pattern: '^online\s+(afspraak|betalen|doneren|exposities|platform|reserveren|vraag)$'
discard_reason: "Online service"
- pattern: '^website\s+(beheer|gemeenteraad)$'
discard_reason: "Website section"
- pattern: '^webdesign\s+bureau\b'
discard_reason: "Web design"
- pattern: '^google\s+maps$'
discard_reason: "Google Maps reference"
- pattern: '^gebruik\s+google\s+maps$'
discard_reason: "Use Google Maps"
- pattern: '^flickr\s+fotoalbum$'
discard_reason: "Flickr photo album"
# about_sections: discard rules for "about us" style headings ("over ...",
# "ons ...", "wie wij zijn", mission/vision). Each rule pairs a regex with a
# discard_reason label (a match is treated as a non-entity, per the file
# header).
about_sections:
description: "About and overview sections"
patterns:
# Closed list: "over" plus one of the enumerated names, nothing after it.
- pattern: '^over\s+(batavialand|bergh|haaksbergen|heemskerk|lkca|laren|museumpark|numaga|nuwelant|rijnland|roosendaal|rozet|ruurd|onze\s+website)$'
discard_reason: "About section"
# Open-ended: no end anchor, so any string starting with
# "over de ", "over het " or "over dekema " matches.
- pattern: '^over\s+(de|het|dekema)\s+'
discard_reason: "About section"
- pattern: '^ons\s+(adres|bestuur|huisblad|kantoor|team|werkgebied)$'
discard_reason: "Our [X] section"
# Matches only the two-word fragments ("wat doen", "wat wij", ...).
- pattern: '^wat\s+(doen|doet|we|wij)$'
discard_reason: "What we do section"
- pattern: '^wie\s+(we|wij)\s+zijn$'
discard_reason: "Who we are section"
- pattern: '^visie\s+en\s+missie$'
discard_reason: "Vision and mission"
- pattern: '^missie\s+en\s+doelen$'
discard_reason: "Mission and goals"
- pattern: '^ontstaan\s+\w+$'
discard_reason: "Origin section"
# overview_sections: discard rules for overview/list headings ("overzicht",
# "overige", "meer", "meest", "recente", "laatste"). Each rule pairs a regex
# with a discard_reason label (a match is treated as a non-entity, per the
# file header).
overview_sections:
description: "Overview and list sections"
patterns:
- pattern: '^overzicht\s+(rijksmonumenten|skriemers|archeologische|bouwlocaties|exposities|formulieren|tijdschriften)$'
discard_reason: "Overview section"
# NOTE(review): the "locaties" alternative overlaps with the
# '^overige\s+locaties$' rule in location_directions - confirm whether
# both are needed.
- pattern: '^overige\s+(uitgaven|documenten|locaties|organisaties|vrijwilligers)$'
discard_reason: "Other [X] section"
- pattern: '^meer\s+(fers|meldingen|natuurmusea|over|telefoonnummers|weten)$'
discard_reason: "More [X] section"
# Prefix rule: anything starting with "meest bekeken" / "meest recente".
- pattern: '^meest\s+(bekeken|recente)\b'
discard_reason: "Most [X] section"
- pattern: '^recente\s+berichten$'
discard_reason: "Recent posts"
# Accepts both "laatst" and "laatste".
- pattern: '^laatst(e)?\s+(verschenen|update|nieuws)$'
discard_reason: "Latest [X] section"
# links_references: discard rules for link lists and external-reference
# headings. Each rule pairs a regex with a discard_reason label (a match is
# treated as a non-entity, per the file header).
links_references:
description: "Links and reference sections"
patterns:
# Prefix rule: anything starting with "link naar".
- pattern: '^link\s+naar\b'
discard_reason: "Link to"
- pattern: '^links\s+\w+$'
discard_reason: "Links section"
- pattern: '^interessante\s+links$'
discard_reason: "Interesting links"
- pattern: '^partner\s+links$'
discard_reason: "Partner links"
- pattern: '^nuttige\s+websites$'
discard_reason: "Useful websites"
# "wikipedia" followed by exactly one word.
- pattern: '^wikipedia\s+\w+$'
discard_reason: "Wikipedia reference"
# reports_complaints: discard rules for complaint and incident-reporting
# headings and call-to-action labels. Each rule pairs a regex with a
# discard_reason label (a match is treated as a non-entity, per the file
# header).
reports_complaints:
description: "Reports and complaints sections"
patterns:
- pattern: '^klacht\s+\w+$'
discard_reason: "Complaint section"
- pattern: '^klachten\s+\w+$'
discard_reason: "Complaints section"
- pattern: '^meldingen\s+(en|zonder|over)\b'
discard_reason: "Reports section"
- pattern: '^meld\s+(direct|het|overlast)$'
discard_reason: "Report CTA"
- pattern: '^incident\s+melden$'
discard_reason: "Report incident"
- pattern: '^storing\s+melden$'
discard_reason: "Report malfunction"
# "schade <one word> melden": exactly one word in the middle.
- pattern: '^schade\s+\w+\s+melden$'
discard_reason: "Report damage"
- pattern: '^spoedeisende\s+meldingen$'
discard_reason: "Emergency reports"
# governance_policy: discard rules for governance, privacy, terms and
# regulatory headings (Dutch plus a few English phrases). Each rule pairs a
# regex with a discard_reason label (a match is treated as a non-entity, per
# the file header).
governance_policy:
description: "Governance and policy content"
patterns:
- pattern: '^committee\s+van\b'
discard_reason: "Committee section"
- pattern: '^governance\s+code\b'
discard_reason: "Governance code"
# Prefix of the English phrase "coordinated vulnerability ...".
- pattern: '^coordinated\s+vulnerability\b'
discard_reason: "Security policy"
- pattern: '^privacyverklaring\s+\w+$'
discard_reason: "Privacy statement"
- pattern: '^wijziging\s+privacyverklaring$'
discard_reason: "Privacy statement change"
# Matches only the truncated strings "voorwaarden en" / "voorwaarden zonder".
- pattern: '^voorwaarden\s+(en|zonder)$'
discard_reason: "Terms section"
- pattern: '^richtlijnen\s+en\b'
discard_reason: "Guidelines section"
- pattern: '^wet\s+open\s+overheid$'
discard_reason: "Open government law"
- pattern: '^toetsing\s+\w+$'
discard_reason: "Assessment section"
- pattern: '^toezicht\s+en\b'
discard_reason: "Supervision section"
- pattern: '^verbonden\s+partijen$'
discard_reason: "Related parties"
# services_facilities: discard rules for service-point / information-centre
# style headings. Each rule pairs a regex with a discard_reason label (a
# match is treated as a non-entity, per the file header).
# Most rules here are "<facility word> <exactly one word>".
# NOTE(review): strings such as "informatiecentrum X" or "kenniscentrum X"
# could also be names of real organisations; confirm that discarding every
# two-word match is intended.
services_facilities:
description: "Services and facilities content"
patterns:
- pattern: '^service\s+contact$'
discard_reason: "Service contact"
- pattern: '^servicepunt\s+\w+$'
discard_reason: "Service point"
- pattern: '^infopunt\s+\w+$'
discard_reason: "Info point"
- pattern: '^informatiepunt\s+\w+$'
discard_reason: "Information point"
- pattern: '^informatiecentrum\s+\w+$'
discard_reason: "Information center"
- pattern: '^kenniscentrum\s+\w+$'
discard_reason: "Knowledge center"
- pattern: '^uitleenpunt\s+\w+$'
discard_reason: "Lending point"
- pattern: '^portaal\s+\w+$'
discard_reason: "Portal section"
# tickets_shop: discard rules for ticketing, pricing and shopping-cart
# labels. Each rule pairs a regex with a discard_reason label (a match is
# treated as a non-entity, per the file header).
tickets_shop:
description: "Tickets and shopping content"
patterns:
- pattern: '^tickets\s+(contact|en|kopen)$'
discard_reason: "Tickets section"
- pattern: '^tarieven\s+en\b'
discard_reason: "Prices section"
# Dutch and English cart labels share the same discard_reason.
- pattern: '^winkelwagen\s+\w+$'
discard_reason: "Shopping cart"
- pattern: '^shopping\s+cart$'
discard_reason: "Shopping cart"
# sponsors_support: discard rules for sponsoring, subsidy and supporter
# headings. Each rule pairs a regex with a discard_reason label (a match is
# treated as a non-entity, per the file header).
sponsors_support:
description: "Sponsors and support content"
patterns:
- pattern: '^sponsors\s+en\b'
discard_reason: "Sponsors section"
- pattern: '^sponsoring\s+(en|aanvragen)$'
discard_reason: "Sponsoring section"
- pattern: '^steun\s+(orientalis|structureel)$'
discard_reason: "Support section"
- pattern: '^structurele\s+ondersteuning$'
discard_reason: "Structural support"
- pattern: '^subsidies\s+en\b'
discard_reason: "Subsidies section"
# Single-word prefix rule: the word alone or followed by anything.
- pattern: '^subsidieverstrekkers\b'
discard_reason: "Subsidy providers"
- pattern: '^supporter\s+\w+$'
discard_reason: "Supporter section"
# education_youth: discard rules for education, youth and audience-targeted
# ("voor ...") headings. Each rule pairs a regex with a discard_reason label
# (a match is treated as a non-entity, per the file header).
education_youth:
description: "Education and youth content"
patterns:
- pattern: '^jeugd\s+en\s+onderwijs$'
discard_reason: "Youth and education"
- pattern: '^onderwijs\s+en\s+jeugd$'
discard_reason: "Education and youth"
- pattern: '^scholen\s+\w+$'
discard_reason: "Schools section"
# Closed list of "voor <audience>" headings; note it also contains
# "voor wo ii", which is labelled with the same audience reason.
- pattern: '^voor\s+(wo\s+ii|bezoekers|de\s+(jeugd|media|pers)|het\s+onderwijs|onderwijsinstellingen|professionals)$'
discard_reason: "For [audience] section"
- pattern: '^voorschoolse\s+\w+$'
discard_reason: "Preschool section"
- pattern: '^kids\s+academy$'
discard_reason: "Kids academy"
- pattern: '^kinderen\s+bij\b'
discard_reason: "Children section"
- pattern: '^middelbaar\s+beroepsonderwijs$'
discard_reason: "Vocational education"
- pattern: '^stages\s+en\s+afstuderen$'
discard_reason: "Internships section"
# volunteers_staff: discard rules for volunteer, vacancy and recruitment
# headings. Each rule pairs a regex with a discard_reason label (a match is
# treated as a non-entity, per the file header).
volunteers_staff:
description: "Volunteers and staff content"
patterns:
- pattern: '^vrijwilligersuitje\s+\w+$'
discard_reason: "Volunteer outing"
- pattern: '^welkom\s+nieuwe\s+vrijwilliger$'
discard_reason: "Welcome new volunteer"
# "vacature" followed by exactly one word.
- pattern: '^vacature\s+\w+$'
discard_reason: "Vacancy section"
# Prefix rule: anything starting with "werken bij".
- pattern: '^werken\s+bij\b'
discard_reason: "Work at section"
- pattern: '^medewerkers\s+zoeken$'
discard_reason: "Search employees"
# technical_website: discard rules for CMS/widget boilerplate strings
# (mostly English, a few Dutch). Each rule pairs a regex with a
# discard_reason label (a match is treated as a non-entity, per the file
# header). All rules except "search submit" require a full-string match.
technical_website:
description: "Technical website elements"
patterns:
- pattern: '^no\s+events$'
discard_reason: "No events message"
- pattern: '^recent\s+(comments|posts)$'
discard_reason: "Recent content"
- pattern: '^search\s+submit\b'
discard_reason: "Search submit"
- pattern: '^share\s+this$'
discard_reason: "Share this"
# Literal doubled word, as in "statistics statistics".
- pattern: '^statistics\s+statistics$'
discard_reason: "Statistics"
- pattern: '^strictly\s+necessary$'
discard_reason: "Cookie notice"
- pattern: '^system\s+management$'
discard_reason: "System management"
- pattern: '^my\s+account$'
discard_reason: "My account"
- pattern: '^other\s+languages$'
discard_reason: "Language selector"
- pattern: '^product\s+families$'
discard_reason: "Product categories"
- pattern: '^form\s+submissions$'
discard_reason: "Form submissions"
- pattern: '^global\s+websites$'
discard_reason: "Global websites"
- pattern: '^inloggen\s+leden$'
discard_reason: "Member login"
- pattern: '^inhoud\s+website$'
discard_reason: "Website contents"
# Literal doubled word, as in "inhoudsopgave inhoudsopgave".
- pattern: '^inhoudsopgave\s+inhoudsopgave$'
discard_reason: "Table of contents"
# foreign_language: discard rules for non-Dutch navigation phrases (German,
# Polish, French, English). Each rule pairs a regex with a discard_reason
# label (a match is treated as a non-entity, per the file header).
# NOTE(review): "preparez" is written without an accent; presumably input
# is ASCII-folded before matching - confirm, otherwise "préparez" will not
# match.
foreign_language:
description: "Foreign language navigation"
patterns:
- pattern: '^mit\s+dem\s+(auto|fahrrad)$'
discard_reason: "German directions"
- pattern: '^kontakt\s+und\b'
discard_reason: "German contact"
- pattern: '^polskie\s+informacje$'
discard_reason: "Polish information"
- pattern: '^preparez\s+votre\b'
discard_reason: "French prepare"
- pattern: '^folgen\s+sie\b'
discard_reason: "German follow"
- pattern: '^sich\s+einschreiben$'
discard_reason: "German register"
- pattern: '^international\s+visitors$'
discard_reason: "International visitors"
# miscellaneous_content: catch-all discard rules for content phrases that do
# not fit the other categories. Each rule pairs a regex with a discard_reason
# label (a match is treated as a non-entity, per the file header).
# Anchoring used below:
#   ...$        the whole string must match
#   ...\b       any string that starts with the phrase matches
#   ...\s+\w+$  the phrase plus exactly one more word matches
# NOTE(review): patterns are lowercase and ASCII-only (e.g. "financiele",
# "katolieke"); presumably input is lowercased/ASCII-folded before matching
# and the spellings mirror the scraped text - confirm in the consumer.
miscellaneous_content:
description: "Miscellaneous content phrases"
patterns:
# "<listed first word> en <any one word>".
- pattern: '^(feesten|geboorte|groen|foto|inkoop|eten|zien|beeld|groepen|genealogie|wonen)\s+en\s+\w+$'
discard_reason: "X and Y content phrase"
# Mirror image of the rule above: "<any one word> en <listed last word>".
- pattern: '^\w+\s+en\s+(gebruiken|overlijden|onderhoud|film|aanbesteden|drinken|doen|geluid|bidprentjes|rondleidingen|leefomgeving|ontginning|links)$'
discard_reason: "X and Y content phrase"
- pattern: '^het\s+(geheugen|geheim|ontstaan|kantoor|heemhuis|lichtruim|natuurhistorisch|nevelhorstmeer|olieslaan|schip|stift|veenkloosterbos)\b'
discard_reason: "Het X section"
# Open-ended: "de <listed word> " followed by anything.
- pattern: '^de\s+(dorpsdokter|drie|egeling|japanse|klinker|oude)\s+'
discard_reason: "De X section"
# Closed list: "de <listed word>" with nothing after it.
- pattern: '^de\s+(atlantikwall|basis|bilt|bongard|buffer|haarslag|klok|kring|lindenhoeve|mansjes|mariahoeve|nestbouwers|noodwoning|omgevingswet|quiz|skriemer|vlotter|wazerweijen|werf|zoolstede)$'
discard_reason: "De X place/thing"
- pattern: '^genieten\s+van\b'
discard_reason: "Enjoy section"
- pattern: '^hulp\s+bij\b'
discard_reason: "Help with"
- pattern: '^opening\s+museum$'
discard_reason: "Museum opening"
- pattern: '^renovatie\s+\w+$'
discard_reason: "Renovation section"
- pattern: '^verhuizing\s+naar\b'
discard_reason: "Move to"
- pattern: '^home\s+contact\b'
discard_reason: "Home contact navigation"
- pattern: '^in\s+(buurthuis|de\s+(laar|stad))\b'
discard_reason: "In location"
- pattern: '^(vier|zes)\s+(typen|kernen)\b'
discard_reason: "Number types"
- pattern: '^werken\s+met\s+\w+$'
discard_reason: "Working with"
- pattern: '^werkgebied\s+\w+$'
discard_reason: "Work area"
- pattern: '^werk\s+in\s+uitvoering$'
discard_reason: "Work in progress"
# Only two-word "serie X" strings; longer series titles are not matched
# by this rule.
- pattern: '^serie\s+\w+$'
discard_reason: "Series section"
- pattern: '^dromen\s+denken\s+doen$'
discard_reason: "Slogan"
- pattern: '^founding\s+fathers$'
discard_reason: "Founding fathers section"
- pattern: '^green\s+team$'
discard_reason: "Green team"
- pattern: '^kijk\s+en\s+beleef$'
discard_reason: "Look and experience"
- pattern: '^toen\s+en\s+nu$'
discard_reason: "Then and now"
- pattern: '^trein\s+en\s+spoor$'
discard_reason: "Train and track"
- pattern: '^uit\s+in\b'
discard_reason: "Out in"
- pattern: '^vrij\s+zoeken$'
discard_reason: "Free search"
- pattern: '^object\s+van\s+de\s+maand$'
discard_reason: "Object of the month"
- pattern: '^objecten\s+\w+$'
discard_reason: "Objects section"
- pattern: '^post\s+en\s+e-mail$'
discard_reason: "Post and email"
- pattern: '^resultaat\s+\w+$'
discard_reason: "Result section"
- pattern: '^reviews\s+op\b'
discard_reason: "Reviews on"
- pattern: '^suggesties\s+en\b'
discard_reason: "Suggestions section"
- pattern: '^vergunningen\s+en\b'
discard_reason: "Permits section"
- pattern: '^verhalend\s+ontwerpen$'
discard_reason: "Narrative design"
- pattern: '^verhuizen\s+en\b'
discard_reason: "Moving section"
- pattern: '^verslagen\s+van\b'
discard_reason: "Reports of"
- pattern: '^verzonden\s+nieuwsbrieven$'
discard_reason: "Sent newsletters"
- pattern: '^windmolens\s+en\b'
discard_reason: "Windmills section"
- pattern: '^winter\s+in\b'
discard_reason: "Winter in"
- pattern: '^korte\s+lijnen$'
discard_reason: "Short lines"
- pattern: '^huidige\s+aanbod$'
discard_reason: "Current offer"
- pattern: '^iets\s+vragen$'
discard_reason: "Ask something"
- pattern: '^in\s+engeland$'
discard_reason: "In England"
- pattern: '^inkomende\s+telefoongesprekken$'
discard_reason: "Incoming calls"
- pattern: '^inleiding\s+\w+$'
discard_reason: "Introduction"
- pattern: '^inloop\s+\w+$'
discard_reason: "Walk-in section"
- pattern: '^internationale\s+samenwerking$'
discard_reason: "International cooperation"
- pattern: '^informatiebrochures\s+molens$'
discard_reason: "Mill brochures"
- pattern: '^info\s+borden$'
discard_reason: "Info boards"
- pattern: '^index\s+\w+$'
discard_reason: "Index section"
# Full-string match: only the dangling fragment "samenwerking met".
- pattern: '^samenwerking\s+met$'
discard_reason: "Cooperation with"
- pattern: '^schilderijen\s+kunstschilders$'
discard_reason: "Paintings section"
- pattern: '^planten\s+en\s+dieren$'
discard_reason: "Plants and animals"
- pattern: '^positieve\s+gezondheid$'
discard_reason: "Positive health"
- pattern: '^unieke\s+combinatie$'
discard_reason: "Unique combination"
- pattern: '^uittreksels\s+en\b'
discard_reason: "Extracts section"
- pattern: '^uitwisselen\s+van\b'
discard_reason: "Exchange of"
- pattern: '^voortgang\s+procedure$'
discard_reason: "Progress procedure"
- pattern: '^vorige\s+volgende\b'
discard_reason: "Previous next navigation"
- pattern: '^welkom\s+terug$'
discard_reason: "Welcome back"
- pattern: '^wereld\s+van\b'
discard_reason: "World of"
- pattern: '^wapen\s+van\b'
discard_reason: "Coat of arms"
- pattern: '^verdwenen\s+\w+$'
discard_reason: "Disappeared section"
- pattern: '^vernieuwing\s+museum\b'
discard_reason: "Museum renewal"
- pattern: '^vroegere\s+kringactiviteiten$'
discard_reason: "Former activities"
- pattern: '^vrouwelijke\s+engelandvaarders$'
discard_reason: "Female England travelers"
- pattern: '^wegkruisenwandelboekje\s+\w+$'
discard_reason: "Cross walk booklet"
- pattern: '^wegwerkzaamheden\s+en\b'
discard_reason: "Roadworks section"
- pattern: '^opgewekte\s+geschiedenissen$'
discard_reason: "Cheerful histories"
- pattern: '^omschrijving\s+van\b'
discard_reason: "Description of"
- pattern: '^ommetje\s+\w+$'
discard_reason: "Short walk"
# Accepts "ondernemen" and "ondernemend", followed by "in" or "nijeveen".
- pattern: '^ondernemen(d)?\s+(in|nijeveen)$'
discard_reason: "Entrepreneurship section"
- pattern: '^ondernemers\s+kunnen\s+contact$'
discard_reason: "Entrepreneurs contact"
- pattern: '^onderwerpen\s+onderwerpen$'
discard_reason: "Subjects section"
- pattern: '^onderzoeksresultaten\s+\w+$'
discard_reason: "Research results"
- pattern: '^oorsprong\s+\w+$'
discard_reason: "Origin section"
# NOTE(review): broad rule - every three-word string "op de X" / "op het X"
# is discarded; confirm this cannot hit names of real venues.
- pattern: '^op\s+(de|het)\s+\w+$'
discard_reason: "On the X"
- pattern: '^openbare\s+inschrijving$'
discard_reason: "Public registration"
- pattern: '^openstelling\s+en\b'
discard_reason: "Opening section"
- pattern: '^pagina\s+voor\b'
discard_reason: "Page for"
- pattern: '^pakje\s+kunst$'
discard_reason: "Package of art"
- pattern: '^panorama\s+van\b'
discard_reason: "Panorama of"
- pattern: '^partner\s+webshop$'
discard_reason: "Partner webshop"
- pattern: '^pers\s+toolkit$'
discard_reason: "Press toolkit"
- pattern: '^persoonlijk\s+contact$'
discard_reason: "Personal contact"
- pattern: '^plaatselijk\s+belang\b'
discard_reason: "Local interest"
- pattern: '^poortinstructie\s+voor\b'
discard_reason: "Gate instruction"
- pattern: '^praat\s+mar\s+frysk$'
discard_reason: "Speak Frisian"
- pattern: '^recht\s+van\s+opstal$'
discard_reason: "Right of superficies"
- pattern: '^rabo\s+clubsupport$'
discard_reason: "Rabo club support"
- pattern: '^toegankelijk\s+voor\b'
discard_reason: "Accessible for"
- pattern: '^toelichting\s+beeldbank$'
discard_reason: "Image bank explanation"
- pattern: '^tijd\s+geconstateerd$'
discard_reason: "Time detected"
- pattern: '^varen\s+in\b'
discard_reason: "Sailing in"
- pattern: '^veel\s+gestelde\s+vragen$'
discard_reason: "FAQ"
- pattern: '^veilig\s+mailen$'
discard_reason: "Safe email"
- pattern: '^vakantie\s+in\b'
discard_reason: "Holiday in"
- pattern: '^van\s+(nieuwegeinse\s+bodem|noord|wirskaante|de\s+(bestuurstafel|voorzitter))$'
discard_reason: "From X section"
- pattern: '^fiscaal\s+nummer$'
discard_reason: "Tax number"
- pattern: '^financiele\s+verantwoording$'
discard_reason: "Financial accountability"
- pattern: '^nieuwe\s+(aanwinsten|zaak\s+starten)$'
discard_reason: "New acquisitions/start"
- pattern: '^nieuw\s+wachtwoord\s+aanvragen$'
discard_reason: "Request new password"
- pattern: '^minder\s+valide$'
discard_reason: "Disabled access"
- pattern: '^methode\s+van\b'
discard_reason: "Method of"
- pattern: '^kaart\s+kernen$'
discard_reason: "Map cores"
- pattern: '^molens\s+(in|loil)$'
discard_reason: "Mills section"
- pattern: '^of\s+zocht\s+u$'
discard_reason: "Or did you search"
- pattern: '^oude\s+(ansichtkaarten|films|kerkhof)$'
discard_reason: "Old X section"
- pattern: '^straten\s+in\b'
discard_reason: "Streets in"
- pattern: '^studie\s+hoek$'
discard_reason: "Study corner"
- pattern: '^kunstenaars\s+in\b'
discard_reason: "Artists in"
- pattern: '^na\s+wo\s+ii$'
discard_reason: "After WW2"
- pattern: '^rijwielvordering\s+wo\s+ii$'
discard_reason: "WW2 bicycle requisition"
- pattern: '^rijnlands\s+vastgoed$'
discard_reason: "Rijnland real estate"
- pattern: '^rijnlandse\s+mascottes$'
discard_reason: "Rijnland mascots"
- pattern: '^rietwijk\s+of\s+reewijk$'
discard_reason: "Rietwijk or Reewijk"
- pattern: '^roggel\s+(leef|en\s+omgeving)$'
discard_reason: "Roggel section"
- pattern: '^roggelse\s+verenigingen$'
discard_reason: "Roggel associations"
- pattern: '^rozet\s+voor\s+jou$'
discard_reason: "Rozet for you"
- pattern: '^veluws\s+schoon$'
discard_reason: "Clean Veluwe"
- pattern: '^katolieke\s+emancipatie$'
discard_reason: "Catholic emancipation"
- pattern: '^keur\s+van\s+grafstenen$'
discard_reason: "Selection of gravestones"
# specific_false_positives: discard rules for individual strings that were
# identified as false positives (titles, product names, place names, etc.).
# Each rule pairs a regex with a discard_reason label (a match is treated as
# a non-entity, per the file header).
# Most rules are full-string ("^...$") matches of one literal phrase.
# NOTE(review): several reasons describe real-world entities ("Museum name",
# "School name", "Organization name", "Training company"); confirm these
# should be discarded outright rather than typed as entities.
specific_false_positives:
description: "Specific strings identified as false positives"
patterns:
- pattern: '^foto\s+(actief|herkenning|inzenden|album)$'
discard_reason: "Photo section"
- pattern: '^foto\s+kenneth\s+stamp$'
discard_reason: "Photo credit"
- pattern: '^comics\s+plus$'
discard_reason: "Product name"
- pattern: '^canon\s+production\b'
discard_reason: "Canon production"
- pattern: '^cultuurimpuls\b'
discard_reason: "Culture impulse"
# "edmond <one word> penning": exactly one word in the middle.
- pattern: '^edmond\s+\w+\s+penning$'
discard_reason: "Medal name"
- pattern: '^eigen\s+uitgaven$'
discard_reason: "Own publications"
- pattern: '^een\s+australische\b'
discard_reason: "An Australian"
- pattern: '^brabants\s+heem$'
discard_reason: "Brabant heritage"
- pattern: '^buurt\s+battle$'
discard_reason: "Neighborhood battle"
- pattern: '^middengebied\b'
discard_reason: "Middle area"
- pattern: '^zwolse\s+parken$'
discard_reason: "Zwolle parks"
- pattern: '^zandeind\s+in\b'
discard_reason: "Zandeind location"
- pattern: '^zelf\s+bewaren$'
discard_reason: "Self storage"
- pattern: '^zeldzame\s+voorwerpen$'
discard_reason: "Rare objects"
- pattern: '^woldzigt\s+agenda$'
discard_reason: "Woldzigt agenda"
- pattern: '^acht\s+van\s+chaam$'
discard_reason: "Eight of Chaam"
- pattern: '^bij\s+de\s+barones$'
discard_reason: "At the baroness"
- pattern: '^boek\s+elle\s+klop$'
discard_reason: "Book title"
- pattern: '^bemmel\s+ressen\b'
discard_reason: "Place names"
- pattern: '^aold\s+hoksebarge$'
discard_reason: "Dialect place name"
- pattern: '^bientien\s+over\b'
discard_reason: "Room name"
- pattern: '^tonnie\s+en\s+kee\b'
discard_reason: "Show characters"
- pattern: '^den\s+brouwer$'
discard_reason: "Place/building name"
# Only rule in this category without a leading "^": it has no start
# anchor, so "op gastenboek" is not tied to the beginning of the string.
- pattern: '\bop\s+gastenboek\b'
discard_reason: "Guestbook navigation"
- pattern: '^de\s+laar$'
discard_reason: "Place name"
- pattern: '^serie\s+droge\s+voeten$'
discard_reason: "Publication series"
- pattern: '^familie\s+(bindels|janssen)$'
discard_reason: "Family section header"
- pattern: '^hof\s+loil$'
discard_reason: "Place name"
- pattern: '^hoolten\s+klinte$'
discard_reason: "Dialect place name"
- pattern: '^verhildersum\s+to\s+go$'
discard_reason: "Product name"
- pattern: '^voerman\s+verwondert$'
discard_reason: "Exhibition title"
- pattern: '^heemskerker\s+ezels$'
discard_reason: "Organization name"
- pattern: '^jolly\s+duck$'
discard_reason: "Venue name"
- pattern: '^maria\s+kleuterschool$'
discard_reason: "School name"
- pattern: '^waterlandsmuseum\s+de\s+speeltoren$'
discard_reason: "Museum name"
- pattern: '^pelt\s+als\s+architect$'
discard_reason: "Article title"
- pattern: '^mierlo\s+puzzel$'
discard_reason: "Puzzle name"
- pattern: '^ozosnel\s+fandagen$'
discard_reason: "Event name"
- pattern: '^spijkerserve\s+\w+$'
discard_reason: "Place name"
- pattern: '^stalpers\s+opleidingen\b'
discard_reason: "Training company"
- pattern: '^taalbrigade\s+kids$'
discard_reason: "Program name"
- pattern: '^numaga\s+(excursies|jaarboek)$'
discard_reason: "Numaga section"
- pattern: '^meierijse\s+schoutsrekeningen$'
discard_reason: "Historical records"
- pattern: '^nieuwegein\s+lokaal$'
discard_reason: "Local Nieuwegein"
- pattern: '^nieuwjaarke\s+zingen$'
discard_reason: "New Year singing"
- pattern: '^nieuwveense\s+landen$'
discard_reason: "Place name"
- pattern: '^oijen\s+en\s+teeffelen$'
discard_reason: "Place names"
- pattern: '^molukse\s+(graven|muziek)$'
discard_reason: "Moluccan section"
- pattern: '^kruisen\s+en\s+\w+$'
discard_reason: "Crosses section"
# =============================================================================
# RELATIONSHIP PREDICATES REFERENCE
# From: data/entity_annotation/modules/advanced/relationship_annotations.yaml
# =============================================================================
relationship_predicates:
# organizational: predicates from the org: namespace that link persons and
# groups to organizations, sites and roles.
# Each entry gives the predicate id, a description, and the entity-type
# codes listed under domain and range.
# NOTE(review): domain/range presumably follow RDF usage (allowed subject
# types / allowed object types), and a trailing ".*" presumably means "any
# subtype of this hypernym" - confirm against the consumer.
organizational:
- id: org:memberOf
description: "Entity is member of organization"
domain: [AGT.PER, GRP.*]
range: [GRP.*]
- id: org:subOrganizationOf
description: "Organization is part of larger organization"
domain: [GRP.*]
range: [GRP.*]
# Narrower than subOrganizationOf: domain is restricted to GRP.UNT.
- id: org:unitOf
description: "Organizational unit is part of organization"
domain: [GRP.UNT]
range: [GRP.*]
- id: org:hasSite
description: "Organization has location/building"
domain: [GRP.*]
range: [TOP.BLD, TOP.SET]
- id: org:linkedTo
description: "Organization is linked to another"
domain: [GRP.*]
range: [GRP.*]
- id: org:role
description: "Organization has role/position"
domain: [GRP.*]
range: [ROL.*]
# spatial: predicates that place groups, buildings and features at TOP.*
# (place) entity types.
# Each entry gives the predicate id, a description, and the entity-type
# codes listed under domain and range.
# NOTE(review): domain/range presumably follow RDF usage (allowed subject
# types / allowed object types) - confirm against the consumer.
spatial:
- id: schema:location
description: "Entity is located at place"
domain: [GRP.*, TOP.BLD]
range: [TOP.SET, TOP.REG, TOP.ADR]
- id: schema:areaServed
description: "Organization serves geographic area"
domain: [GRP.*]
range: [TOP.SET, TOP.REG]
# Unlike schema:location, the range here is limited to TOP.SET.
- id: crm:P53_has_former_or_current_location
description: "Heritage site location (temporal)"
domain: [TOP.BLD, TOP.FEA]
range: [TOP.SET]
# creative: predicates that connect works (WRK.*) with the groups and
# persons that publish, reference or are mentioned by them.
# Each entry gives the predicate id, a description, and the entity-type
# codes listed under domain and range.
# NOTE(review): domain/range presumably follow RDF usage (allowed subject
# types / allowed object types) - confirm against the consumer.
creative:
# Domain is the work; range is the publishing group.
- id: dcterms:publisher
description: "Work published by organization"
domain: [WRK.*]
range: [GRP.*]
- id: dcterms:references
description: "Entity references work"
domain: [GRP.*, AGT.PER]
range: [WRK.*]
- id: schema:mentions
description: "Entity mentions person/thing"
domain: [GRP.*, WRK.*]
range: [AGT.PER, GRP.*]
identity:
- id: owl:sameAs
description: "Entities are the same"
domain: ["*"]
range: ["*"]