# Dutch Web Patterns for Entity Annotation
# =============================================================================
# Converted from cleanup_contact_false_positives_v2.py
#
# Purpose: Define patterns for extracting typed entities from Dutch heritage
# website content, with relationship predicates linking each entity to the
# custodian being processed.
#
# Each pattern can have:
# - entity_type: CH-Annotator hypernym code (GRP.*, TOP.*, WRK.*, ROL.*, AGT.*, null)
# - capture_groups: Numbered regex groups that capture sub-entities
# - relationships: Predicates connecting the extracted entity to the custodian or other entities
# - discard_reason: For patterns that identify non-entities (UI elements, etc.)
#
# Version: 1.1.0
# Date: 2025-12-13
# Source: scripts/cleanup_contact_false_positives_v2.py lines 28-1166
# =============================================================================
# Identification and provenance of this pattern set.
metadata:
  id: dutch_web_patterns_v1
  name: Dutch Web Content Entity Patterns
  # Quoted so the version stays a string rather than being parsed as a number.
  version: "1.1.0"
  language: nl
  description: >-
    Patterns for extracting and classifying entities from Dutch heritage institution
    websites. Patterns are derived from false positive cleanup analysis of 168 custodian
    web archives. Version 1.1.0 adds layout_hints based on analysis of 1,525 annotated
    web archives showing XPath → entity type correlations.
  source_script: scripts/cleanup_contact_false_positives_v2.py
  ch_annotator_version: "1.7.0"
  pattern_count: 646
# Layout hints configuration based on analysis of 1,525 web archives.
# These define which XPath locations are most predictive for each entity type.
layout_hints:
  description: >-
    XPath location hints derived from analyzing 15,252 entity claims across 1,343
    unique websites. Patterns found at expected locations receive confidence boost.

  # High-confidence XPath → entity type mappings (>80% correlation).
  # Each entry lists the XPaths where the type is expected and the boost applied.
  high_confidence_locations:
    GRP.HER:
      description: "Heritage institutions (museums, archives, libraries)"
      primary_xpaths:
        - "head/title"  # 41.8% of GRP.HER found here
        - "body/*/h1"  # Primary heading
        - "head/meta[@name='description']/@content"
      confidence_boost: 0.2

    GRP.ASS:
      description: "Associations and societies"
      primary_xpaths:
        - "head/title"  # 39.3% of GRP.ASS found here
        - "body/*/header/h1"
      confidence_boost: 0.15

    GRP.GOV:
      description: "Government bodies"
      primary_xpaths:
        - "head/title"
        - "body/*/h1"
        - "body/*/header"
      confidence_boost: 0.15

    TOP.ADR:
      description: "Addresses"
      primary_xpaths:
        - "body/footer/*"  # 23.8% of addresses in footer
        - "body/*/footer/*/p"
        - "body/*/p"  # Paragraphs
      confidence_boost: 0.2

    TMP.OPH:
      description: "Opening hours"
      primary_xpaths:
        - "body/*/footer"
        - "body/*/table"  # Often in tables
      confidence_boost: 0.15

    AGT.PER:
      description: "Person names"
      primary_xpaths:
        - "body/*/p"  # 36.4% in paragraphs
        - "body/*/ul/li"  # Staff lists
      confidence_boost: 0.1

  # Locations to deprioritize (often noise)
  low_confidence_locations:
    - "body/*/nav"  # Navigation (menu items, not entities)
    - "body/*/script"  # JavaScript
    - "body/*/style"  # CSS

  # Discard locations (always ignore content from these)
  discard_locations:
    - "head/script"  # JS in head
    - "body/*/noscript"  # Noscript fallbacks
# =============================================================================
|
|
# ENTITY PATTERNS - Patterns that identify extractable entities with types
|
|
# =============================================================================
|
|
|
|
entity_patterns:
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ORGANIZATION PATTERNS (GRP.*)
|
|
# Heritage organizations, associations, societies, government bodies
|
|
# ---------------------------------------------------------------------------
|
|
|
|
organizations:
|
|
description: "Patterns identifying heritage organizations and related bodies"
|
|
|
|
# Historical and heritage associations, mostly "<organization type> <place>".
# In relationships, $0 is the whole match and $N is capture group N.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
heritage_associations:
  description: "Historical/heritage associations and societies"
  patterns:
    - pattern: '^historische\s+vereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER  # Heritage association
      label_template: "Historische Vereniging {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
          description: "Settlement/place name"
      relationships:
        - predicate: schema:location
          subject: $0
          object: $1
          confidence: 0.9
      examples:
        - text: "Historische Vereniging Aalten"
          entity: "Historische Vereniging Aalten"
          captures:
            1: "Aalten"

    - pattern: '^heemkundige\s+kring\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      label_template: "Heemkundige Kring {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
      relationships:
        - predicate: schema:location
          subject: $0
          object: $1
      examples:
        - text: "Heemkundige Kring Halle"
          entity: "Heemkundige Kring Halle"

    # \s* accepts both "heemkundekring" and "heemkunde kring".
    - pattern: '^heemkunde\s*kring\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^heemkunde\s+werkgroep\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^historische\s+werkgroep\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    - pattern: '^oudheidkundige?\s+(kring|vereniging)\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: null
          role: organization_type
        2:
          type: TOP.SET
          role: location_name

    - pattern: '^heemkundevereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      capture_groups:
        1:
          type: TOP.SET
          role: location_name

    # Group 2 (the optional "historische") is intentionally left untyped.
    - pattern: '^(\w+)se?\s+(historische\s+)?(vereniging|kring|werkgroep|stichting|genootschap)$'
      entity_type: GRP.ASS
      description: "Geographic modifier + organization type"
      capture_groups:
        1:
          type: TOP.SET
          role: location_adjective
        3:
          type: null
          role: organization_type
      examples:
        - text: "Nijmeegse Historische Vereniging"
        - text: "Leidse Kring"

    - pattern: '^puttens\s+historisch\s+genootschap$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.HER
      relationships:
        - predicate: schema:location
          subject: $0
          object: "Putten"
          object_type: TOP.SET
# Municipal governments, their buildings, and municipal archives.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
municipalities:
  description: "Dutch municipal governments"
  patterns:
    # "gemeente archieven" is also matched by the generic 'gemeente <name>'
    # pattern below, so the specific pattern is listed first.
    # NOTE(review): this presumes patterns are evaluated in list order — confirm.
    - pattern: '^gemeente\s+archieven$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      relationships:
        - predicate: org:subOrganizationOf
          subject: $0
          object: CUSTODIAN

    # [\w-]+ allows hyphenated municipality names such as "Borger-Odoorn".
    - pattern: '^gemeente\s+([\w-]+)$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.MUN  # Municipality
      label_template: "Gemeente {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name
      relationships:
        - predicate: org:subOrganizationOf
          subject: $0
          object: "https://www.wikidata.org/entity/Q29999"  # Kingdom of Netherlands
          object_type: GRP.GOV
          confidence: 1.0
      examples:
        - text: "Gemeente Borger-Odoorn"
          entity: "Gemeente Borger-Odoorn"
          captures:
            1: "Borger-Odoorn"

    # The town hall is a building (TOP.BLD); the relationship subject is the
    # municipality label built from capture group 1.
    - pattern: '^gemeentehuis\s+([\w-]+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GOV  # Government building
      label_template: "Gemeentehuis {1}"
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name
      relationships:
        - predicate: org:hasSite
          subject: "Gemeente {1}"
          subject_type: GRP.GOV
          object: $0
# Museums, archives and libraries.
# All examples in this block are chosen to match the pattern they sit under.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
heritage_institutions:
  description: "Museums, archives, libraries"
  patterns:
    # Group 2 needs at least one character, so a bare "De Bibliotheek" does
    # not match; the name part may be fused ("Rijksmuseum") or a separate word.
    - pattern: '^(het|de)\s+(\w+)\s*(museum|archief|bibliotheek)$'
      entity_type: GRP.HER
      capture_groups:
        2:
          type: APP.NAM
          role: institution_name
        3:
          type: null
          role: institution_type_keyword
      relationships:
        - predicate: org:linkedTo
          subject: $0
          object: CUSTODIAN
          confidence: 0.7
      examples:
        - text: "Het Rijksmuseum"
        - text: "De Openbare Bibliotheek"

    # Requires two separate words followed by a standalone "museum".
    - pattern: '^(\w+)\s+(\w+)\s+museum$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.MUS
      description: "Two-word museum names"
      capture_groups:
        1:
          type: APP.NAM
          role: name_part_1
        2:
          type: APP.NAM
          role: name_part_2
      examples:
        - text: "Pieter Vermeulen Museum"

    # Scope modifier followed by a compound ending in museum/archief.
    - pattern: '^(nationaal|nederlands|nederlandse|oudheidkundig|virtueel)\s+(\w*)(museum|archief)$'
      entity_type: GRP.HER
      capture_groups:
        1:
          type: null
          role: scope_modifier
        2:
          type: APP.NAM
          role: subject_area
        3:
          type: null
          role: institution_type_keyword
      examples:
        - text: "Nationaal Glasmuseum"
        - text: "Nederlands Openluchtmuseum"
        - text: "Nederlands Graanmuseum"

    - pattern: '^regionaal\s+archief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.REG
          role: region_name

    - pattern: '^stadsarchief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.SET
          role: city_name

    - pattern: '^gemeentearchief\s+(\w+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.ARC
      capture_groups:
        1:
          type: TOP.SET
          role: municipality_name

    # (.+) accepts multi-word names such as "Jan Anderson".
    - pattern: '^streekmuseum\s+(.+)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.MUS
      label_template: "Streekmuseum {1}"
      capture_groups:
        1:
          type: APP.NAM
          role: eponymous_name
      relationships:
        - predicate: org:linkedTo
          subject: $0
          object: CUSTODIAN
          confidence: 0.8
      examples:
        - text: "Streekmuseum Jan Anderson"

    - pattern: '^streekhistorisch\s+centrum\s+(\w+)$'
      entity_type: GRP.HER
      capture_groups:
        1:
          type: TOP.SET
          role: region_name
# Province-level heritage organizations and regional platforms.
provincial_heritage:
  description: "Provincial heritage organizations"
  patterns:
    # Group 1 is the province name, reused as the object of schema:areaServed.
    - pattern: '^erfgoed\s+(brabant|gelderland|zeeland|limburg|utrecht|friesland|drenthe|overijssel|flevoland|groningen)$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.OFF  # Official heritage organization
      capture_groups:
        1:
          type: TOP.REG
          role: province_name
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: $1
          object_type: TOP.REG

    - pattern: '^gelderse\s+kerken$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.REL  # Religious association
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: "Gelderland"
          object_type: TOP.REG

    - pattern: '^groninger\s+waddenmusea$'
      entity_type: GRP.ASS
      relationships:
        - predicate: schema:areaServed
          subject: $0
          object: "Groningen"
          object_type: TOP.REG

    - pattern: '^flevolands\s+geheugen$'
      entity_type: GRP.HER
      entity_subtype: GRP.HER.DIG  # Digital heritage platform

    - pattern: '^fryske\s+akademy$'
      entity_type: GRP.RES
      entity_subtype: GRP.RES.ACA  # Academic research
# Generic references to the custodian itself or to its internal bodies.
# Every relationship here targets CUSTODIAN.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
organizational_units:
  description: "Internal organizational units and governance bodies"
  patterns:
    - pattern: '^raad\s+van\s+toezicht$'
      entity_type: GRP.UNT
      entity_subtype: GRP.UNT.GOV  # Governance unit
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN
          confidence: 0.95
      examples:
        - text: "Raad van Toezicht"

    - pattern: '^het\s+bestuur$'
      entity_type: GRP.UNT
      entity_subtype: GRP.UNT.GOV
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN

    # "de stichting" / "de vereniging" are linked to the custodian with
    # owl:sameAs rather than as a unit of it.
    - pattern: '^de\s+stichting$'
      entity_type: GRP.ORG
      entity_subtype: GRP.ORG.FND  # Foundation
      relationships:
        - predicate: owl:sameAs
          subject: $0
          object: CUSTODIAN
          confidence: 0.8

    - pattern: '^de\s+vereniging$'
      entity_type: GRP.ASS
      relationships:
        - predicate: owl:sameAs
          subject: $0
          object: CUSTODIAN
          confidence: 0.8

    - pattern: '^management\s+team$'
      entity_type: GRP.UNT
      relationships:
        - predicate: org:unitOf
          subject: $0
          object: CUSTODIAN
# Fixed names of Dutch government bodies (GRP.GOV) and offices (ROL.POS).
government_bodies:
  description: "Government bodies and positions"
  patterns:
    - pattern: '^gedeputeerde\s+staten$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.PRO  # Provincial government

    - pattern: '^provinciale\s+staten$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.PRO

    - pattern: '^burgemeester\s+en\s+wethouders$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.MUN

    # Typed as a position (ROL.POS), not a body.
    - pattern: '^commissaris\s+van\s+de\s+koning$'
      entity_type: ROL.POS
      entity_subtype: ROL.POS.GOV  # Government position

    - pattern: '^raad\s+van\s+state$'
      entity_type: GRP.GOV
      entity_subtype: GRP.GOV.NAT  # National government

    - pattern: '^nationale\s+ombudsman$'
      entity_type: ROL.POS
      entity_subtype: ROL.POS.GOV
# Commercial entities: trade associations, builders, bank branches.
businesses:
  description: "Commercial entities"
  patterns:
    # Group 1 is the place adjective without its "-er" suffix
    # ("Meppeler" captures "meppel").
    - pattern: '^(\w+)er\s+handelsvereniging$'
      entity_type: GRP.COR
      entity_subtype: GRP.COR.ASS  # Trade association
      capture_groups:
        1:
          type: TOP.SET
          role: place_adjective
      examples:
        - text: "Meppeler Handelsvereniging"

    - pattern: '^bouwbedrijf\s+(\w+)$'
      entity_type: GRP.COR
      capture_groups:
        1:
          type: APP.NAM
          role: company_name

    - pattern: '^rabobank\s+(\w+)$'
      entity_type: GRP.COR
      entity_subtype: GRP.COR.BNK  # Bank
      capture_groups:
        1:
          type: TOP.SET
          role: branch_location
# Cultural and arts organizations, including traditional shooting guilds.
cultural_organizations:
  description: "Cultural and arts organizations"
  patterns:
    - pattern: '^dansstudio\s+(\w+)$'
      entity_type: GRP.CUL
      capture_groups:
        1:
          type: APP.NAM
          role: studio_name

    - pattern: '^toneelvereniging\s+(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.CUL  # Cultural association
      capture_groups:
        1:
          type: APP.NAM
          role: association_name

    # Group 1 is the optional article "de"; only group 2 (the name) is typed.
    - pattern: '^schutterij\s+(de\s+)?(\w+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD  # Traditional association
      capture_groups:
        2:
          type: APP.NAM
          role: guild_name

    - pattern: '^schuttersgilde\s+([\w-]+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD
      capture_groups:
        1:
          type: APP.NAM
          role: guild_name

    - pattern: '^schuttersvereniging\s+([\w-]+)$'
      entity_type: GRP.ASS
      entity_subtype: GRP.ASS.TRD
      capture_groups:
        1:
          type: APP.NAM
          role: association_name
# ---------------------------------------------------------------------------
|
|
# BUILDING/PLACE PATTERNS (TOP.*)
|
|
# Physical structures, estates, monuments, religious buildings
|
|
# ---------------------------------------------------------------------------
|
|
|
|
buildings_places:
|
|
description: "Patterns identifying physical locations and structures"
|
|
|
|
# Castles, estates, manor houses, farms and inns.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
castles_estates:
  description: "Castles, estates, and manor houses"
  patterns:
    # Matches single-word castle names only; multi-word names need their own
    # pattern, like "kasteel oud haarlem" below.
    - pattern: '^kasteel\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CAS  # Castle
      label_template: "Kasteel {1}"
      capture_groups:
        1:
          type: APP.NAM
          role: castle_name
      relationships:
        - predicate: org:hasSite
          subject: CUSTODIAN
          object: $0
          confidence: 0.7

    - pattern: '^kasteel\s+oud\s+haarlem$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CAS
      examples:
        - text: "Kasteel Oud Haarlem"

    - pattern: '^landgoed\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST  # Estate
      capture_groups:
        1:
          type: APP.NAM
          role: estate_name
      relationships:
        - predicate: org:hasSite
          subject: CUSTODIAN
          object: $0
          confidence: 0.6

    # Group 1 is the prefix word ("borg"/"de"); the estate name is group 2.
    - pattern: '^landgoed\s+(borg|de)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST
      capture_groups:
        2:
          type: APP.NAM
          role: estate_name

    - pattern: '^huize\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.MAN  # Manor house
      capture_groups:
        1:
          type: APP.NAM
          role: house_name

    - pattern: '^huis\s+(ten|van|de)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.MAN
      capture_groups:
        2:
          type: APP.NAM
          role: house_name
      examples:
        - text: "Huis ten Bosch"
        - text: "Huis van Oud"

    - pattern: '^hoeve\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FRM  # Farm/farmhouse
      capture_groups:
        2:
          type: APP.NAM
          role: farm_name

    - pattern: '^herberg\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.INN  # Historic inn
      capture_groups:
        2:
          type: APP.NAM
          role: inn_name
# Forts, defensive lines and wartime sites.
fortifications:
  description: "Forts, bunkers, defensive structures"
  patterns:
    - pattern: '^fort\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FOR  # Fortification
      capture_groups:
        1:
          type: APP.NAM
          role: fort_name
      relationships:
        - predicate: crm:P53_has_former_or_current_location
          subject: $0
          object: CUSTODIAN_LOCATION

    - pattern: '^de\s+atlantikwall$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.FOR

    # Fixed name; the location is given as a literal because the pattern has
    # no capture group.
    - pattern: '^kamp\s+vught$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.HIS  # Historic site
      relationships:
        - predicate: schema:location
          subject: $0
          object: "Vught"
          object_type: TOP.SET
# Churches, parsonages and chapels.
religious_buildings:
  description: "Churches, chapels, monasteries"
  patterns:
    # Fused compounds such as "sint janskerk": group 1 is the part before
    # the kerk/gebouw suffix.
    - pattern: '^sint\s+(\w+)(kerk|gebouw)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL  # Religious building
      capture_groups:
        1:
          type: APP.NAM
          role: saint_name
        2:
          type: null
          role: building_type

    # The place name is optional. The separator is inside the optional group
    # so that a bare "protestantse kerk" matches without trailing whitespace.
    - pattern: '^protestantse\s+(kerk|pastorie)(?:\s+(\w+))?$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: null
          role: building_type
        2:
          type: TOP.SET
          role: location_name

    - pattern: '^kapel\s+van\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: APP.NAM
          role: dedication

    # [\w-]+ so hyphenated place names such as "Nieuw-Dijk" match.
    - pattern: '^mariakapel\s+([\w-]+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.REL
      capture_groups:
        1:
          type: TOP.SET
          role: location_name
      examples:
        - text: "Mariakapel Nieuw-Dijk"
# Monuments, archaeological features and wayside crosses (TOP.FEA).
monuments:
  description: "Monuments, memorials, historic markers"
  patterns:
    # Prefix match only: the pattern ends in \b rather than $, so any text
    # may follow the listed word.
    - pattern: '^monument(en)?\s+(in|didam|loil|nieuw-dijk|oud-dijk|buurtschap)\b'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.MON  # Monument

    - pattern: '^grafheuvel\s+(\w+)$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.ARC  # Archaeological feature
      capture_groups:
        1:
          type: APP.NAM
          role: feature_name

    - pattern: '^nationaal\s+monument$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.MON

    - pattern: '^kruisbeeld\s+op\s+(\w+)$'
      entity_type: TOP.FEA
      entity_subtype: TOP.FEA.REL  # Religious monument
      capture_groups:
        1:
          type: TOP.SET
          role: location
# Venue buildings: theaters, community houses, pop stages, auditoria.
cultural_venues:
  description: "Theaters, community centers, museums"
  patterns:
    # The article is mandatory here; only the name (group 2) is typed.
    - pattern: '^theater\s+(de|het)\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CUL  # Cultural building
      capture_groups:
        2:
          type: APP.NAM
          role: theater_name

    - pattern: '^buurthuis\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.COM  # Community building
      capture_groups:
        1:
          type: APP.NAM
          role: building_name

    - pattern: '^poppodium\s+(de\s+)?(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.CUL
      capture_groups:
        2:
          type: APP.NAM
          role: venue_name
      examples:
        - text: "Poppodium de Peppel"

    - pattern: '^aula\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EDU  # Educational building
      capture_groups:
        1:
          type: APP.NAM
          role: building_name
# Industrial sites; every pattern is "<site type> <single-word name>".
industrial_heritage:
  description: "Mills, factories, industrial sites"
  patterns:
    - pattern: '^kalkoven\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND  # Industrial building
      capture_groups:
        1:
          type: APP.NAM
          role: site_name

    - pattern: '^scheepswerf\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND
      capture_groups:
        1:
          type: APP.NAM
          role: shipyard_name

    - pattern: '^werkplaats\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.IND
      capture_groups:
        1:
          type: APP.NAM
          role: workshop_name
# Gardens and parks.
parks_gardens:
  description: "Parks, gardens, nature reserves"
  patterns:
    # The garden name is optional. The separator is inside the optional group
    # so that a bare "botanische tuin" matches without trailing whitespace.
    - pattern: '^botanische\s+tuin(?:\s+(\w+))?$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GAR  # Garden
      capture_groups:
        1:
          type: APP.NAM
          role: garden_name

    - pattern: '^pinetum\s+(\w+)$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.GAR
      capture_groups:
        1:
          type: APP.NAM
          role: arboretum_name

    # Typed as a geographic feature (TOP.GEO), unlike the gardens above.
    - pattern: '^landschapspark\s+(\w+)$'
      entity_type: TOP.GEO
      entity_subtype: TOP.GEO.PRK  # Park
      capture_groups:
        1:
          type: APP.NAM
          role: park_name
# Specific named places that do not fit a generic "<type> <name>" pattern.
places_named:
  description: "Named places and locations"
  patterns:
    - pattern: '^dekema\s+state$'
      entity_type: TOP.BLD
      entity_subtype: TOP.BLD.EST

    # Group 1 is the place the hamlet is named after.
    - pattern: '^klein\s+(amerika|rome|zundert)$'
      entity_type: TOP.SET
      entity_subtype: TOP.SET.HAM  # Hamlet/small settlement
      capture_groups:
        1:
          type: APP.NAM
          role: place_reference
# ---------------------------------------------------------------------------
|
|
# PUBLICATION/WORK PATTERNS (WRK.*)
|
|
# Publications, periodicals, books, reports
|
|
# ---------------------------------------------------------------------------
|
|
|
|
publications:
|
|
description: "Patterns identifying publications and works"
|
|
|
|
# Serial publications: yearbooks, association magazines, local papers.
# NOTE(review): nesting was reconstructed from a flattened copy; `confidence`
# is assumed to belong to the relationship it follows — confirm.
periodicals:
  description: "Magazines, newsletters, journals"
  patterns:
    # Matches single-word titles only; group 1 is the optional plural "en",
    # so the title is group 2.
    - pattern: '^jaarboek(en)?\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER  # Serial publication
      label_template: "Jaarboek {2}"
      capture_groups:
        2:
          type: APP.NAM
          role: publication_name
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN
          confidence: 0.85

    # Dedicated pattern for the two-word title "Aover Diem".
    - pattern: '^jaarboek(en)?\s+aover\s+diem$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN
      examples:
        - text: "Jaarboeken Aover Diem"

    - pattern: '^verenigingsblad\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        1:
          type: APP.NAM
          role: publication_name
      relationships:
        - predicate: dcterms:publisher
          subject: $0
          object: CUSTODIAN

    - pattern: '^verenigingsorgaan\s+(de\s+)?(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        2:
          type: APP.NAM
          role: publication_name

    # Fixed titles of individual periodicals.
    - pattern: '^myerlese\s+koerier$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^nijmeegs\s+katern$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^old\s+ni-js(\s+edities)?$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^roggels\s+blaadje$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^suetan\s+kwartaalbladen$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^tusken\s+de\s+marren$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^verleden\s+tijdschrift$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER

    - pattern: '^dedemsvaartse\s+courant$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
# Book series and publication collections.
book_series:
  description: "Book series and monographs"
  patterns:
    - pattern: '^reeuwijkse\s+(bronnen|reeks)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.SER
      capture_groups:
        1:
          type: null
          role: series_type

    - pattern: '^uitgaven\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.COL  # Collection
      capture_groups:
        1:
          type: APP.NAM
          role: publisher_name

    - pattern: '^publicaties\s+(\w+)$'
      entity_type: WRK.MAN
      entity_subtype: WRK.MAN.COL
      capture_groups:
        1:
          type: APP.NAM
          role: collection_name
# ---------------------------------------------------------------------------
|
|
# ROLE/OCCUPATION PATTERNS (ROL.*)
|
|
# Job titles, positions, functions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
roles:
|
|
description: "Patterns identifying roles, positions, and occupations"
|
|
|
|
# Job titles that appear as text on a page and are occupations, not names.
job_titles:
  description: "Professional job titles"
  patterns:
    # The custodian is the subject of org:role; the matched title ($0) is
    # the object.
    - pattern: '^senior\s+(applicatiebeheerder|systeembeheerder)$'
      entity_type: ROL.OCC
      entity_subtype: ROL.OCC.TEC  # Technical occupation
      capture_groups:
        1:
          type: null
          role: job_specialty
      relationships:
        - predicate: org:role
          subject: CUSTODIAN
          object: $0
      examples:
        - text: "Senior Applicatiebeheerder"

    - pattern: '^financial\s+controller$'
      entity_type: ROL.OCC
      entity_subtype: ROL.OCC.ADM  # Administrative occupation
# ---------------------------------------------------------------------------
|
|
# PERSON PATTERNS (AGT.*)
|
|
# Historical figures, references to people
|
|
# ---------------------------------------------------------------------------
|
|
|
|
persons:
|
|
description: "Patterns identifying references to persons (not contact persons)"
|
|
|
|
# Well-known historical persons, each tied to a Wikidata item.
# NOTE(review): nesting was reconstructed from a flattened copy; in the first
# pattern `description` is assumed to belong to the relationship and
# `wikidata_id` to the pattern (as in the other three patterns) — confirm.
historical_figures:
  description: "Famous historical figures mentioned in content"
  patterns:
    - pattern: '^vincent\s+van\s+gogh$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART  # Artist
      relationships:
        - predicate: schema:mentions
          subject: CUSTODIAN
          object: $0
          description: "Custodian mentions this historical figure"
      wikidata_id: Q5582

    - pattern: '^rembrandt(\s+van\s+rijn)?$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q5598

    # The full name and the bare surname map to the same Wikidata item.
    - pattern: '^johannes\s+vermeer$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q41264

    - pattern: '^vermeer$'
      entity_type: AGT.PER
      entity_subtype: AGT.PER.ART
      wikidata_id: Q41264
# =============================================================================
|
|
# DISCARD PATTERNS - Patterns that identify non-entities to be filtered out
|
|
# =============================================================================
|
|
|
|
discard_patterns:
|
|
description: >-
|
|
Patterns matching text that should NOT be extracted as entities.
|
|
These are UI elements, navigation text, form labels, etc.
|
|
|
|
# URLs, file names and opaque identifiers that are never entity names.
# Patterns without ^ or $ match anywhere in the text.
urls_technical:
  description: "URLs and technical strings"
  patterns:
    - pattern: '^https?://'
      discard_reason: "URL - not an entity name"
    - pattern: '^www\.'
      discard_reason: "URL fragment"
    - pattern: '\.html$'
      discard_reason: "File extension"
    - pattern: '\.php$'
      discard_reason: "File extension"
    - pattern: '\.aspx?$'
      discard_reason: "File extension"
    - pattern: '/photos/'
      discard_reason: "URL path segment"
    - pattern: '/places/'
      discard_reason: "URL path segment"
    - pattern: '^ChIJ'
      discard_reason: "Google Place ID"
    - pattern: '^AWn5SU'
      discard_reason: "Google photo ID"
    - pattern: 'WordPress'
      discard_reason: "CMS name"
# Navigation and accessibility link text, in Dutch and English.
# Patterns ending in \b are prefix matches; those ending in $ match the
# whole text.
navigation:
  description: "Website navigation elements"
  patterns:
    - pattern: '^menu\s+schakelen$'
      discard_reason: "Navigation toggle"
    - pattern: '^go\s+to\s+top$'
      discard_reason: "Navigation link"
    - pattern: '^page\s+load\s+link$'
      discard_reason: "Navigation element"
    - pattern: '^skip\s+to\b'
      discard_reason: "Accessibility navigation"
    - pattern: '^jump\s+to\b'
      discard_reason: "Navigation link"
    - pattern: '^ga\s+naar\b'
      discard_reason: "Dutch navigation"
    - pattern: '^terug\s+naar\b'
      discard_reason: "Dutch navigation (back to)"
    - pattern: '^naar\s+(de|het|inhoud|menu)\b'
      discard_reason: "Dutch navigation"
    - pattern: '^back\s+to\b'
      discard_reason: "Navigation link"
    - pattern: '^footer\s+navigatie$'
      discard_reason: "Footer navigation"
    - pattern: '^hoofd\s*navigatie$'
      discard_reason: "Main navigation"
    - pattern: '^volg\s+ons\b'
      discard_reason: "Social media CTA"
    - pattern: '^follow\s+(us|this)\b'
      discard_reason: "Social media CTA"
    - pattern: '^menu\s+overslaan$'
      discard_reason: "Skip menu"
    - pattern: '^scroll\s+naar\b'
      discard_reason: "Scroll instruction"
# Form placeholders, instructions and button captions.
form_buttons:
  description: "Form labels and button text"
  patterns:
    - pattern: '^typ\s+hier\b'
      discard_reason: "Form placeholder"
    - pattern: '^vul\s+in\b'
      discard_reason: "Form instruction"
    - pattern: '^selecteer\b'
      discard_reason: "Form instruction"
    - pattern: '^kies\s+'
      discard_reason: "Form instruction"
    # Matches "zoek", "zoeken", "zoek in" and "zoeken in".
    - pattern: '^zoek(en)?(\s+in)?$'
      discard_reason: "Search button"
    - pattern: '^aanmeld(en|ing)$'
      discard_reason: "Registration button"
    - pattern: '^afmeld(en|ing)$'
      discard_reason: "Unsubscribe button"
    - pattern: '^reserv(eren|ering)$'
      discard_reason: "Reservation button"
    - pattern: '^verzend(en)?$'
      discard_reason: "Submit button"
    - pattern: '^accepteer\b'
      discard_reason: "Accept button"
    - pattern: '^afwijzen\b'
      discard_reason: "Reject button"
    - pattern: '^akkoord$'
      discard_reason: "OK button"
    - pattern: '^instellingen\s+opslaan$'
      discard_reason: "Save settings button"
# Boilerplate page and section headings; every pattern matches the whole text.
section_headers:
  description: "Generic section headers"
  patterns:
    - pattern: '^laatste\s+nieuws$'
      discard_reason: "Section header"
    - pattern: '^over\s+(ons|deze)$'
      discard_reason: "About section"
    - pattern: '^missie\s+en\s+visie$'
      discard_reason: "Mission/vision section"
    - pattern: '^contact$'
      discard_reason: "Contact section"
    - pattern: '^contactgegevens$'
      discard_reason: "Contact details section"
    - pattern: '^bereikbaarheid$'
      discard_reason: "Directions section"
    - pattern: '^openingstijden$'
      discard_reason: "Opening hours section"
    - pattern: '^parkeren$'
      discard_reason: "Parking section"
    - pattern: '^bezoekadres$'
      discard_reason: "Visitor address section"
    - pattern: '^postadres$'
      discard_reason: "Postal address section"
    - pattern: '^privacybeleid$'
      discard_reason: "Privacy policy"
    - pattern: '^disclaimer$'
      discard_reason: "Disclaimer section"
    - pattern: '^colofon$'
      discard_reason: "Colophon section"
    - pattern: '^sitemap$'
      discard_reason: "Sitemap"
    - pattern: '^veelgestelde\s+vragen$'
      discard_reason: "FAQ section"
# Teaser and link phrases ("read more", "view all", "more info").
website_content:
  description: "Common website content phrases"
  patterns:
    - pattern: '^lees\s+meer\b'
      discard_reason: "Read more link"
    - pattern: '^bekijk\s+(de|het|alle|meer)\b'
      discard_reason: "View more link"
    - pattern: '^download\s+(de|het)\b'
      discard_reason: "Download link"
    - pattern: '^meer\s+info(rmatie)?$'
      discard_reason: "More info link"
    - pattern: '^handige\s+(info|links)$'
      discard_reason: "Useful links section"
    - pattern: '^gratis\s+(toegang|qr)\b'
      discard_reason: "Free access notice"
# Single words that are never entities on their own.
# Unlike the other discard groups this one is a plain list of literal values
# under `values`, not regex patterns.
single_words:
  description: "Single-word false positives"
  values:
    - admin
    - contact
    - home
    - menu
    - zoeken
    - search
    - login
    - inloggen
    - registreren
    - help
    - info
    - nieuws
    - agenda
    - kalender
    - archief
    - collectie
    - beeldbank
    - bronnen
    - links
    - partners
    - sponsors
    - doneren
    - lidmaatschap
    - privacy
    - disclaimer
    - sitemap
    - colofon
    - cookies
    - vacatures
# ---------------------------------------------------------------------------
|
|
# ADDITIONAL PHRASE PATTERNS - Action verbs, instructions, content phrases
|
|
# From PHRASE_PATTERNS lines 167-1144
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Imperative call-to-action phrases (register, donate, rent, reserve, ...).
# Most entries are a verb followed by a small set of permitted next words.
action_instructions:
  description: "Call-to-action and instruction phrases"
  patterns:
    - pattern: '^meld\s+(je|u|een)\b'
      discard_reason: "Registration CTA"
    - pattern: '^geef\s+(je|uw)\b'
      discard_reason: "Form instruction"
    # Matches both "word" and "wordt".
    - pattern: '^word[t]?\s+(lid|vriend|abonnee)$'
      discard_reason: "Membership CTA"
    - pattern: '^steun\s+(het|de|ons)\b'
      discard_reason: "Donation CTA"
    - pattern: '^huur\s+(een|het|de|eigendom)\b'
      discard_reason: "Rental CTA"
    - pattern: '^schrijf\s+(je|ons)$'
      discard_reason: "Subscription CTA"
    - pattern: '^stuur\s+(een|foto)$'
      discard_reason: "Submission CTA"
    - pattern: '^reserveer\s+een\b'
      discard_reason: "Reservation CTA"
    - pattern: '^plan\s+uw\b'
      discard_reason: "Planning CTA"
    - pattern: '^kom\s+(in|verder)$'
      discard_reason: "Invitation CTA"
    - pattern: '^klik\s+voor\b'
      discard_reason: "Click instruction"
    - pattern: '^ontdek\s+(de|jouw|ons)$'
      discard_reason: "Discovery CTA"
    - pattern: '^vind\s+ons$'
      discard_reason: "Find us CTA"
    - pattern: '^verstuur\s+bericht$'
      discard_reason: "Send message CTA"
    - pattern: '^profiel\s+wijzigen$'
      discard_reason: "Edit profile CTA"
    - pattern: '^scans\s+aanvragen$'
      discard_reason: "Request scans CTA"
    - pattern: '^vraag\s+(en|of|stellen|afvalpas)$'
      discard_reason: "Question/request CTA"
    - pattern: '^vragen\s+(en|staat|over)\b'
      discard_reason: "Questions section"
|
|
|
|
# Headings about membership, joining and annual contributions.
membership_sections:
  description: "Membership and subscription sections"
  patterns:
    - pattern: '^leden\s+(administratie|en\s+lidmaatschap)$'
      discard_reason: "Membership admin section"
    - pattern: '^lid\s+(worden|worden\s+inloggen)$'
      discard_reason: "Join membership CTA"
    # "lidmaatschap" followed by exactly one further word.
    - pattern: '^lidmaatschap\s+\w+$'
      discard_reason: "Membership section"
    - pattern: '^soort\s+lidmaatschap$'
      discard_reason: "Membership type section"
    - pattern: '^jaarlijkse\s+bijdrage$'
      discard_reason: "Annual contribution section"
|
|
|
|
# Phrases from water board (waterschap) websites: operations, taxes,
# regulations and water-quality topics.
water_management:
  description: "Water board and environmental management content"
  patterns:
    # Requires whitespace after the keyword, so the bare word alone is kept.
    - pattern: '^(legger|peilbesluit|proefsluiting|vervanging|vernieuwen|onderhoud|metingen|bediening)\s+'
      discard_reason: "Water board operations"
    - pattern: '^waterschapsbelasting\b'
      discard_reason: "Water board tax"
    - pattern: '^ons\s+gebied\b'
      discard_reason: "Our area section"
    - pattern: '^handhavingsverzoek\b'
      discard_reason: "Enforcement request"
    - pattern: '^waterbeheer\s+en\b'
      discard_reason: "Water management section"
    - pattern: '^waterkwaliteit\s+\w+$'
      discard_reason: "Water quality section"
    # NOTE(review): this discards two-word strings such as a water board's own
    # name; confirm that such names should not be kept as organisation
    # entities.
    - pattern: '^waterschap\s+\w+$'
      discard_reason: "Water board name"
    - pattern: '^waterschapsverordening\s+en\b'
      discard_reason: "Water board regulation"
    - pattern: '^waterpeil\s+en\b'
      discard_reason: "Water level section"
    - pattern: '^natuur\s+en\s+waterkwaliteit$'
      discard_reason: "Nature and water quality"
    - pattern: '^recreatie\s+rondom\s+water$'
      discard_reason: "Recreation around water"
    - pattern: '^landbouw\s+en\s+water(kwaliteit)?$'
      discard_reason: "Agriculture and water"
    - pattern: '^klimaat\s+en\s+veiligheid$'
      discard_reason: "Climate and safety"
    - pattern: '^kaderrichtlijn\s+water$'
      discard_reason: "Water framework directive"
    - pattern: '^meten\s+van\s+de\s+waterkwaliteit$'
      discard_reason: "Water quality measurement"
    - pattern: '^voldoende\s+water$'
      discard_reason: "Sufficient water section"
    - pattern: '^natuurvriendelijke\s+oever$'
      discard_reason: "Natural riverbank"
    - pattern: '^oevers\s+\w+$'
      discard_reason: "Riverbanks section"
|
|
|
|
# Headings for historical, archaeological and genealogical content.
# Entries ending in \w+$ accept exactly one further word after the keyword.
heritage_content:
  description: "Heritage and historical content phrases"
  patterns:
    - pattern: '^historie\s+(van\s+)?\w+$'
      discard_reason: "History section"
    - pattern: '^historisch(e)?\s+(coevorden|spektakel|avond|fietsroute|geografie|groenten|projecten|wandeling)\b'
      discard_reason: "Historical content section"
    - pattern: '^gevelstenen\s+in\b'
      discard_reason: "Facade stones section"
    - pattern: '^grafvondst\s+bij\b'
      discard_reason: "Grave find section"
    - pattern: '^erfgoedcollecties\s+van\b'
      discard_reason: "Heritage collections section"
    - pattern: '^vondsten\s+in\b'
      discard_reason: "Finds section"
    - pattern: '^lokale\s+vondsten$'
      discard_reason: "Local finds section"
    - pattern: '^tijdlijn\s+vondsten$'
      discard_reason: "Finds timeline"
    - pattern: '^opgraving\s+\w+$'
      discard_reason: "Excavation section"
    - pattern: '^militaire\s+historie$'
      discard_reason: "Military history section"
    - pattern: '^genealogische\s+begrippen$'
      discard_reason: "Genealogical terms"
    # Accepts the ASCII spelling "notariele" as well as the standard Dutch
    # spelling with diaeresis, "notariële".
    # NOTE(review): assumes the matched text uses the precomposed character
    # (NFC); confirm against the consumer's normalisation.
    - pattern: '^notari[eë]le\s+archieven$'
      discard_reason: "Notarial archives section"
    - pattern: '^voorouders\s+op\b'
      discard_reason: "Ancestors section"
    - pattern: '^larense\s+voorouders$'
      discard_reason: "Laren ancestors section"
    - pattern: '^personenbestand\s+\w+$'
      discard_reason: "Person database section"
    - pattern: '^namenlijst\s+\w+$'
      discard_reason: "Name list section"
|
|
|
|
# Headings about guided tours, walks and group visits.
tours_visits:
  description: "Tour and visit related content"
  patterns:
    - pattern: '^rondleiding\s+\w+$'
      discard_reason: "Tour section"
    - pattern: '^rondleidingen\s+en\b'
      discard_reason: "Tours section"
    - pattern: '^rondwandeling\s+door\b'
      discard_reason: "Walking tour"
    - pattern: '^virtuele\s+tour$'
      discard_reason: "Virtual tour"
    - pattern: '^groepen\s+aanmelden$'
      discard_reason: "Group registration"
    - pattern: '^programma\s+voor\s+groepen$'
      discard_reason: "Group program"
    - pattern: '^wensen\s+rondleiding$'
      discard_reason: "Tour wishes"
    - pattern: '^landgoedrondleiding\b'
      discard_reason: "Estate tour"
|
|
|
|
# Headings about locations, branches, routes and means of transport.
location_directions:
  description: "Location and directions content"
  patterns:
    - pattern: '^locatie\s+\w+$'
      discard_reason: "Location section"
    - pattern: '^locaties\s+\w+$'
      discard_reason: "Locations section"
    # "overige locaties" is also one of the alternatives of the
    # '^overige\s+(...)$' pattern in overview_sections.
    - pattern: '^overige\s+locaties$'
      discard_reason: "Other locations"
    - pattern: '^vestiging\s+\w+$'
      discard_reason: "Branch location"
    - pattern: '^route\s+(en|per)$'
      discard_reason: "Route section"
    - pattern: '^per\s+(auto|boot)$'
      discard_reason: "By car/boat directions"
    - pattern: '^met\s+het\s+ov$'
      discard_reason: "Public transport"
    - pattern: '^naar\s+(google\s+maps|bestuurspagina|boven\s+scrollen|veelgestelde\s+vragen)$'
      discard_reason: "Navigation link"
    - pattern: '^vanuit\s+\w+$'
      discard_reason: "From location"
|
|
|
|
# Opening-hours fragments: closed days, public holidays, "closed on ...".
time_schedule:
  description: "Time and schedule related content"
  patterns:
    - pattern: '^dag\s+tijden$'
      discard_reason: "Day times"
    # Any single word ending in "dag" followed by "gesloten".
    - pattern: '^\w+dag\s+gesloten$'
      discard_reason: "Day closed"
    - pattern: '^goede\s+vrijdag$'
      discard_reason: "Good Friday"
    - pattern: '^tweede\s+(paasdag|pinksterdag)$'
      discard_reason: "Holiday name"
    - pattern: '^vandaag\s+gesloten$'
      discard_reason: "Closed today"
    - pattern: '^morgen\s+gesloten$'
      discard_reason: "Closed tomorrow"
    - pattern: '^ook\s+(aanwezig|gesloten)$'
      discard_reason: "Also present/closed"
    - pattern: '^gesloten\s+op\b'
      discard_reason: "Closed on"
|
|
|
|
# Headings for lectures, courses, walks, weddings and other activities.
events_activities:
  description: "Events and activities content"
  patterns:
    - pattern: '^lezingen\s+en\s+\w+$'
      discard_reason: "Lectures section"
    - pattern: '^thema\s+avonden$'
      discard_reason: "Theme evenings"
    - pattern: '^komende\s+activiteiten$'
      discard_reason: "Upcoming activities"
    - pattern: '^cursus\s+\w+$'
      discard_reason: "Course section"
    - pattern: '^wandel\s+en\b'
      discard_reason: "Walking section"
    - pattern: '^wandelapp\s+\w+$'
      discard_reason: "Walking app"
    - pattern: '^wandelen\s+en\s+fietsen$'
      discard_reason: "Walking and cycling"
    - pattern: '^wandelkaart\s+\w+$'
      discard_reason: "Walking map"
    - pattern: '^struinpad\s+wandelingen$'
      discard_reason: "Trail walks"
    - pattern: '^trouwen\s+in\b'
      discard_reason: "Weddings section"
    - pattern: '^zakelijke\s+bijeenkomsten$'
      discard_reason: "Business meetings"
    - pattern: '^jubileum\s+fietsroute$'
      discard_reason: "Anniversary bike route"
|
|
|
|
# References to online services and third-party platforms.
online_services:
  description: "Online services and digital content"
  patterns:
    - pattern: '^online\s+(afspraak|betalen|doneren|exposities|platform|reserveren|vraag)$'
      discard_reason: "Online service"
    - pattern: '^website\s+(beheer|gemeenteraad)$'
      discard_reason: "Website section"
    - pattern: '^webdesign\s+bureau\b'
      discard_reason: "Web design"
    - pattern: '^google\s+maps$'
      discard_reason: "Google Maps reference"
    - pattern: '^gebruik\s+google\s+maps$'
      discard_reason: "Use Google Maps"
    - pattern: '^flickr\s+fotoalbum$'
      discard_reason: "Flickr photo album"
|
|
|
|
# "About ..." and "Our ..." headings, including a fixed list of
# site-specific names after "over".
about_sections:
  description: "About and overview sections"
  patterns:
    - pattern: '^over\s+(batavialand|bergh|haaksbergen|heemskerk|lkca|laren|museumpark|numaga|nuwelant|rijnland|roosendaal|rozet|ruurd|onze\s+website)$'
      discard_reason: "About section"
    # Start-anchored only: "over de/het/dekema" followed by whitespace and
    # anything else.
    - pattern: '^over\s+(de|het|dekema)\s+'
      discard_reason: "About section"
    - pattern: '^ons\s+(adres|bestuur|huisblad|kantoor|team|werkgebied)$'
      discard_reason: "Our [X] section"
    - pattern: '^wat\s+(doen|doet|we|wij)$'
      discard_reason: "What we do section"
    - pattern: '^wie\s+(we|wij)\s+zijn$'
      discard_reason: "Who we are section"
    - pattern: '^visie\s+en\s+missie$'
      discard_reason: "Vision and mission"
    - pattern: '^missie\s+en\s+doelen$'
      discard_reason: "Mission and goals"
    - pattern: '^ontstaan\s+\w+$'
      discard_reason: "Origin section"
|
|
|
|
# "Overview of ...", "Other ...", "More ...", "Most ..." and "Latest ..."
# list headings.
overview_sections:
  description: "Overview and list sections"
  patterns:
    - pattern: '^overzicht\s+(rijksmonumenten|skriemers|archeologische|bouwlocaties|exposities|formulieren|tijdschriften)$'
      discard_reason: "Overview section"
    - pattern: '^overige\s+(uitgaven|documenten|locaties|organisaties|vrijwilligers)$'
      discard_reason: "Other [X] section"
    - pattern: '^meer\s+(fers|meldingen|natuurmusea|over|telefoonnummers|weten)$'
      discard_reason: "More [X] section"
    - pattern: '^meest\s+(bekeken|recente)\b'
      discard_reason: "Most [X] section"
    - pattern: '^recente\s+berichten$'
      discard_reason: "Recent posts"
    # Matches both "laatst" and "laatste".
    - pattern: '^laatst(e)?\s+(verschenen|update|nieuws)$'
      discard_reason: "Latest [X] section"
|
|
|
|
# Headings and labels for link lists and external references.
links_references:
  description: "Links and reference sections"
  patterns:
    - pattern: '^link\s+naar\b'
      discard_reason: "Link to"
    - pattern: '^links\s+\w+$'
      discard_reason: "Links section"
    - pattern: '^interessante\s+links$'
      discard_reason: "Interesting links"
    - pattern: '^partner\s+links$'
      discard_reason: "Partner links"
    - pattern: '^nuttige\s+websites$'
      discard_reason: "Useful websites"
    - pattern: '^wikipedia\s+\w+$'
      discard_reason: "Wikipedia reference"
|
|
|
|
# Headings and buttons for filing complaints and reporting incidents.
reports_complaints:
  description: "Reports and complaints sections"
  patterns:
    - pattern: '^klacht\s+\w+$'
      discard_reason: "Complaint section"
    - pattern: '^klachten\s+\w+$'
      discard_reason: "Complaints section"
    - pattern: '^meldingen\s+(en|zonder|over)\b'
      discard_reason: "Reports section"
    - pattern: '^meld\s+(direct|het|overlast)$'
      discard_reason: "Report CTA"
    - pattern: '^incident\s+melden$'
      discard_reason: "Report incident"
    - pattern: '^storing\s+melden$'
      discard_reason: "Report malfunction"
    # "schade", one word, then "melden".
    - pattern: '^schade\s+\w+\s+melden$'
      discard_reason: "Report damage"
    - pattern: '^spoedeisende\s+meldingen$'
      discard_reason: "Emergency reports"
|
|
|
|
# Headings about governance, privacy statements, terms and supervision.
governance_policy:
  description: "Governance and policy content"
  patterns:
    - pattern: '^committee\s+van\b'
      discard_reason: "Committee section"
    - pattern: '^governance\s+code\b'
      discard_reason: "Governance code"
    - pattern: '^coordinated\s+vulnerability\b'
      discard_reason: "Security policy"
    - pattern: '^privacyverklaring\s+\w+$'
      discard_reason: "Privacy statement"
    - pattern: '^wijziging\s+privacyverklaring$'
      discard_reason: "Privacy statement change"
    - pattern: '^voorwaarden\s+(en|zonder)$'
      discard_reason: "Terms section"
    - pattern: '^richtlijnen\s+en\b'
      discard_reason: "Guidelines section"
    - pattern: '^wet\s+open\s+overheid$'
      discard_reason: "Open government law"
    - pattern: '^toetsing\s+\w+$'
      discard_reason: "Assessment section"
    - pattern: '^toezicht\s+en\b'
      discard_reason: "Supervision section"
    - pattern: '^verbonden\s+partijen$'
      discard_reason: "Related parties"
|
|
|
|
# Service desks, information points and portals; most entries are a keyword
# followed by exactly one further word.
services_facilities:
  description: "Services and facilities content"
  patterns:
    - pattern: '^service\s+contact$'
      discard_reason: "Service contact"
    - pattern: '^servicepunt\s+\w+$'
      discard_reason: "Service point"
    - pattern: '^infopunt\s+\w+$'
      discard_reason: "Info point"
    - pattern: '^informatiepunt\s+\w+$'
      discard_reason: "Information point"
    - pattern: '^informatiecentrum\s+\w+$'
      discard_reason: "Information center"
    - pattern: '^kenniscentrum\s+\w+$'
      discard_reason: "Knowledge center"
    - pattern: '^uitleenpunt\s+\w+$'
      discard_reason: "Lending point"
    - pattern: '^portaal\s+\w+$'
      discard_reason: "Portal section"
|
|
|
|
# Ticketing, pricing and shopping-cart labels.
tickets_shop:
  description: "Tickets and shopping content"
  patterns:
    - pattern: '^tickets\s+(contact|en|kopen)$'
      discard_reason: "Tickets section"
    - pattern: '^tarieven\s+en\b'
      discard_reason: "Prices section"
    - pattern: '^winkelwagen\s+\w+$'
      discard_reason: "Shopping cart"
    - pattern: '^shopping\s+cart$'
      discard_reason: "Shopping cart"
|
|
|
|
# Headings about sponsors, subsidies and financial support.
sponsors_support:
  description: "Sponsors and support content"
  patterns:
    - pattern: '^sponsors\s+en\b'
      discard_reason: "Sponsors section"
    - pattern: '^sponsoring\s+(en|aanvragen)$'
      discard_reason: "Sponsoring section"
    - pattern: '^steun\s+(orientalis|structureel)$'
      discard_reason: "Support section"
    - pattern: '^structurele\s+ondersteuning$'
      discard_reason: "Structural support"
    - pattern: '^subsidies\s+en\b'
      discard_reason: "Subsidies section"
    - pattern: '^subsidieverstrekkers\b'
      discard_reason: "Subsidy providers"
    - pattern: '^supporter\s+\w+$'
      discard_reason: "Supporter section"
|
|
|
|
# Headings aimed at schools, youth and other target audiences.
education_youth:
  description: "Education and youth content"
  patterns:
    - pattern: '^jeugd\s+en\s+onderwijs$'
      discard_reason: "Youth and education"
    - pattern: '^onderwijs\s+en\s+jeugd$'
      discard_reason: "Education and youth"
    - pattern: '^scholen\s+\w+$'
      discard_reason: "Schools section"
    # "voor" followed by one of a fixed set of audiences.
    - pattern: '^voor\s+(wo\s+ii|bezoekers|de\s+(jeugd|media|pers)|het\s+onderwijs|onderwijsinstellingen|professionals)$'
      discard_reason: "For [audience] section"
    - pattern: '^voorschoolse\s+\w+$'
      discard_reason: "Preschool section"
    - pattern: '^kids\s+academy$'
      discard_reason: "Kids academy"
    - pattern: '^kinderen\s+bij\b'
      discard_reason: "Children section"
    - pattern: '^middelbaar\s+beroepsonderwijs$'
      discard_reason: "Vocational education"
    - pattern: '^stages\s+en\s+afstuderen$'
      discard_reason: "Internships section"
|
|
|
|
# Headings about volunteers, vacancies and working at the organisation.
volunteers_staff:
  description: "Volunteers and staff content"
  patterns:
    - pattern: '^vrijwilligersuitje\s+\w+$'
      discard_reason: "Volunteer outing"
    - pattern: '^welkom\s+nieuwe\s+vrijwilliger$'
      discard_reason: "Welcome new volunteer"
    - pattern: '^vacature\s+\w+$'
      discard_reason: "Vacancy section"
    - pattern: '^werken\s+bij\b'
      discard_reason: "Work at section"
    - pattern: '^medewerkers\s+zoeken$'
      discard_reason: "Search employees"
|
|
|
|
# Boilerplate labels from website templates and CMS widgets (mostly English).
# Several entries match a word repeated twice, e.g. "statistics statistics".
technical_website:
  description: "Technical website elements"
  patterns:
    - pattern: '^no\s+events$'
      discard_reason: "No events message"
    - pattern: '^recent\s+(comments|posts)$'
      discard_reason: "Recent content"
    - pattern: '^search\s+submit\b'
      discard_reason: "Search submit"
    - pattern: '^share\s+this$'
      discard_reason: "Share this"
    - pattern: '^statistics\s+statistics$'
      discard_reason: "Statistics"
    - pattern: '^strictly\s+necessary$'
      discard_reason: "Cookie notice"
    - pattern: '^system\s+management$'
      discard_reason: "System management"
    - pattern: '^my\s+account$'
      discard_reason: "My account"
    - pattern: '^other\s+languages$'
      discard_reason: "Language selector"
    - pattern: '^product\s+families$'
      discard_reason: "Product categories"
    - pattern: '^form\s+submissions$'
      discard_reason: "Form submissions"
    - pattern: '^global\s+websites$'
      discard_reason: "Global websites"
    - pattern: '^inloggen\s+leden$'
      discard_reason: "Member login"
    - pattern: '^inhoud\s+website$'
      discard_reason: "Website contents"
    - pattern: '^inhoudsopgave\s+inhoudsopgave$'
      discard_reason: "Table of contents"
|
|
|
|
# Navigation phrases in languages other than Dutch (German, Polish, French,
# English) that appear on multilingual sites.
foreign_language:
  description: "Foreign language navigation"
  patterns:
    - pattern: '^mit\s+dem\s+(auto|fahrrad)$'
      discard_reason: "German directions"
    - pattern: '^kontakt\s+und\b'
      discard_reason: "German contact"
    - pattern: '^polskie\s+informacje$'
      discard_reason: "Polish information"
    # Accepts the unaccented "preparez" as well as the standard French
    # spelling "préparez".
    # NOTE(review): assumes the matched text uses the precomposed character
    # (NFC); confirm against the consumer's normalisation.
    - pattern: '^pr[eé]parez\s+votre\b'
      discard_reason: "French prepare"
    - pattern: '^folgen\s+sie\b'
      discard_reason: "German follow"
    - pattern: '^sich\s+einschreiben$'
      discard_reason: "German register"
    - pattern: '^international\s+visitors$'
      discard_reason: "International visitors"
|
|
|
|
# Catch-all for content phrases that fit no other category. The list mixes
# generic shapes ("X en Y", "het X", "de X", "keyword + one word") with
# literal phrases taken from individual sites.
miscellaneous_content:
  description: "Miscellaneous content phrases"
  patterns:
    # Fixed first word, "en", then any single word.
    - pattern: '^(feesten|geboorte|groen|foto|inkoop|eten|zien|beeld|groepen|genealogie|wonen)\s+en\s+\w+$'
      discard_reason: "X and Y content phrase"
    # Any single word, "en", then a fixed last word.
    - pattern: '^\w+\s+en\s+(gebruiken|overlijden|onderhoud|film|aanbesteden|drinken|doen|geluid|bidprentjes|rondleidingen|leefomgeving|ontginning|links)$'
      discard_reason: "X and Y content phrase"
    - pattern: '^het\s+(geheugen|geheim|ontstaan|kantoor|heemhuis|lichtruim|natuurhistorisch|nevelhorstmeer|olieslaan|schip|stift|veenkloosterbos)\b'
      discard_reason: "Het X section"
    # Requires further text after the listed word (trailing \s+).
    - pattern: '^de\s+(dorpsdokter|drie|egeling|japanse|klinker|oude)\s+'
      discard_reason: "De X section"
    # Exactly "de" plus one listed word.
    - pattern: '^de\s+(atlantikwall|basis|bilt|bongard|buffer|haarslag|klok|kring|lindenhoeve|mansjes|mariahoeve|nestbouwers|noodwoning|omgevingswet|quiz|skriemer|vlotter|wazerweijen|werf|zoolstede)$'
      discard_reason: "De X place/thing"
    - pattern: '^genieten\s+van\b'
      discard_reason: "Enjoy section"
    - pattern: '^hulp\s+bij\b'
      discard_reason: "Help with"
    - pattern: '^opening\s+museum$'
      discard_reason: "Museum opening"
    - pattern: '^renovatie\s+\w+$'
      discard_reason: "Renovation section"
    - pattern: '^verhuizing\s+naar\b'
      discard_reason: "Move to"
    - pattern: '^home\s+contact\b'
      discard_reason: "Home contact navigation"
    - pattern: '^in\s+(buurthuis|de\s+(laar|stad))\b'
      discard_reason: "In location"
    - pattern: '^(vier|zes)\s+(typen|kernen)\b'
      discard_reason: "Number types"
    - pattern: '^werken\s+met\s+\w+$'
      discard_reason: "Working with"
    - pattern: '^werkgebied\s+\w+$'
      discard_reason: "Work area"
    - pattern: '^werk\s+in\s+uitvoering$'
      discard_reason: "Work in progress"
    - pattern: '^serie\s+\w+$'
      discard_reason: "Series section"
    - pattern: '^dromen\s+denken\s+doen$'
      discard_reason: "Slogan"
    - pattern: '^founding\s+fathers$'
      discard_reason: "Founding fathers section"
    - pattern: '^green\s+team$'
      discard_reason: "Green team"
    - pattern: '^kijk\s+en\s+beleef$'
      discard_reason: "Look and experience"
    - pattern: '^toen\s+en\s+nu$'
      discard_reason: "Then and now"
    - pattern: '^trein\s+en\s+spoor$'
      discard_reason: "Train and track"
    - pattern: '^uit\s+in\b'
      discard_reason: "Out in"
    - pattern: '^vrij\s+zoeken$'
      discard_reason: "Free search"
    - pattern: '^object\s+van\s+de\s+maand$'
      discard_reason: "Object of the month"
    - pattern: '^objecten\s+\w+$'
      discard_reason: "Objects section"
    - pattern: '^post\s+en\s+e-mail$'
      discard_reason: "Post and email"
    - pattern: '^resultaat\s+\w+$'
      discard_reason: "Result section"
    - pattern: '^reviews\s+op\b'
      discard_reason: "Reviews on"
    - pattern: '^suggesties\s+en\b'
      discard_reason: "Suggestions section"
    - pattern: '^vergunningen\s+en\b'
      discard_reason: "Permits section"
    - pattern: '^verhalend\s+ontwerpen$'
      discard_reason: "Narrative design"
    - pattern: '^verhuizen\s+en\b'
      discard_reason: "Moving section"
    - pattern: '^verslagen\s+van\b'
      discard_reason: "Reports of"
    - pattern: '^verzonden\s+nieuwsbrieven$'
      discard_reason: "Sent newsletters"
    - pattern: '^windmolens\s+en\b'
      discard_reason: "Windmills section"
    - pattern: '^winter\s+in\b'
      discard_reason: "Winter in"
    - pattern: '^korte\s+lijnen$'
      discard_reason: "Short lines"
    - pattern: '^huidige\s+aanbod$'
      discard_reason: "Current offer"
    - pattern: '^iets\s+vragen$'
      discard_reason: "Ask something"
    - pattern: '^in\s+engeland$'
      discard_reason: "In England"
    - pattern: '^inkomende\s+telefoongesprekken$'
      discard_reason: "Incoming calls"
    - pattern: '^inleiding\s+\w+$'
      discard_reason: "Introduction"
    - pattern: '^inloop\s+\w+$'
      discard_reason: "Walk-in section"
    - pattern: '^internationale\s+samenwerking$'
      discard_reason: "International cooperation"
    - pattern: '^informatiebrochures\s+molens$'
      discard_reason: "Mill brochures"
    - pattern: '^info\s+borden$'
      discard_reason: "Info boards"
    - pattern: '^index\s+\w+$'
      discard_reason: "Index section"
    - pattern: '^samenwerking\s+met$'
      discard_reason: "Cooperation with"
    - pattern: '^schilderijen\s+kunstschilders$'
      discard_reason: "Paintings section"
    - pattern: '^planten\s+en\s+dieren$'
      discard_reason: "Plants and animals"
    - pattern: '^positieve\s+gezondheid$'
      discard_reason: "Positive health"
    - pattern: '^unieke\s+combinatie$'
      discard_reason: "Unique combination"
    - pattern: '^uittreksels\s+en\b'
      discard_reason: "Extracts section"
    - pattern: '^uitwisselen\s+van\b'
      discard_reason: "Exchange of"
    - pattern: '^voortgang\s+procedure$'
      discard_reason: "Progress procedure"
    - pattern: '^vorige\s+volgende\b'
      discard_reason: "Previous next navigation"
    - pattern: '^welkom\s+terug$'
      discard_reason: "Welcome back"
    - pattern: '^wereld\s+van\b'
      discard_reason: "World of"
    - pattern: '^wapen\s+van\b'
      discard_reason: "Coat of arms"
    - pattern: '^verdwenen\s+\w+$'
      discard_reason: "Disappeared section"
    - pattern: '^vernieuwing\s+museum\b'
      discard_reason: "Museum renewal"
    - pattern: '^vroegere\s+kringactiviteiten$'
      discard_reason: "Former activities"
    - pattern: '^vrouwelijke\s+engelandvaarders$'
      discard_reason: "Female England travelers"
    - pattern: '^wegkruisenwandelboekje\s+\w+$'
      discard_reason: "Cross walk booklet"
    - pattern: '^wegwerkzaamheden\s+en\b'
      discard_reason: "Roadworks section"
    - pattern: '^opgewekte\s+geschiedenissen$'
      discard_reason: "Cheerful histories"
    - pattern: '^omschrijving\s+van\b'
      discard_reason: "Description of"
    - pattern: '^ommetje\s+\w+$'
      discard_reason: "Short walk"
    - pattern: '^ondernemen(d)?\s+(in|nijeveen)$'
      discard_reason: "Entrepreneurship section"
    - pattern: '^ondernemers\s+kunnen\s+contact$'
      discard_reason: "Entrepreneurs contact"
    - pattern: '^onderwerpen\s+onderwerpen$'
      discard_reason: "Subjects section"
    - pattern: '^onderzoeksresultaten\s+\w+$'
      discard_reason: "Research results"
    - pattern: '^oorsprong\s+\w+$'
      discard_reason: "Origin section"
    - pattern: '^op\s+(de|het)\s+\w+$'
      discard_reason: "On the X"
    - pattern: '^openbare\s+inschrijving$'
      discard_reason: "Public registration"
    - pattern: '^openstelling\s+en\b'
      discard_reason: "Opening section"
    - pattern: '^pagina\s+voor\b'
      discard_reason: "Page for"
    - pattern: '^pakje\s+kunst$'
      discard_reason: "Package of art"
    - pattern: '^panorama\s+van\b'
      discard_reason: "Panorama of"
    - pattern: '^partner\s+webshop$'
      discard_reason: "Partner webshop"
    - pattern: '^pers\s+toolkit$'
      discard_reason: "Press toolkit"
    - pattern: '^persoonlijk\s+contact$'
      discard_reason: "Personal contact"
    - pattern: '^plaatselijk\s+belang\b'
      discard_reason: "Local interest"
    - pattern: '^poortinstructie\s+voor\b'
      discard_reason: "Gate instruction"
    - pattern: '^praat\s+mar\s+frysk$'
      discard_reason: "Speak Frisian"
    - pattern: '^recht\s+van\s+opstal$'
      discard_reason: "Right of superficies"
    - pattern: '^rabo\s+clubsupport$'
      discard_reason: "Rabo club support"
    - pattern: '^toegankelijk\s+voor\b'
      discard_reason: "Accessible for"
    - pattern: '^toelichting\s+beeldbank$'
      discard_reason: "Image bank explanation"
    - pattern: '^tijd\s+geconstateerd$'
      discard_reason: "Time detected"
    - pattern: '^varen\s+in\b'
      discard_reason: "Sailing in"
    - pattern: '^veel\s+gestelde\s+vragen$'
      discard_reason: "FAQ"
    - pattern: '^veilig\s+mailen$'
      discard_reason: "Safe email"
    - pattern: '^vakantie\s+in\b'
      discard_reason: "Holiday in"
    - pattern: '^van\s+(nieuwegeinse\s+bodem|noord|wirskaante|de\s+(bestuurstafel|voorzitter))$'
      discard_reason: "From X section"
    - pattern: '^fiscaal\s+nummer$'
      discard_reason: "Tax number"
    # Accepts the ASCII spelling "financiele" as well as the standard Dutch
    # spelling with diaeresis, "financiële".
    # NOTE(review): assumes the matched text uses the precomposed character
    # (NFC); confirm against the consumer's normalisation.
    - pattern: '^financi[eë]le\s+verantwoording$'
      discard_reason: "Financial accountability"
    - pattern: '^nieuwe\s+(aanwinsten|zaak\s+starten)$'
      discard_reason: "New acquisitions/start"
    - pattern: '^nieuw\s+wachtwoord\s+aanvragen$'
      discard_reason: "Request new password"
    - pattern: '^minder\s+valide$'
      discard_reason: "Disabled access"
    - pattern: '^methode\s+van\b'
      discard_reason: "Method of"
    - pattern: '^kaart\s+kernen$'
      discard_reason: "Map cores"
    - pattern: '^molens\s+(in|loil)$'
      discard_reason: "Mills section"
    - pattern: '^of\s+zocht\s+u$'
      discard_reason: "Or did you search"
    - pattern: '^oude\s+(ansichtkaarten|films|kerkhof)$'
      discard_reason: "Old X section"
    - pattern: '^straten\s+in\b'
      discard_reason: "Streets in"
    - pattern: '^studie\s+hoek$'
      discard_reason: "Study corner"
    - pattern: '^kunstenaars\s+in\b'
      discard_reason: "Artists in"
    - pattern: '^na\s+wo\s+ii$'
      discard_reason: "After WW2"
    - pattern: '^rijwielvordering\s+wo\s+ii$'
      discard_reason: "WW2 bicycle requisition"
    - pattern: '^rijnlands\s+vastgoed$'
      discard_reason: "Rijnland real estate"
    - pattern: '^rijnlandse\s+mascottes$'
      discard_reason: "Rijnland mascots"
    - pattern: '^rietwijk\s+of\s+reewijk$'
      discard_reason: "Rietwijk or Reewijk"
    - pattern: '^roggel\s+(leef|en\s+omgeving)$'
      discard_reason: "Roggel section"
    - pattern: '^roggelse\s+verenigingen$'
      discard_reason: "Roggel associations"
    - pattern: '^rozet\s+voor\s+jou$'
      discard_reason: "Rozet for you"
    - pattern: '^veluws\s+schoon$'
      discard_reason: "Clean Veluwe"
    # Accepts the spelling "katolieke" as well as the standard Dutch spelling
    # "katholieke" (optional "h").
    - pattern: '^kath?olieke\s+emancipatie$'
      discard_reason: "Catholic emancipation"
    - pattern: '^keur\s+van\s+grafstenen$'
      discard_reason: "Selection of gravestones"
|
|
|
|
# Literal, site-specific strings that were individually identified as false
# positives. Almost all are whole-string matches.
# NOTE(review): several discard_reason values here label the string as an
# organisation, museum, school or venue name. Confirm that these should be
# discarded rather than annotated as entities.
specific_false_positives:
  description: "Specific strings identified as false positives"
  patterns:
    - pattern: '^foto\s+(actief|herkenning|inzenden|album)$'
      discard_reason: "Photo section"
    - pattern: '^foto\s+kenneth\s+stamp$'
      discard_reason: "Photo credit"
    - pattern: '^comics\s+plus$'
      discard_reason: "Product name"
    - pattern: '^canon\s+production\b'
      discard_reason: "Canon production"
    - pattern: '^cultuurimpuls\b'
      discard_reason: "Culture impulse"
    - pattern: '^edmond\s+\w+\s+penning$'
      discard_reason: "Medal name"
    - pattern: '^eigen\s+uitgaven$'
      discard_reason: "Own publications"
    - pattern: '^een\s+australische\b'
      discard_reason: "An Australian"
    - pattern: '^brabants\s+heem$'
      discard_reason: "Brabant heritage"
    - pattern: '^buurt\s+battle$'
      discard_reason: "Neighborhood battle"
    - pattern: '^middengebied\b'
      discard_reason: "Middle area"
    - pattern: '^zwolse\s+parken$'
      discard_reason: "Zwolle parks"
    - pattern: '^zandeind\s+in\b'
      discard_reason: "Zandeind location"
    - pattern: '^zelf\s+bewaren$'
      discard_reason: "Self storage"
    - pattern: '^zeldzame\s+voorwerpen$'
      discard_reason: "Rare objects"
    - pattern: '^woldzigt\s+agenda$'
      discard_reason: "Woldzigt agenda"
    - pattern: '^acht\s+van\s+chaam$'
      discard_reason: "Eight of Chaam"
    - pattern: '^bij\s+de\s+barones$'
      discard_reason: "At the baroness"
    - pattern: '^boek\s+elle\s+klop$'
      discard_reason: "Book title"
    - pattern: '^bemmel\s+ressen\b'
      discard_reason: "Place names"
    - pattern: '^aold\s+hoksebarge$'
      discard_reason: "Dialect place name"
    - pattern: '^bientien\s+over\b'
      discard_reason: "Room name"
    - pattern: '^tonnie\s+en\s+kee\b'
      discard_reason: "Show characters"
    - pattern: '^den\s+brouwer$'
      discard_reason: "Place/building name"
    # The only entry here without a ^ anchor: matches "op gastenboek" at a
    # word boundary anywhere in the string.
    - pattern: '\bop\s+gastenboek\b'
      discard_reason: "Guestbook navigation"
    - pattern: '^de\s+laar$'
      discard_reason: "Place name"
    - pattern: '^serie\s+droge\s+voeten$'
      discard_reason: "Publication series"
    - pattern: '^familie\s+(bindels|janssen)$'
      discard_reason: "Family section header"
    - pattern: '^hof\s+loil$'
      discard_reason: "Place name"
    - pattern: '^hoolten\s+klinte$'
      discard_reason: "Dialect place name"
    - pattern: '^verhildersum\s+to\s+go$'
      discard_reason: "Product name"
    - pattern: '^voerman\s+verwondert$'
      discard_reason: "Exhibition title"
    - pattern: '^heemskerker\s+ezels$'
      discard_reason: "Organization name"
    - pattern: '^jolly\s+duck$'
      discard_reason: "Venue name"
    - pattern: '^maria\s+kleuterschool$'
      discard_reason: "School name"
    - pattern: '^waterlandsmuseum\s+de\s+speeltoren$'
      discard_reason: "Museum name"
    - pattern: '^pelt\s+als\s+architect$'
      discard_reason: "Article title"
    - pattern: '^mierlo\s+puzzel$'
      discard_reason: "Puzzle name"
    - pattern: '^ozosnel\s+fandagen$'
      discard_reason: "Event name"
    - pattern: '^spijkerserve\s+\w+$'
      discard_reason: "Place name"
    - pattern: '^stalpers\s+opleidingen\b'
      discard_reason: "Training company"
    - pattern: '^taalbrigade\s+kids$'
      discard_reason: "Program name"
    - pattern: '^numaga\s+(excursies|jaarboek)$'
      discard_reason: "Numaga section"
    - pattern: '^meierijse\s+schoutsrekeningen$'
      discard_reason: "Historical records"
    - pattern: '^nieuwegein\s+lokaal$'
      discard_reason: "Local Nieuwegein"
    - pattern: '^nieuwjaarke\s+zingen$'
      discard_reason: "New Year singing"
    - pattern: '^nieuwveense\s+landen$'
      discard_reason: "Place name"
    - pattern: '^oijen\s+en\s+teeffelen$'
      discard_reason: "Place names"
    - pattern: '^molukse\s+(graven|muziek)$'
      discard_reason: "Moluccan section"
    - pattern: '^kruisen\s+en\s+\w+$'
      discard_reason: "Crosses section"
|
|
|
|
# =============================================================================
# RELATIONSHIP PREDICATES REFERENCE
# From: data/entity_annotation/modules/advanced/relationship_annotations.yaml
# =============================================================================
|
|
|
|
relationship_predicates:
|
|
organizational:
  # Predicates relating agents, groups and organizational units to groups,
  # sites and roles. `domain` lists the entity-type codes allowed as subject,
  # `range` the codes allowed as object; a trailing `.*` covers every subtype.
  - id: org:memberOf
    description: 'Entity is member of organization'
    domain: ['AGT.PER', 'GRP.*']
    range: ['GRP.*']

  - id: org:subOrganizationOf
    description: 'Organization is part of larger organization'
    domain: ['GRP.*']
    range: ['GRP.*']

  - id: org:unitOf
    description: 'Organizational unit is part of organization'
    domain: ['GRP.UNT']
    range: ['GRP.*']

  - id: org:hasSite
    description: 'Organization has location/building'
    domain: ['GRP.*']
    range: ['TOP.BLD', 'TOP.SET']

  - id: org:linkedTo
    description: 'Organization is linked to another'
    domain: ['GRP.*']
    range: ['GRP.*']

  - id: org:role
    description: 'Organization has role/position'
    domain: ['GRP.*']
    range: ['ROL.*']
|
|
|
|
spatial:
  # Predicates tying groups and buildings/features to places. `domain` is the
  # allowed subject type list, `range` the allowed object type list.
  - id: schema:location
    description: 'Entity is located at place'
    domain: ['GRP.*', 'TOP.BLD']
    range: ['TOP.SET', 'TOP.REG', 'TOP.ADR']

  - id: schema:areaServed
    description: 'Organization serves geographic area'
    domain: ['GRP.*']
    range: ['TOP.SET', 'TOP.REG']

  - id: crm:P53_has_former_or_current_location
    description: 'Heritage site location (temporal)'
    domain: ['TOP.BLD', 'TOP.FEA']
    range: ['TOP.SET']
|
|
|
|
creative:
  # Predicates connecting works (WRK.*) with the groups and persons that
  # publish, reference or are mentioned by them.
  - id: dcterms:publisher
    description: 'Work published by organization'
    domain: ['WRK.*']
    range: ['GRP.*']

  - id: dcterms:references
    description: 'Entity references work'
    domain: ['GRP.*', 'AGT.PER']
    range: ['WRK.*']

  - id: schema:mentions
    description: 'Entity mentions person/thing'
    domain: ['GRP.*', 'WRK.*']
    range: ['AGT.PER', 'GRP.*']
|
|
|
|
identity:
|
|
- id: owl:sameAs
|
|
description: "Entities are the same"
|
|
domain: ["*"]
|
|
range: ["*"]
|