# Dutch Web Patterns for Entity Annotation
# =============================================================================
# Converted from cleanup_contact_false_positives_v2.py
#
# Purpose: Define patterns for extracting typed entities from Dutch heritage
# website content with relationship predicates to the custodian being processed.
#
# Each pattern can have:
# - entity_type: CH-Annotator hypernym code (GRP.*, TOP.*, WRK.*, ROL.*, AGT.*, null)
# - capture_groups: Numbered regex groups (1, 2, ...) that capture sub-entities
# - relationships: Predicates connecting extracted entity to custodian or other entities
# - discard_reason: For patterns that identify non-entities (UI elements, etc.)
#
# Placeholders used in relationships: $0 = whole match, $N = capture group N,
# {N} = capture group N inside a template string.
#
# NOTE(review): patterns are written in lowercase while the examples are
# capitalised, so matching is presumably case-insensitive - confirm against the
# consumer (this also affects '^ChIJ', '^AWn5SU' and 'WordPress' below).
#
# Version: 1.1.0
# Date: 2025-12-13
# Source: scripts/cleanup_contact_false_positives_v2.py lines 28-1166
# =============================================================================
---
metadata:
  id: dutch_web_patterns_v1
  name: Dutch Web Content Entity Patterns
  version: "1.1.0"
  language: nl
  description: >-
    Patterns for extracting and classifying entities from Dutch heritage
    institution websites. Patterns are derived from false positive cleanup
    analysis of 168 custodian web archives. Version 1.1.0 adds layout_hints
    based on analysis of 1,525 annotated web archives showing XPath → entity
    type correlations.
  source_script: scripts/cleanup_contact_false_positives_v2.py
  ch_annotator_version: "1.7.0"
  pattern_count: 646

# Layout hints configuration based on analysis of 1,525 web archives
# These define which XPath locations are most predictive for each entity type
layout_hints:
  description: >-
    XPath location hints derived from analyzing 15,252 entity claims across
    1,343 unique websites. Patterns found at expected locations receive
    confidence boost.

  # High-confidence XPath → entity type mappings (>80% correlation)
  high_confidence_locations:
    GRP.HER:
      description: "Heritage institutions (museums, archives, libraries)"
      primary_xpaths:
        - "head/title"  # 41.8% of GRP.HER found here
        - "body/*/h1"  # Primary heading
        - "head/meta[@name='description']/@content"
      confidence_boost: 0.2
    GRP.ASS:
      description: "Associations and societies"
      primary_xpaths:
        - "head/title"  # 39.3% of GRP.ASS found here
        - "body/*/header/h1"
      confidence_boost: 0.15
    GRP.GOV:
      description: "Government bodies"
      primary_xpaths:
        - "head/title"
        - "body/*/h1"
        - "body/*/header"
      confidence_boost: 0.15
    TOP.ADR:
      description: "Addresses"
      primary_xpaths:
        - "body/footer/*"  # 23.8% of addresses in footer
        - "body/*/footer/*/p"
        - "body/*/p"  # Paragraphs
      confidence_boost: 0.2
    TMP.OPH:
      description: "Opening hours"
      primary_xpaths:
        - "body/*/footer"
        - "body/*/table"  # Often in tables
      confidence_boost: 0.15
    AGT.PER:
      description: "Person names"
      primary_xpaths:
        - "body/*/p"  # 36.4% in paragraphs
        - "body/*/ul/li"  # Staff lists
      confidence_boost: 0.1

  # Locations to deprioritize (often noise)
  low_confidence_locations:
    - "body/*/nav"  # Navigation (menu items, not entities)
    - "body/*/script"  # JavaScript
    - "body/*/style"  # CSS

  # Discard locations (always ignore content from these)
  discard_locations:
    - "head/script"  # JS in head
    - "body/*/noscript"  # Noscript fallbacks

# =============================================================================
# ENTITY PATTERNS - Patterns that identify extractable entities with types
# =============================================================================
entity_patterns:
  # ---------------------------------------------------------------------------
  # ORGANIZATION PATTERNS (GRP.*)
  # Heritage organizations, associations, societies, government bodies
  # ---------------------------------------------------------------------------
  organizations:
    description: "Patterns identifying heritage organizations and related bodies"

    heritage_associations:
      description: "Historical/heritage associations and societies"
      patterns:
        - pattern: '^historische\s+vereniging\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER  # Heritage association
          label_template: "Historische Vereniging {1}"
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
              description: "Settlement/place name"
          relationships:
            - predicate: schema:location
              subject: $0
              object: $1
              confidence: 0.9
          examples:
            - text: "Historische Vereniging Aalten"
              entity: "Historische Vereniging Aalten"
              captures:
                1: "Aalten"
        - pattern: '^heemkundige\s+kring\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          label_template: "Heemkundige Kring {1}"
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
          relationships:
            - predicate: schema:location
              subject: $0
              object: $1
          examples:
            - text: "Heemkundige Kring Halle"
              entity: "Heemkundige Kring Halle"
        - pattern: '^heemkunde\s*kring\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
        - pattern: '^heemkunde\s+werkgroep\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
        - pattern: '^historische\s+werkgroep\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
        - pattern: '^oudheidkundige?\s+(kring|vereniging)\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          capture_groups:
            1:
              type: null
              role: organization_type
            2:
              type: TOP.SET
              role: location_name
        - pattern: '^heemkundevereniging\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
        - pattern: '^(\w+)se?\s+(historische\s+)?(vereniging|kring|werkgroep|stichting|genootschap)$'
          entity_type: GRP.ASS
          description: "Geographic modifier + organization type"
          capture_groups:
            1:
              type: TOP.SET
              role: location_adjective
            3:
              type: null
              role: organization_type
          examples:
            - text: "Nijmeegse Historische Vereniging"
            - text: "Leidse Kring"
        - pattern: '^puttens\s+historisch\s+genootschap$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.HER
          relationships:
            - predicate: schema:location
              subject: $0
              object: "Putten"
              object_type: TOP.SET

    municipalities:
      description: "Dutch municipal governments"
      patterns:
        - pattern: '^gemeente\s+([\w-]+)$'
          entity_type: GRP.GOV
          entity_subtype: GRP.GOV.MUN  # Municipality
          label_template: "Gemeente {1}"
          capture_groups:
            1:
              type: TOP.SET
              role: municipality_name
          relationships:
            - predicate: org:subOrganizationOf
              subject: $0
              object: "https://www.wikidata.org/entity/Q29999"  # Kingdom of Netherlands
              object_type: GRP.GOV
              confidence: 1.0
          examples:
            - text: "Gemeente Borger-Odoorn"
              entity: "Gemeente Borger-Odoorn"
              captures:
                1: "Borger-Odoorn"
        - pattern: '^gemeentehuis\s+([\w-]+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.GOV  # Government building
          label_template: "Gemeentehuis {1}"
          capture_groups:
            1:
              type: TOP.SET
              role: municipality_name
          relationships:
            - predicate: org:hasSite
              subject: "Gemeente {1}"
              subject_type: GRP.GOV
              object: $0
        - pattern: '^gemeente\s+archieven$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.ARC
          relationships:
            - predicate: org:subOrganizationOf
              subject: $0
              object: CUSTODIAN

    heritage_institutions:
      description: "Museums, archives, libraries"
      patterns:
        # Group 2 requires at least one character, so a bare "De Bibliotheek"
        # or "Het Museum" is intentionally NOT matched by this pattern.
        - pattern: '^(het|de)\s+(\w+)\s*(museum|archief|bibliotheek)$'
          entity_type: GRP.HER
          capture_groups:
            2:
              type: APP.NAM
              role: institution_name
            3:
              type: null
              role: institution_type_keyword
          relationships:
            - predicate: org:linkedTo
              subject: $0
              object: CUSTODIAN
              confidence: 0.7
          examples:
            - text: "Het Rijksmuseum"
            - text: "De Stadsbibliotheek"
        # Requires "museum" as a separate third word; compounds such as
        # "Graanmuseum" are handled by the scope-modifier pattern below.
        - pattern: '^(\w+)\s+(\w+)\s+museum$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.MUS
          description: "Two-word museum names"
          capture_groups:
            1:
              type: APP.NAM
              role: name_part_1
            2:
              type: APP.NAM
              role: name_part_2
          examples:
            - text: "Pieter Vermeulen Museum"
        - pattern: '^(nationaal|nederlands|nederlandse|oudheidkundig|virtueel)\s+(\w*)(museum|archief)$'
          entity_type: GRP.HER
          capture_groups:
            1:
              type: null
              role: scope_modifier
            2:
              type: APP.NAM
              role: subject_area
            3:
              type: null
              role: institution_type_keyword
          examples:
            - text: "Nationaal Glasmuseum"
            - text: "Nederlands Openluchtmuseum"
            - text: "Nederlands Graanmuseum"
        - pattern: '^regionaal\s+archief\s+(\w+)$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.ARC
          capture_groups:
            1:
              type: TOP.REG
              role: region_name
        - pattern: '^stadsarchief\s+(\w+)$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.ARC
          capture_groups:
            1:
              type: TOP.SET
              role: city_name
        - pattern: '^gemeentearchief\s+(\w+)$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.ARC
          capture_groups:
            1:
              type: TOP.SET
              role: municipality_name
        - pattern: '^streekmuseum\s+(.+)$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.MUS
          label_template: "Streekmuseum {1}"
          capture_groups:
            1:
              type: APP.NAM
              role: eponymous_name
          relationships:
            - predicate: org:linkedTo
              subject: $0
              object: CUSTODIAN
              confidence: 0.8
          examples:
            - text: "Streekmuseum Jan Anderson"
        - pattern: '^streekhistorisch\s+centrum\s+(\w+)$'
          entity_type: GRP.HER
          capture_groups:
            1:
              type: TOP.SET
              role: region_name

    provincial_heritage:
      description: "Provincial heritage organizations"
      patterns:
        - pattern: '^erfgoed\s+(brabant|gelderland|zeeland|limburg|utrecht|friesland|drenthe|overijssel|flevoland|groningen)$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.OFF  # Official heritage organization
          capture_groups:
            1:
              type: TOP.REG
              role: province_name
          relationships:
            - predicate: schema:areaServed
              subject: $0
              object: $1
              object_type: TOP.REG
        - pattern: '^gelderse\s+kerken$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.REL  # Religious association
          relationships:
            - predicate: schema:areaServed
              subject: $0
              object: "Gelderland"
              object_type: TOP.REG
        - pattern: '^groninger\s+waddenmusea$'
          entity_type: GRP.ASS
          relationships:
            - predicate: schema:areaServed
              subject: $0
              object: "Groningen"
              object_type: TOP.REG
        - pattern: '^flevolands\s+geheugen$'
          entity_type: GRP.HER
          entity_subtype: GRP.HER.DIG  # Digital heritage platform
        - pattern: '^fryske\s+akademy$'
          entity_type: GRP.RES
          entity_subtype: GRP.RES.ACA  # Academic research

    organizational_units:
      description: "Internal organizational units and governance bodies"
      patterns:
        - pattern: '^raad\s+van\s+toezicht$'
          entity_type: GRP.UNT
          entity_subtype: GRP.UNT.GOV  # Governance unit
          relationships:
            - predicate: org:unitOf
              subject: $0
              object: CUSTODIAN
              confidence: 0.95
          examples:
            - text: "Raad van Toezicht"
        - pattern: '^het\s+bestuur$'
          entity_type: GRP.UNT
          entity_subtype: GRP.UNT.GOV
          relationships:
            - predicate: org:unitOf
              subject: $0
              object: CUSTODIAN
        - pattern: '^de\s+stichting$'
          entity_type: GRP.ORG
          entity_subtype: GRP.ORG.FND  # Foundation
          relationships:
            - predicate: owl:sameAs
              subject: $0
              object: CUSTODIAN
              confidence: 0.8
        - pattern: '^de\s+vereniging$'
          entity_type: GRP.ASS
          relationships:
            - predicate: owl:sameAs
              subject: $0
              object: CUSTODIAN
              confidence: 0.8
        - pattern: '^management\s+team$'
          entity_type: GRP.UNT
          relationships:
            - predicate: org:unitOf
              subject: $0
              object: CUSTODIAN

    government_bodies:
      description: "Government bodies and positions"
      patterns:
        - pattern: '^gedeputeerde\s+staten$'
          entity_type: GRP.GOV
          entity_subtype: GRP.GOV.PRO  # Provincial government
        - pattern: '^provinciale\s+staten$'
          entity_type: GRP.GOV
          entity_subtype: GRP.GOV.PRO
        - pattern: '^burgemeester\s+en\s+wethouders$'
          entity_type: GRP.GOV
          entity_subtype: GRP.GOV.MUN
        - pattern: '^commissaris\s+van\s+de\s+koning$'
          entity_type: ROL.POS
          entity_subtype: ROL.POS.GOV  # Government position
        - pattern: '^raad\s+van\s+state$'
          entity_type: GRP.GOV
          entity_subtype: GRP.GOV.NAT  # National government
        - pattern: '^nationale\s+ombudsman$'
          entity_type: ROL.POS
          entity_subtype: ROL.POS.GOV

    businesses:
      description: "Commercial entities"
      patterns:
        - pattern: '^(\w+)er\s+handelsvereniging$'
          entity_type: GRP.COR
          entity_subtype: GRP.COR.ASS  # Trade association
          capture_groups:
            1:
              type: TOP.SET
              role: place_adjective
          examples:
            - text: "Meppeler Handelsvereniging"
        - pattern: '^bouwbedrijf\s+(\w+)$'
          entity_type: GRP.COR
          capture_groups:
            1:
              type: APP.NAM
              role: company_name
        - pattern: '^rabobank\s+(\w+)$'
          entity_type: GRP.COR
          entity_subtype: GRP.COR.BNK  # Bank
          capture_groups:
            1:
              type: TOP.SET
              role: branch_location

    cultural_organizations:
      description: "Cultural and arts organizations"
      patterns:
        - pattern: '^dansstudio\s+(\w+)$'
          entity_type: GRP.CUL
          capture_groups:
            1:
              type: APP.NAM
              role: studio_name
        - pattern: '^toneelvereniging\s+(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.CUL  # Cultural association
          capture_groups:
            1:
              type: APP.NAM
              role: association_name
        - pattern: '^schutterij\s+(de\s+)?(\w+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.TRD  # Traditional association
          capture_groups:
            2:
              type: APP.NAM
              role: guild_name
        - pattern: '^schuttersgilde\s+([\w-]+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.TRD
          capture_groups:
            1:
              type: APP.NAM
              role: guild_name
        - pattern: '^schuttersvereniging\s+([\w-]+)$'
          entity_type: GRP.ASS
          entity_subtype: GRP.ASS.TRD
          capture_groups:
            1:
              type: APP.NAM
              role: association_name

  # ---------------------------------------------------------------------------
  # BUILDING/PLACE PATTERNS (TOP.*)
  # Physical structures, estates, monuments, religious buildings
  # ---------------------------------------------------------------------------
  buildings_places:
    description: "Patterns identifying physical locations and structures"

    castles_estates:
      description: "Castles, estates, and manor houses"
      patterns:
        # Single-word castle names only; multi-word names need a dedicated
        # pattern such as the "oud haarlem" one that follows.
        - pattern: '^kasteel\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.CAS  # Castle
          label_template: "Kasteel {1}"
          capture_groups:
            1:
              type: APP.NAM
              role: castle_name
          relationships:
            - predicate: org:hasSite
              subject: CUSTODIAN
              object: $0
              confidence: 0.7
        - pattern: '^kasteel\s+oud\s+haarlem$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.CAS
          examples:
            - text: "Kasteel Oud Haarlem"
        - pattern: '^landgoed\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.EST  # Estate
          capture_groups:
            1:
              type: APP.NAM
              role: estate_name
          relationships:
            - predicate: org:hasSite
              subject: CUSTODIAN
              object: $0
              confidence: 0.6
        - pattern: '^landgoed\s+(borg|de)\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.EST
          capture_groups:
            2:
              type: APP.NAM
              role: estate_name
        - pattern: '^huize\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.MAN  # Manor house
          capture_groups:
            1:
              type: APP.NAM
              role: house_name
        - pattern: '^huis\s+(ten|van|de)\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.MAN
          capture_groups:
            2:
              type: APP.NAM
              role: house_name
          examples:
            - text: "Huis ten Bosch"
            - text: "Huis van Oud"
        - pattern: '^hoeve\s+(de\s+)?(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.FRM  # Farm/farmhouse
          capture_groups:
            2:
              type: APP.NAM
              role: farm_name
        - pattern: '^herberg\s+(de\s+)?(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.INN  # Historic inn
          capture_groups:
            2:
              type: APP.NAM
              role: inn_name

    fortifications:
      description: "Forts, bunkers, defensive structures"
      patterns:
        - pattern: '^fort\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.FOR  # Fortification
          capture_groups:
            1:
              type: APP.NAM
              role: fort_name
          relationships:
            - predicate: crm:P53_has_former_or_current_location
              subject: $0
              object: CUSTODIAN_LOCATION
        - pattern: '^de\s+atlantikwall$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.FOR
        - pattern: '^kamp\s+vught$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.HIS  # Historic site
          relationships:
            - predicate: schema:location
              subject: $0
              object: "Vught"
              object_type: TOP.SET

    religious_buildings:
      description: "Churches, chapels, monasteries"
      patterns:
        - pattern: '^sint\s+(\w+)(kerk|gebouw)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.REL  # Religious building
          capture_groups:
            1:
              type: APP.NAM
              role: saint_name
            2:
              type: null
              role: building_type
        # The location is optional: the separator is inside the optional
        # group so that a bare "protestantse kerk" matches too; the trailing
        # \s* keeps accepting input with trailing whitespace.
        - pattern: '^protestantse\s+(kerk|pastorie)(?:\s+(\w+))?\s*$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.REL
          capture_groups:
            1:
              type: null
              role: building_type
            2:
              type: TOP.SET
              role: location_name
        - pattern: '^kapel\s+van\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.REL
          capture_groups:
            1:
              type: APP.NAM
              role: dedication
        # [\w-]+ so hyphenated place names (see the example) are accepted.
        - pattern: '^mariakapel\s+([\w-]+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.REL
          capture_groups:
            1:
              type: TOP.SET
              role: location_name
          examples:
            - text: "Mariakapel Nieuw-Dijk"

    monuments:
      description: "Monuments, memorials, historic markers"
      patterns:
        - pattern: '^monument(en)?\s+(in|didam|loil|nieuw-dijk|oud-dijk|buurtschap)\b'
          entity_type: TOP.FEA
          entity_subtype: TOP.FEA.MON  # Monument
        - pattern: '^grafheuvel\s+(\w+)$'
          entity_type: TOP.FEA
          entity_subtype: TOP.FEA.ARC  # Archaeological feature
          capture_groups:
            1:
              type: APP.NAM
              role: feature_name
        - pattern: '^nationaal\s+monument$'
          entity_type: TOP.FEA
          entity_subtype: TOP.FEA.MON
        - pattern: '^kruisbeeld\s+op\s+(\w+)$'
          entity_type: TOP.FEA
          entity_subtype: TOP.FEA.REL  # Religious monument
          capture_groups:
            1:
              type: TOP.SET
              role: location

    cultural_venues:
      description: "Theaters, community centers, museums"
      patterns:
        - pattern: '^theater\s+(de|het)\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.CUL  # Cultural building
          capture_groups:
            2:
              type: APP.NAM
              role: theater_name
        - pattern: '^buurthuis\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.COM  # Community building
          capture_groups:
            1:
              type: APP.NAM
              role: building_name
        - pattern: '^poppodium\s+(de\s+)?(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.CUL
          capture_groups:
            2:
              type: APP.NAM
              role: venue_name
          examples:
            - text: "Poppodium de Peppel"
        - pattern: '^aula\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.EDU  # Educational building
          capture_groups:
            1:
              type: APP.NAM
              role: building_name

    industrial_heritage:
      description: "Mills, factories, industrial sites"
      patterns:
        - pattern: '^kalkoven\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.IND  # Industrial building
          capture_groups:
            1:
              type: APP.NAM
              role: site_name
        - pattern: '^scheepswerf\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.IND
          capture_groups:
            1:
              type: APP.NAM
              role: shipyard_name
        - pattern: '^werkplaats\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.IND
          capture_groups:
            1:
              type: APP.NAM
              role: workshop_name

    parks_gardens:
      description: "Parks, gardens, nature reserves"
      patterns:
        # Garden name is optional; separator lives inside the optional group
        # so a bare "botanische tuin" matches as well.
        - pattern: '^botanische\s+tuin(?:\s+(\w+))?\s*$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.GAR  # Garden
          capture_groups:
            1:
              type: APP.NAM
              role: garden_name
        - pattern: '^pinetum\s+(\w+)$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.GAR
          capture_groups:
            1:
              type: APP.NAM
              role: arboretum_name
        - pattern: '^landschapspark\s+(\w+)$'
          entity_type: TOP.GEO
          entity_subtype: TOP.GEO.PRK  # Park
          capture_groups:
            1:
              type: APP.NAM
              role: park_name

    places_named:
      description: "Named places and locations"
      patterns:
        - pattern: '^dekema\s+state$'
          entity_type: TOP.BLD
          entity_subtype: TOP.BLD.EST
        - pattern: '^klein\s+(amerika|rome|zundert)$'
          entity_type: TOP.SET
          entity_subtype: TOP.SET.HAM  # Hamlet/small settlement
          capture_groups:
            1:
              type: APP.NAM
              role: place_reference

  # ---------------------------------------------------------------------------
  # PUBLICATION/WORK PATTERNS (WRK.*)
  # Publications, periodicals, books, reports
  # ---------------------------------------------------------------------------
  publications:
    description: "Patterns identifying publications and works"

    periodicals:
      description: "Magazines, newsletters, journals"
      patterns:
        # Single-word publication names only; "Aover Diem" (two words) is
        # covered by the dedicated pattern that follows.
        - pattern: '^jaarboek(en)?\s+(\w+)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER  # Serial publication
          label_template: "Jaarboek {2}"
          capture_groups:
            2:
              type: APP.NAM
              role: publication_name
          relationships:
            - predicate: dcterms:publisher
              subject: $0
              object: CUSTODIAN
              confidence: 0.85
        - pattern: '^jaarboek(en)?\s+aover\s+diem$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
          relationships:
            - predicate: dcterms:publisher
              subject: $0
              object: CUSTODIAN
          examples:
            - text: "Jaarboeken Aover Diem"
        - pattern: '^verenigingsblad\s+(\w+)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
          capture_groups:
            1:
              type: APP.NAM
              role: publication_name
          relationships:
            - predicate: dcterms:publisher
              subject: $0
              object: CUSTODIAN
        - pattern: '^verenigingsorgaan\s+(de\s+)?(\w+)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
          capture_groups:
            2:
              type: APP.NAM
              role: publication_name
        - pattern: '^myerlese\s+koerier$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^nijmeegs\s+katern$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^old\s+ni-js(\s+edities)?$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^roggels\s+blaadje$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^suetan\s+kwartaalbladen$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^tusken\s+de\s+marren$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^verleden\s+tijdschrift$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
        - pattern: '^dedemsvaartse\s+courant$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER

    book_series:
      description: "Book series and monographs"
      patterns:
        - pattern: '^reeuwijkse\s+(bronnen|reeks)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.SER
          capture_groups:
            1:
              type: null
              role: series_type
        - pattern: '^uitgaven\s+(\w+)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.COL  # Collection
          capture_groups:
            1:
              type: APP.NAM
              role: publisher_name
        - pattern: '^publicaties\s+(\w+)$'
          entity_type: WRK.MAN
          entity_subtype: WRK.MAN.COL
          capture_groups:
            1:
              type: APP.NAM
              role: collection_name

  # ---------------------------------------------------------------------------
  # ROLE/OCCUPATION PATTERNS (ROL.*)
  # Job titles, positions, functions
  # ---------------------------------------------------------------------------
  roles:
    description: "Patterns identifying roles, positions, and occupations"

    job_titles:
      description: "Professional job titles"
      patterns:
        - pattern: '^senior\s+(applicatiebeheerder|systeembeheerder)$'
          entity_type: ROL.OCC
          entity_subtype: ROL.OCC.TEC  # Technical occupation
          capture_groups:
            1:
              type: null
              role: job_specialty
          relationships:
            - predicate: org:role
              subject: CUSTODIAN
              object: $0
          examples:
            - text: "Senior Applicatiebeheerder"
        - pattern: '^financial\s+controller$'
          entity_type: ROL.OCC
          entity_subtype: ROL.OCC.ADM  # Administrative occupation

  # ---------------------------------------------------------------------------
  # PERSON PATTERNS (AGT.*)
  # Historical figures, references to people
  # ---------------------------------------------------------------------------
  persons:
    description: "Patterns identifying references to persons (not contact persons)"

    historical_figures:
      description: "Famous historical figures mentioned in content"
      patterns:
        - pattern: '^vincent\s+van\s+gogh$'
          entity_type: AGT.PER
          entity_subtype: AGT.PER.ART  # Artist
          relationships:
            - predicate: schema:mentions
              subject: CUSTODIAN
              object: $0
              description: "Custodian mentions this historical figure"
          wikidata_id: Q5582
        - pattern: '^rembrandt(\s+van\s+rijn)?$'
          entity_type: AGT.PER
          entity_subtype: AGT.PER.ART
          wikidata_id: Q5598
        - pattern: '^johannes\s+vermeer$'
          entity_type: AGT.PER
          entity_subtype: AGT.PER.ART
          wikidata_id: Q41264
        - pattern: '^vermeer$'
          entity_type: AGT.PER
          entity_subtype: AGT.PER.ART
          wikidata_id: Q41264

# =============================================================================
# DISCARD PATTERNS - Patterns that identify non-entities to be filtered out
# =============================================================================
discard_patterns:
  description: >-
    Patterns matching text that should NOT be extracted as entities. These are
    UI elements, navigation text, form labels, etc.

  urls_technical:
    description: "URLs and technical strings"
    patterns:
      - pattern: '^https?://'
        discard_reason: "URL - not an entity name"
      - pattern: '^www\.'
        discard_reason: "URL fragment"
      - pattern: '\.html$'
        discard_reason: "File extension"
      - pattern: '\.php$'
        discard_reason: "File extension"
      - pattern: '\.aspx?$'
        discard_reason: "File extension"
      - pattern: '/photos/'
        discard_reason: "URL path segment"
      - pattern: '/places/'
        discard_reason: "URL path segment"
      - pattern: '^ChIJ'
        discard_reason: "Google Place ID"
      - pattern: '^AWn5SU'
        discard_reason: "Google photo ID"
      - pattern: 'WordPress'
        discard_reason: "CMS name"

  navigation:
    description: "Website navigation elements"
    patterns:
      - pattern: '^menu\s+schakelen$'
        discard_reason: "Navigation toggle"
      - pattern: '^go\s+to\s+top$'
        discard_reason: "Navigation link"
      - pattern: '^page\s+load\s+link$'
        discard_reason: "Navigation element"
      - pattern: '^skip\s+to\b'
        discard_reason: "Accessibility navigation"
      - pattern: '^jump\s+to\b'
        discard_reason: "Navigation link"
      - pattern: '^ga\s+naar\b'
        discard_reason: "Dutch navigation"
      - pattern: '^terug\s+naar\b'
        discard_reason: "Dutch navigation (back to)"
      - pattern: '^naar\s+(de|het|inhoud|menu)\b'
        discard_reason: "Dutch navigation"
      - pattern: '^back\s+to\b'
        discard_reason: "Navigation link"
      - pattern: '^footer\s+navigatie$'
        discard_reason: "Footer navigation"
      - pattern: '^hoofd\s*navigatie$'
        discard_reason: "Main navigation"
      - pattern: '^volg\s+ons\b'
        discard_reason: "Social media CTA"
      - pattern: '^follow\s+(us|this)\b'
        discard_reason: "Social media CTA"
      - pattern: '^menu\s+overslaan$'
        discard_reason: "Skip menu"
      - pattern: '^scroll\s+naar\b'
        discard_reason: "Scroll instruction"

  form_buttons:
    description: "Form labels and button text"
    patterns:
      - pattern: '^typ\s+hier\b'
        discard_reason: "Form placeholder"
      - pattern: '^vul\s+in\b'
        discard_reason: "Form instruction"
      - pattern: '^selecteer\b'
        discard_reason: "Form instruction"
      - pattern: '^kies\s+'
        discard_reason: "Form instruction"
      - pattern: '^zoek(en)?(\s+in)?$'
        discard_reason: "Search button"
      - pattern: '^aanmeld(en|ing)$'
        discard_reason: "Registration button"
      - pattern: '^afmeld(en|ing)$'
        discard_reason: "Unsubscribe button"
      - pattern: '^reserv(eren|ering)$'
        discard_reason: "Reservation button"
      - pattern: '^verzend(en)?$'
        discard_reason: "Submit button"
      - pattern: '^accepteer\b'
        discard_reason: "Accept button"
      - pattern: '^afwijzen\b'
        discard_reason: "Reject button"
      - pattern: '^akkoord$'
        discard_reason: "OK button"
      - pattern: '^instellingen\s+opslaan$'
        discard_reason: "Save settings button"

  section_headers:
    description: "Generic section headers"
    patterns:
      - pattern: '^laatste\s+nieuws$'
        discard_reason: "Section header"
      - pattern: '^over\s+(ons|deze)$'
        discard_reason: "About section"
      - pattern: '^missie\s+en\s+visie$'
        discard_reason: "Mission/vision section"
      - pattern: '^contact$'
        discard_reason: "Contact section"
      - pattern: '^contactgegevens$'
        discard_reason: "Contact details section"
      - pattern: '^bereikbaarheid$'
        discard_reason: "Directions section"
      - pattern: '^openingstijden$'
        discard_reason: "Opening hours section"
      - pattern: '^parkeren$'
        discard_reason: "Parking section"
      - pattern: '^bezoekadres$'
        discard_reason: "Visitor address section"
      - pattern: '^postadres$'
        discard_reason: "Postal address section"
      - pattern: '^privacybeleid$'
        discard_reason: "Privacy policy"
      - pattern: '^disclaimer$'
        discard_reason: "Disclaimer section"
      - pattern: '^colofon$'
        discard_reason: "Colophon section"
      - pattern: '^sitemap$'
        discard_reason: "Sitemap"
      - pattern: '^veelgestelde\s+vragen$'
        discard_reason: "FAQ section"

  website_content:
    description: "Common website content phrases"
    patterns:
      - pattern: '^lees\s+meer\b'
        discard_reason: "Read more link"
      - pattern: '^bekijk\s+(de|het|alle|meer)\b'
        discard_reason: "View more link"
      - pattern: '^download\s+(de|het)\b'
        discard_reason: "Download link"
      - pattern: '^meer\s+info(rmatie)?$'
        discard_reason: "More info link"
      - pattern: '^handige\s+(info|links)$'
        discard_reason: "Useful links section"
      - pattern: '^gratis\s+(toegang|qr)\b'
        discard_reason: "Free access notice"

  # Literal whole-string values (not regexes), unlike the `patterns` lists.
  single_words:
    description: "Single-word false positives"
    values:
      - admin
      - contact
      - home
      - menu
      - zoeken
      - search
      - login
      - inloggen
      - registreren
      - help
      - info
      - nieuws
      - agenda
      - kalender
      - archief
      - collectie
      - beeldbank
      - bronnen
      - links
      - partners
      - sponsors
      - doneren
      - lidmaatschap
      - privacy
      - disclaimer
      - sitemap
      - colofon
      - cookies
      - vacatures

  # ---------------------------------------------------------------------------
  # ADDITIONAL PHRASE PATTERNS - Action verbs, instructions, content phrases
  # From PHRASE_PATTERNS lines 167-1144
  # ---------------------------------------------------------------------------
  action_instructions:
    description: "Call-to-action and instruction phrases"
    patterns:
      - pattern: '^meld\s+(je|u|een)\b'
        discard_reason: "Registration CTA"
      - pattern: '^geef\s+(je|uw)\b'
        discard_reason: "Form instruction"
      - pattern: '^word[t]?\s+(lid|vriend|abonnee)$'
        discard_reason: "Membership CTA"
      - pattern: '^steun\s+(het|de|ons)\b'
        discard_reason: "Donation CTA"
      - pattern: '^huur\s+(een|het|de|eigendom)\b'
        discard_reason: "Rental CTA"
      - pattern: '^schrijf\s+(je|ons)$'
        discard_reason: "Subscription CTA"
      - pattern: '^stuur\s+(een|foto)$'
        discard_reason: "Submission CTA"
      - pattern: '^reserveer\s+een\b'
        discard_reason: "Reservation CTA"
      - pattern: '^plan\s+uw\b'
        discard_reason: "Planning CTA"
      - pattern: '^kom\s+(in|verder)$'
        discard_reason: "Invitation CTA"
      - pattern: '^klik\s+voor\b'
        discard_reason: "Click instruction"
      - pattern: '^ontdek\s+(de|jouw|ons)$'
        discard_reason: "Discovery CTA"
      - pattern: '^vind\s+ons$'
        discard_reason: "Find us CTA"
      - pattern: '^verstuur\s+bericht$'
        discard_reason: "Send message CTA"
      - pattern: '^profiel\s+wijzigen$'
        discard_reason: "Edit profile CTA"
      - pattern: '^scans\s+aanvragen$'
        discard_reason: "Request scans CTA"
      - pattern: '^vraag\s+(en|of|stellen|afvalpas)$'
        discard_reason: "Question/request CTA"
      - pattern: '^vragen\s+(en|staat|over)\b'
        discard_reason: "Questions section"

  membership_sections:
    description: "Membership and subscription sections"
    patterns:
      - pattern: '^leden\s+(administratie|en\s+lidmaatschap)$'
        discard_reason: "Membership admin section"
      - pattern: '^lid\s+(worden|worden\s+inloggen)$'
        discard_reason: "Join membership CTA"
      - pattern: '^lidmaatschap\s+\w+$'
        discard_reason: "Membership section"
      - pattern: '^soort\s+lidmaatschap$'
        discard_reason: "Membership type section"
      - pattern: '^jaarlijkse\s+bijdrage$'
        discard_reason: "Annual contribution section"

  water_management:
    description: "Water board and environmental management content"
    patterns:
      - pattern: '^(legger|peilbesluit|proefsluiting|vervanging|vernieuwen|onderhoud|metingen|bediening)\s+'
        discard_reason: "Water board operations"
      - pattern: '^waterschapsbelasting\b'
        discard_reason: "Water board tax"
      - pattern: '^ons\s+gebied\b'
        discard_reason: "Our area section"
      - pattern: '^handhavingsverzoek\b'
        discard_reason: "Enforcement request"
      - pattern: '^waterbeheer\s+en\b'
        discard_reason: "Water management section"
      - pattern: '^waterkwaliteit\s+\w+$'
        discard_reason: "Water quality section"
      - pattern: '^waterschap\s+\w+$'
        discard_reason: "Water board name"
      - pattern: '^waterschapsverordening\s+en\b'
        discard_reason: "Water board regulation"
      - pattern: '^waterpeil\s+en\b'
        discard_reason: "Water level section"
      - pattern: '^natuur\s+en\s+waterkwaliteit$'
        discard_reason: "Nature and water quality"
      - pattern: '^recreatie\s+rondom\s+water$'
        discard_reason: "Recreation around water"
      - pattern: '^landbouw\s+en\s+water(kwaliteit)?$'
        discard_reason: "Agriculture and water"
      - pattern: '^klimaat\s+en\s+veiligheid$'
        discard_reason: "Climate and safety"
      - pattern: '^kaderrichtlijn\s+water$'
        discard_reason: "Water framework directive"
      - pattern: '^meten\s+van\s+de\s+waterkwaliteit$'
        discard_reason: "Water quality measurement"
      - pattern: '^voldoende\s+water$'
        discard_reason: "Sufficient water section"
      - pattern: '^natuurvriendelijke\s+oever$'
        discard_reason: "Natural riverbank"
      - pattern: '^oevers\s+\w+$'
        discard_reason: "Riverbanks section"

  heritage_content:
    description: "Heritage and historical content phrases"
    patterns:
      - pattern: '^historie\s+(van\s+)?\w+$'
        discard_reason: "History section"
      - pattern: '^historisch(e)?\s+(coevorden|spektakel|avond|fietsroute|geografie|groenten|projecten|wandeling)\b'
        discard_reason: "Historical content section"
      - pattern: '^gevelstenen\s+in\b'
        discard_reason: "Facade stones section"
      - pattern: '^grafvondst\s+bij\b'
        discard_reason: "Grave find section"
      - pattern: '^erfgoedcollecties\s+van\b'
        discard_reason: "Heritage collections section"
      - pattern: '^vondsten\s+in\b'
        discard_reason: "Finds section"
      - pattern: '^lokale\s+vondsten$'
        discard_reason: "Local finds section"
      - pattern: '^tijdlijn\s+vondsten$'
        discard_reason: "Finds timeline"
      - pattern: '^opgraving\s+\w+$'
        discard_reason: "Excavation section"
      - pattern: '^militaire\s+historie$'
        discard_reason: "Military history section"
      - pattern: '^genealogische\s+begrippen$'
        discard_reason: "Genealogical terms"
      - pattern: '^notariele\s+archieven$'
        discard_reason: "Notarial archives section"
      - pattern: '^voorouders\s+op\b'
        discard_reason: "Ancestors section"
      - pattern: '^larense\s+voorouders$'
        discard_reason: "Laren ancestors section"
      - pattern: '^personenbestand\s+\w+$'
        discard_reason: "Person database section"
      - pattern: '^namenlijst\s+\w+$'
        discard_reason: "Name list section"

  tours_visits:
    description: "Tour and visit related content"
    patterns:
      - pattern: '^rondleiding\s+\w+$'
        discard_reason: "Tour section"
      - pattern: '^rondleidingen\s+en\b'
        discard_reason: "Tours section"
      - pattern: '^rondwandeling\s+door\b'
        discard_reason: "Walking tour"
      - pattern: '^virtuele\s+tour$'
        discard_reason: "Virtual tour"
      - pattern: '^groepen\s+aanmelden$'
        discard_reason: "Group registration"
      - pattern: '^programma\s+voor\s+groepen$'
        discard_reason: "Group program"
      - pattern: '^wensen\s+rondleiding$'
        discard_reason: "Tour wishes"
      - pattern: '^landgoedrondleiding\b'
        discard_reason: "Estate tour"

  location_directions:
    description: "Location and directions content"
    patterns:
      - pattern: '^locatie\s+\w+$'
        discard_reason: "Location section"
      - pattern: '^locaties\s+\w+$'
        discard_reason: "Locations section"
      - pattern: '^overige\s+locaties$'
        discard_reason: "Other locations"
      - pattern: '^vestiging\s+\w+$'
        discard_reason: "Branch location"
      - pattern: '^route\s+(en|per)$'
        discard_reason: "Route section"
      - pattern: '^per\s+(auto|boot)$'
        discard_reason: "By car/boat directions"
      - pattern: '^met\s+het\s+ov$'
        discard_reason: "Public transport"
      - pattern: '^naar\s+(google\s+maps|bestuurspagina|boven\s+scrollen|veelgestelde\s+vragen)$'
        discard_reason: "Navigation link"
      - pattern: '^vanuit\s+\w+$'
        discard_reason: "From location"

  time_schedule:
    description: "Time and schedule related content"
    patterns:
      - pattern: '^dag\s+tijden$'
        discard_reason: "Day times"
      - pattern: '^\w+dag\s+gesloten$'
        discard_reason: "Day closed"
      - pattern: '^goede\s+vrijdag$'
        discard_reason: "Good Friday"
      - pattern: '^tweede\s+(paasdag|pinksterdag)$'
        discard_reason: "Holiday name"
      - pattern: '^vandaag\s+gesloten$'
        discard_reason: "Closed today"
      - pattern: '^morgen\s+gesloten$'
        discard_reason: "Closed tomorrow"
      - pattern: '^ook\s+(aanwezig|gesloten)$'
        discard_reason: "Also present/closed"
      - pattern: '^gesloten\s+op\b'
        discard_reason: "Closed on"

  events_activities:
    description: "Events and activities content"
    patterns:
      - pattern: '^lezingen\s+en\s+\w+$'
        discard_reason: "Lectures section"
      - pattern: '^thema\s+avonden$'
        discard_reason: "Theme evenings"
      - pattern: '^komende\s+activiteiten$'
        discard_reason: "Upcoming activities"
      - pattern: '^cursus\s+\w+$'
        discard_reason: "Course section"
      - pattern: '^wandel\s+en\b'
        discard_reason: "Walking section"
      - pattern: '^wandelapp\s+\w+$'
        discard_reason: "Walking app"
      - pattern: '^wandelen\s+en\s+fietsen$'
        discard_reason: "Walking and cycling"
      - pattern: '^wandelkaart\s+\w+$'
        discard_reason: "Walking map"
      - pattern: '^struinpad\s+wandelingen$'
        discard_reason: "Trail walks"
      - pattern: '^trouwen\s+in\b'
        discard_reason: "Weddings section"
      - pattern: '^zakelijke\s+bijeenkomsten$'
        discard_reason: "Business meetings"
      - pattern: '^jubileum\s+fietsroute$'
        discard_reason: "Anniversary bike route"

  online_services:
    description: "Online services and digital content"
    patterns:
      - pattern: '^online\s+(afspraak|betalen|doneren|exposities|platform|reserveren|vraag)$'
        discard_reason: "Online service"
      - pattern: '^website\s+(beheer|gemeenteraad)$'
        discard_reason: "Website section"
      - pattern: '^webdesign\s+bureau\b'
        discard_reason: "Web design"
      - pattern: '^google\s+maps$'
        discard_reason: "Google Maps reference"
      - pattern: '^gebruik\s+google\s+maps$'
        discard_reason: "Use Google Maps"
      - pattern: '^flickr\s+fotoalbum$'
        discard_reason: "Flickr photo album"

  about_sections:
    description: "About and overview sections"
    patterns:
      - pattern: '^over\s+(batavialand|bergh|haaksbergen|heemskerk|lkca|laren|museumpark|numaga|nuwelant|rijnland|roosendaal|rozet|ruurd|onze\s+website)$'
        discard_reason: "About section"
      - pattern: '^over\s+(de|het|dekema)\s+'
        discard_reason: "About section"
      - pattern: '^ons\s+(adres|bestuur|huisblad|kantoor|team|werkgebied)$'
        discard_reason: "Our [X] section"
      - pattern: '^wat\s+(doen|doet|we|wij)$'
        discard_reason: "What we do section"
      - pattern: '^wie\s+(we|wij)\s+zijn$'
        discard_reason: "Who we are section"
      - pattern: '^visie\s+en\s+missie$'
        discard_reason: "Vision and mission"
      - pattern: '^missie\s+en\s+doelen$'
        discard_reason: "Mission and goals"
      - pattern: '^ontstaan\s+\w+$'
        discard_reason: "Origin section"

  overview_sections:
    description: "Overview and list sections"
    patterns:
      - pattern: '^overzicht\s+(rijksmonumenten|skriemers|archeologische|bouwlocaties|exposities|formulieren|tijdschriften)$'
        discard_reason: "Overview section"
      - pattern: '^overige\s+(uitgaven|documenten|locaties|organisaties|vrijwilligers)$'
        discard_reason: "Other [X] section"
      - pattern: '^meer\s+(fers|meldingen|natuurmusea|over|telefoonnummers|weten)$'
        discard_reason: "More [X] section"
      - pattern:
'^meest\s+(bekeken|recente)\b' discard_reason: "Most [X] section" - pattern: '^recente\s+berichten$' discard_reason: "Recent posts" - pattern: '^laatst(e)?\s+(verschenen|update|nieuws)$' discard_reason: "Latest [X] section" links_references: description: "Links and reference sections" patterns: - pattern: '^link\s+naar\b' discard_reason: "Link to" - pattern: '^links\s+\w+$' discard_reason: "Links section" - pattern: '^interessante\s+links$' discard_reason: "Interesting links" - pattern: '^partner\s+links$' discard_reason: "Partner links" - pattern: '^nuttige\s+websites$' discard_reason: "Useful websites" - pattern: '^wikipedia\s+\w+$' discard_reason: "Wikipedia reference" reports_complaints: description: "Reports and complaints sections" patterns: - pattern: '^klacht\s+\w+$' discard_reason: "Complaint section" - pattern: '^klachten\s+\w+$' discard_reason: "Complaints section" - pattern: '^meldingen\s+(en|zonder|over)\b' discard_reason: "Reports section" - pattern: '^meld\s+(direct|het|overlast)$' discard_reason: "Report CTA" - pattern: '^incident\s+melden$' discard_reason: "Report incident" - pattern: '^storing\s+melden$' discard_reason: "Report malfunction" - pattern: '^schade\s+\w+\s+melden$' discard_reason: "Report damage" - pattern: '^spoedeisende\s+meldingen$' discard_reason: "Emergency reports" governance_policy: description: "Governance and policy content" patterns: - pattern: '^committee\s+van\b' discard_reason: "Committee section" - pattern: '^governance\s+code\b' discard_reason: "Governance code" - pattern: '^coordinated\s+vulnerability\b' discard_reason: "Security policy" - pattern: '^privacyverklaring\s+\w+$' discard_reason: "Privacy statement" - pattern: '^wijziging\s+privacyverklaring$' discard_reason: "Privacy statement change" - pattern: '^voorwaarden\s+(en|zonder)$' discard_reason: "Terms section" - pattern: '^richtlijnen\s+en\b' discard_reason: "Guidelines section" - pattern: '^wet\s+open\s+overheid$' discard_reason: "Open government law" - 
pattern: '^toetsing\s+\w+$' discard_reason: "Assessment section" - pattern: '^toezicht\s+en\b' discard_reason: "Supervision section" - pattern: '^verbonden\s+partijen$' discard_reason: "Related parties" services_facilities: description: "Services and facilities content" patterns: - pattern: '^service\s+contact$' discard_reason: "Service contact" - pattern: '^servicepunt\s+\w+$' discard_reason: "Service point" - pattern: '^infopunt\s+\w+$' discard_reason: "Info point" - pattern: '^informatiepunt\s+\w+$' discard_reason: "Information point" - pattern: '^informatiecentrum\s+\w+$' discard_reason: "Information center" - pattern: '^kenniscentrum\s+\w+$' discard_reason: "Knowledge center" - pattern: '^uitleenpunt\s+\w+$' discard_reason: "Lending point" - pattern: '^portaal\s+\w+$' discard_reason: "Portal section" tickets_shop: description: "Tickets and shopping content" patterns: - pattern: '^tickets\s+(contact|en|kopen)$' discard_reason: "Tickets section" - pattern: '^tarieven\s+en\b' discard_reason: "Prices section" - pattern: '^winkelwagen\s+\w+$' discard_reason: "Shopping cart" - pattern: '^shopping\s+cart$' discard_reason: "Shopping cart" sponsors_support: description: "Sponsors and support content" patterns: - pattern: '^sponsors\s+en\b' discard_reason: "Sponsors section" - pattern: '^sponsoring\s+(en|aanvragen)$' discard_reason: "Sponsoring section" - pattern: '^steun\s+(orientalis|structureel)$' discard_reason: "Support section" - pattern: '^structurele\s+ondersteuning$' discard_reason: "Structural support" - pattern: '^subsidies\s+en\b' discard_reason: "Subsidies section" - pattern: '^subsidieverstrekkers\b' discard_reason: "Subsidy providers" - pattern: '^supporter\s+\w+$' discard_reason: "Supporter section" education_youth: description: "Education and youth content" patterns: - pattern: '^jeugd\s+en\s+onderwijs$' discard_reason: "Youth and education" - pattern: '^onderwijs\s+en\s+jeugd$' discard_reason: "Education and youth" - pattern: '^scholen\s+\w+$' 
discard_reason: "Schools section" - pattern: '^voor\s+(wo\s+ii|bezoekers|de\s+(jeugd|media|pers)|het\s+onderwijs|onderwijsinstellingen|professionals)$' discard_reason: "For [audience] section" - pattern: '^voorschoolse\s+\w+$' discard_reason: "Preschool section" - pattern: '^kids\s+academy$' discard_reason: "Kids academy" - pattern: '^kinderen\s+bij\b' discard_reason: "Children section" - pattern: '^middelbaar\s+beroepsonderwijs$' discard_reason: "Vocational education" - pattern: '^stages\s+en\s+afstuderen$' discard_reason: "Internships section" volunteers_staff: description: "Volunteers and staff content" patterns: - pattern: '^vrijwilligersuitje\s+\w+$' discard_reason: "Volunteer outing" - pattern: '^welkom\s+nieuwe\s+vrijwilliger$' discard_reason: "Welcome new volunteer" - pattern: '^vacature\s+\w+$' discard_reason: "Vacancy section" - pattern: '^werken\s+bij\b' discard_reason: "Work at section" - pattern: '^medewerkers\s+zoeken$' discard_reason: "Search employees" technical_website: description: "Technical website elements" patterns: - pattern: '^no\s+events$' discard_reason: "No events message" - pattern: '^recent\s+(comments|posts)$' discard_reason: "Recent content" - pattern: '^search\s+submit\b' discard_reason: "Search submit" - pattern: '^share\s+this$' discard_reason: "Share this" - pattern: '^statistics\s+statistics$' discard_reason: "Statistics" - pattern: '^strictly\s+necessary$' discard_reason: "Cookie notice" - pattern: '^system\s+management$' discard_reason: "System management" - pattern: '^my\s+account$' discard_reason: "My account" - pattern: '^other\s+languages$' discard_reason: "Language selector" - pattern: '^product\s+families$' discard_reason: "Product categories" - pattern: '^form\s+submissions$' discard_reason: "Form submissions" - pattern: '^global\s+websites$' discard_reason: "Global websites" - pattern: '^inloggen\s+leden$' discard_reason: "Member login" - pattern: '^inhoud\s+website$' discard_reason: "Website contents" - pattern: 
'^inhoudsopgave\s+inhoudsopgave$' discard_reason: "Table of contents" foreign_language: description: "Foreign language navigation" patterns: - pattern: '^mit\s+dem\s+(auto|fahrrad)$' discard_reason: "German directions" - pattern: '^kontakt\s+und\b' discard_reason: "German contact" - pattern: '^polskie\s+informacje$' discard_reason: "Polish information" - pattern: '^preparez\s+votre\b' discard_reason: "French prepare" - pattern: '^folgen\s+sie\b' discard_reason: "German follow" - pattern: '^sich\s+einschreiben$' discard_reason: "German register" - pattern: '^international\s+visitors$' discard_reason: "International visitors" miscellaneous_content: description: "Miscellaneous content phrases" patterns: - pattern: '^(feesten|geboorte|groen|foto|inkoop|eten|zien|beeld|groepen|genealogie|wonen)\s+en\s+\w+$' discard_reason: "X and Y content phrase" - pattern: '^\w+\s+en\s+(gebruiken|overlijden|onderhoud|film|aanbesteden|drinken|doen|geluid|bidprentjes|rondleidingen|leefomgeving|ontginning|links)$' discard_reason: "X and Y content phrase" - pattern: '^het\s+(geheugen|geheim|ontstaan|kantoor|heemhuis|lichtruim|natuurhistorisch|nevelhorstmeer|olieslaan|schip|stift|veenkloosterbos)\b' discard_reason: "Het X section" - pattern: '^de\s+(dorpsdokter|drie|egeling|japanse|klinker|oude)\s+' discard_reason: "De X section" - pattern: '^de\s+(atlantikwall|basis|bilt|bongard|buffer|haarslag|klok|kring|lindenhoeve|mansjes|mariahoeve|nestbouwers|noodwoning|omgevingswet|quiz|skriemer|vlotter|wazerweijen|werf|zoolstede)$' discard_reason: "De X place/thing" - pattern: '^genieten\s+van\b' discard_reason: "Enjoy section" - pattern: '^hulp\s+bij\b' discard_reason: "Help with" - pattern: '^opening\s+museum$' discard_reason: "Museum opening" - pattern: '^renovatie\s+\w+$' discard_reason: "Renovation section" - pattern: '^verhuizing\s+naar\b' discard_reason: "Move to" - pattern: '^home\s+contact\b' discard_reason: "Home contact navigation" - pattern: '^in\s+(buurthuis|de\s+(laar|stad))\b' 
discard_reason: "In location" - pattern: '^(vier|zes)\s+(typen|kernen)\b' discard_reason: "Number types" - pattern: '^werken\s+met\s+\w+$' discard_reason: "Working with" - pattern: '^werkgebied\s+\w+$' discard_reason: "Work area" - pattern: '^werk\s+in\s+uitvoering$' discard_reason: "Work in progress" - pattern: '^serie\s+\w+$' discard_reason: "Series section" - pattern: '^dromen\s+denken\s+doen$' discard_reason: "Slogan" - pattern: '^founding\s+fathers$' discard_reason: "Founding fathers section" - pattern: '^green\s+team$' discard_reason: "Green team" - pattern: '^kijk\s+en\s+beleef$' discard_reason: "Look and experience" - pattern: '^toen\s+en\s+nu$' discard_reason: "Then and now" - pattern: '^trein\s+en\s+spoor$' discard_reason: "Train and track" - pattern: '^uit\s+in\b' discard_reason: "Out in" - pattern: '^vrij\s+zoeken$' discard_reason: "Free search" - pattern: '^object\s+van\s+de\s+maand$' discard_reason: "Object of the month" - pattern: '^objecten\s+\w+$' discard_reason: "Objects section" - pattern: '^post\s+en\s+e-mail$' discard_reason: "Post and email" - pattern: '^resultaat\s+\w+$' discard_reason: "Result section" - pattern: '^reviews\s+op\b' discard_reason: "Reviews on" - pattern: '^suggesties\s+en\b' discard_reason: "Suggestions section" - pattern: '^vergunningen\s+en\b' discard_reason: "Permits section" - pattern: '^verhalend\s+ontwerpen$' discard_reason: "Narrative design" - pattern: '^verhuizen\s+en\b' discard_reason: "Moving section" - pattern: '^verslagen\s+van\b' discard_reason: "Reports of" - pattern: '^verzonden\s+nieuwsbrieven$' discard_reason: "Sent newsletters" - pattern: '^windmolens\s+en\b' discard_reason: "Windmills section" - pattern: '^winter\s+in\b' discard_reason: "Winter in" - pattern: '^korte\s+lijnen$' discard_reason: "Short lines" - pattern: '^huidige\s+aanbod$' discard_reason: "Current offer" - pattern: '^iets\s+vragen$' discard_reason: "Ask something" - pattern: '^in\s+engeland$' discard_reason: "In England" - pattern: 
'^inkomende\s+telefoongesprekken$' discard_reason: "Incoming calls" - pattern: '^inleiding\s+\w+$' discard_reason: "Introduction" - pattern: '^inloop\s+\w+$' discard_reason: "Walk-in section" - pattern: '^internationale\s+samenwerking$' discard_reason: "International cooperation" - pattern: '^informatiebrochures\s+molens$' discard_reason: "Mill brochures" - pattern: '^info\s+borden$' discard_reason: "Info boards" - pattern: '^index\s+\w+$' discard_reason: "Index section" - pattern: '^samenwerking\s+met$' discard_reason: "Cooperation with" - pattern: '^schilderijen\s+kunstschilders$' discard_reason: "Paintings section" - pattern: '^planten\s+en\s+dieren$' discard_reason: "Plants and animals" - pattern: '^positieve\s+gezondheid$' discard_reason: "Positive health" - pattern: '^unieke\s+combinatie$' discard_reason: "Unique combination" - pattern: '^uittreksels\s+en\b' discard_reason: "Extracts section" - pattern: '^uitwisselen\s+van\b' discard_reason: "Exchange of" - pattern: '^voortgang\s+procedure$' discard_reason: "Progress procedure" - pattern: '^vorige\s+volgende\b' discard_reason: "Previous next navigation" - pattern: '^welkom\s+terug$' discard_reason: "Welcome back" - pattern: '^wereld\s+van\b' discard_reason: "World of" - pattern: '^wapen\s+van\b' discard_reason: "Coat of arms" - pattern: '^verdwenen\s+\w+$' discard_reason: "Disappeared section" - pattern: '^vernieuwing\s+museum\b' discard_reason: "Museum renewal" - pattern: '^vroegere\s+kringactiviteiten$' discard_reason: "Former activities" - pattern: '^vrouwelijke\s+engelandvaarders$' discard_reason: "Female England travelers" - pattern: '^wegkruisenwandelboekje\s+\w+$' discard_reason: "Cross walk booklet" - pattern: '^wegwerkzaamheden\s+en\b' discard_reason: "Roadworks section" - pattern: '^opgewekte\s+geschiedenissen$' discard_reason: "Cheerful histories" - pattern: '^omschrijving\s+van\b' discard_reason: "Description of" - pattern: '^ommetje\s+\w+$' discard_reason: "Short walk" - pattern: 
'^ondernemen(d)?\s+(in|nijeveen)$' discard_reason: "Entrepreneurship section" - pattern: '^ondernemers\s+kunnen\s+contact$' discard_reason: "Entrepreneurs contact" - pattern: '^onderwerpen\s+onderwerpen$' discard_reason: "Subjects section" - pattern: '^onderzoeksresultaten\s+\w+$' discard_reason: "Research results" - pattern: '^oorsprong\s+\w+$' discard_reason: "Origin section" - pattern: '^op\s+(de|het)\s+\w+$' discard_reason: "On the X" - pattern: '^openbare\s+inschrijving$' discard_reason: "Public registration" - pattern: '^openstelling\s+en\b' discard_reason: "Opening section" - pattern: '^pagina\s+voor\b' discard_reason: "Page for" - pattern: '^pakje\s+kunst$' discard_reason: "Package of art" - pattern: '^panorama\s+van\b' discard_reason: "Panorama of" - pattern: '^partner\s+webshop$' discard_reason: "Partner webshop" - pattern: '^pers\s+toolkit$' discard_reason: "Press toolkit" - pattern: '^persoonlijk\s+contact$' discard_reason: "Personal contact" - pattern: '^plaatselijk\s+belang\b' discard_reason: "Local interest" - pattern: '^poortinstructie\s+voor\b' discard_reason: "Gate instruction" - pattern: '^praat\s+mar\s+frysk$' discard_reason: "Speak Frisian" - pattern: '^recht\s+van\s+opstal$' discard_reason: "Right of superficies" - pattern: '^rabo\s+clubsupport$' discard_reason: "Rabo club support" - pattern: '^toegankelijk\s+voor\b' discard_reason: "Accessible for" - pattern: '^toelichting\s+beeldbank$' discard_reason: "Image bank explanation" - pattern: '^tijd\s+geconstateerd$' discard_reason: "Time detected" - pattern: '^varen\s+in\b' discard_reason: "Sailing in" - pattern: '^veel\s+gestelde\s+vragen$' discard_reason: "FAQ" - pattern: '^veilig\s+mailen$' discard_reason: "Safe email" - pattern: '^vakantie\s+in\b' discard_reason: "Holiday in" - pattern: '^van\s+(nieuwegeinse\s+bodem|noord|wirskaante|de\s+(bestuurstafel|voorzitter))$' discard_reason: "From X section" - pattern: '^fiscaal\s+nummer$' discard_reason: "Tax number" - pattern: 
'^financiele\s+verantwoording$' discard_reason: "Financial accountability" - pattern: '^nieuwe\s+(aanwinsten|zaak\s+starten)$' discard_reason: "New acquisitions/start" - pattern: '^nieuw\s+wachtwoord\s+aanvragen$' discard_reason: "Request new password" - pattern: '^minder\s+valide$' discard_reason: "Disabled access" - pattern: '^methode\s+van\b' discard_reason: "Method of" - pattern: '^kaart\s+kernen$' discard_reason: "Map cores" - pattern: '^molens\s+(in|loil)$' discard_reason: "Mills section" - pattern: '^of\s+zocht\s+u$' discard_reason: "Or did you search" - pattern: '^oude\s+(ansichtkaarten|films|kerkhof)$' discard_reason: "Old X section" - pattern: '^straten\s+in\b' discard_reason: "Streets in" - pattern: '^studie\s+hoek$' discard_reason: "Study corner" - pattern: '^kunstenaars\s+in\b' discard_reason: "Artists in" - pattern: '^na\s+wo\s+ii$' discard_reason: "After WW2" - pattern: '^rijwielvordering\s+wo\s+ii$' discard_reason: "WW2 bicycle requisition" - pattern: '^rijnlands\s+vastgoed$' discard_reason: "Rijnland real estate" - pattern: '^rijnlandse\s+mascottes$' discard_reason: "Rijnland mascots" - pattern: '^rietwijk\s+of\s+reewijk$' discard_reason: "Rietwijk or Reewijk" - pattern: '^roggel\s+(leef|en\s+omgeving)$' discard_reason: "Roggel section" - pattern: '^roggelse\s+verenigingen$' discard_reason: "Roggel associations" - pattern: '^rozet\s+voor\s+jou$' discard_reason: "Rozet for you" - pattern: '^veluws\s+schoon$' discard_reason: "Clean Veluwe" - pattern: '^katolieke\s+emancipatie$' discard_reason: "Catholic emancipation" - pattern: '^keur\s+van\s+grafstenen$' discard_reason: "Selection of gravestones" specific_false_positives: description: "Specific strings identified as false positives" patterns: - pattern: '^foto\s+(actief|herkenning|inzenden|album)$' discard_reason: "Photo section" - pattern: '^foto\s+kenneth\s+stamp$' discard_reason: "Photo credit" - pattern: '^comics\s+plus$' discard_reason: "Product name" - pattern: '^canon\s+production\b' 
discard_reason: "Canon production" - pattern: '^cultuurimpuls\b' discard_reason: "Culture impulse" - pattern: '^edmond\s+\w+\s+penning$' discard_reason: "Medal name" - pattern: '^eigen\s+uitgaven$' discard_reason: "Own publications" - pattern: '^een\s+australische\b' discard_reason: "An Australian" - pattern: '^brabants\s+heem$' discard_reason: "Brabant heritage" - pattern: '^buurt\s+battle$' discard_reason: "Neighborhood battle" - pattern: '^middengebied\b' discard_reason: "Middle area" - pattern: '^zwolse\s+parken$' discard_reason: "Zwolle parks" - pattern: '^zandeind\s+in\b' discard_reason: "Zandeind location" - pattern: '^zelf\s+bewaren$' discard_reason: "Self storage" - pattern: '^zeldzame\s+voorwerpen$' discard_reason: "Rare objects" - pattern: '^woldzigt\s+agenda$' discard_reason: "Woldzigt agenda" - pattern: '^acht\s+van\s+chaam$' discard_reason: "Eight of Chaam" - pattern: '^bij\s+de\s+barones$' discard_reason: "At the baroness" - pattern: '^boek\s+elle\s+klop$' discard_reason: "Book title" - pattern: '^bemmel\s+ressen\b' discard_reason: "Place names" - pattern: '^aold\s+hoksebarge$' discard_reason: "Dialect place name" - pattern: '^bientien\s+over\b' discard_reason: "Room name" - pattern: '^tonnie\s+en\s+kee\b' discard_reason: "Show characters" - pattern: '^den\s+brouwer$' discard_reason: "Place/building name" - pattern: '\bop\s+gastenboek\b' discard_reason: "Guestbook navigation" - pattern: '^de\s+laar$' discard_reason: "Place name" - pattern: '^serie\s+droge\s+voeten$' discard_reason: "Publication series" - pattern: '^familie\s+(bindels|janssen)$' discard_reason: "Family section header" - pattern: '^hof\s+loil$' discard_reason: "Place name" - pattern: '^hoolten\s+klinte$' discard_reason: "Dialect place name" - pattern: '^verhildersum\s+to\s+go$' discard_reason: "Product name" - pattern: '^voerman\s+verwondert$' discard_reason: "Exhibition title" - pattern: '^heemskerker\s+ezels$' discard_reason: "Organization name" - pattern: '^jolly\s+duck$' 
discard_reason: "Venue name" - pattern: '^maria\s+kleuterschool$' discard_reason: "School name" - pattern: '^waterlandsmuseum\s+de\s+speeltoren$' discard_reason: "Museum name" - pattern: '^pelt\s+als\s+architect$' discard_reason: "Article title" - pattern: '^mierlo\s+puzzel$' discard_reason: "Puzzle name" - pattern: '^ozosnel\s+fandagen$' discard_reason: "Event name" - pattern: '^spijkerserve\s+\w+$' discard_reason: "Place name" - pattern: '^stalpers\s+opleidingen\b' discard_reason: "Training company" - pattern: '^taalbrigade\s+kids$' discard_reason: "Program name" - pattern: '^numaga\s+(excursies|jaarboek)$' discard_reason: "Numaga section" - pattern: '^meierijse\s+schoutsrekeningen$' discard_reason: "Historical records" - pattern: '^nieuwegein\s+lokaal$' discard_reason: "Local Nieuwegein" - pattern: '^nieuwjaarke\s+zingen$' discard_reason: "New Year singing" - pattern: '^nieuwveense\s+landen$' discard_reason: "Place name" - pattern: '^oijen\s+en\s+teeffelen$' discard_reason: "Place names" - pattern: '^molukse\s+(graven|muziek)$' discard_reason: "Moluccan section" - pattern: '^kruisen\s+en\s+\w+$' discard_reason: "Crosses section" # ============================================================================= # RELATIONSHIP PREDICATES REFERENCE # From: data/entity_annotation/modules/advanced/relationship_annotations.yaml # ============================================================================= relationship_predicates: organizational: - id: org:memberOf description: "Entity is member of organization" domain: [AGT.PER, GRP.*] range: [GRP.*] - id: org:subOrganizationOf description: "Organization is part of larger organization" domain: [GRP.*] range: [GRP.*] - id: org:unitOf description: "Organizational unit is part of organization" domain: [GRP.UNT] range: [GRP.*] - id: org:hasSite description: "Organization has location/building" domain: [GRP.*] range: [TOP.BLD, TOP.SET] - id: org:linkedTo description: "Organization is linked to another" domain: [GRP.*] range: 
[GRP.*] - id: org:role description: "Organization has role/position" domain: [GRP.*] range: [ROL.*] spatial: - id: schema:location description: "Entity is located at place" domain: [GRP.*, TOP.BLD] range: [TOP.SET, TOP.REG, TOP.ADR] - id: schema:areaServed description: "Organization serves geographic area" domain: [GRP.*] range: [TOP.SET, TOP.REG] - id: crm:P53_has_former_or_current_location description: "Heritage site location (temporal)" domain: [TOP.BLD, TOP.FEA] range: [TOP.SET] creative: - id: dcterms:publisher description: "Work published by organization" domain: [WRK.*] range: [GRP.*] - id: dcterms:references description: "Entity references work" domain: [GRP.*, AGT.PER] range: [WRK.*] - id: schema:mentions description: "Entity mentions person/thing" domain: [GRP.*, WRK.*] range: [AGT.PER, GRP.*] identity: - id: owl:sameAs description: "Entities are the same" domain: ["*"] range: ["*"]