1393 lines
38 KiB
Python
1393 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cleanup false positives from web_contact_data extraction - v2.
|
|
|
|
This version uses ONLY explicit patterns for things that are DEFINITIVELY
|
|
not person names. No heuristics about "what names look like".
|
|
|
|
Pattern categories:
|
|
1. URLs and technical strings
|
|
2. Website navigation elements
|
|
3. Form labels and button text
|
|
4. Generic section headers
|
|
5. Organization type suffixes with preceding modifiers
|
|
6. Dutch phrase patterns that are never names
|
|
7. Famous historical figures (references, not contacts)
8. Single-word false positives (exact matches)
|
|
"""
|
|
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Tuple
|
|
|
|
|
|
# =============================================================================
|
|
# CATEGORY 1: URLs and Technical Strings
|
|
# =============================================================================
|
|
# Regex sources (compiled later with re.IGNORECASE) for values that are URLs,
# file paths or opaque technical identifiers rather than person names.
URL_TECHNICAL_PATTERNS = [
    # Values that start like a web address
    r'^https?://',
    r'^www\.',

    # Values that end in a web-page file extension
    r'\.html$',
    r'\.php$',
    r'\.aspx?$',

    # Path fragments (matched anywhere in the value)
    r'/photos/',
    r'/places/',

    # Opaque identifier prefixes
    r'^ChIJ',     # Google Place IDs
    r'^AWn5SU',   # Google photo IDs

    # CMS name, matched anywhere in the value
    r'WordPress',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 2: Website Navigation Elements
|
|
# =============================================================================
|
|
# Regex sources for website navigation chrome (menu toggles, skip links,
# "back to ..." links, social-media follow prompts).
NAVIGATION_PATTERNS = [
    # Menu toggles and page-position links
    r'^menu\s+schakelen$',
    r'^go\s+to\s+top$',
    r'^page\s+load\s+link$',

    # "Skip to / jump to / go to / back to ..." links
    r'^skip\s+to\b',
    r'^jump\s+to\b',
    r'^ga\s+naar\b',
    r'^terug\s+naar\b',
    r'^naar\s+(de|het|inhoud|menu)\b',
    r'^back\s+to\b',

    # Named navigation regions
    r'^footer\s+navigatie$',
    r'^hoofd\s*navigatie$',   # \s* : written with or without a space

    # "Follow us" prompts
    r'^volg\s+ons\b',
    r'^follow\s+(us|this)\b',
    r'^folgen\s+sie\b',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 3: Form Labels and Button Text
|
|
# =============================================================================
|
|
# Regex sources for form placeholders, field prompts and button captions.
FORM_BUTTON_PATTERNS = [
    # Input prompts ("type here", "fill in", "select", "choose ...")
    r'^typ\s+hier\b',
    r'^vul\s+in\b',
    r'^selecteer\b',
    r'^kies\s+',

    # Action buttons; the optional groups cover verb/noun variants
    r'^zoek(en)?(\s+in)?$',
    r'^aanmeld(en|ing)$',
    r'^afmeld(en|ing)$',
    r'^reserv(eren|ering)$',
    r'^verzend(en)?$',

    # Consent / cookie-banner buttons
    r'^accepteer\b',
    r'^afwijzen\b',
    r'^akkoord$',
    r'^instellingen\s+opslaan$',
    r'^alles\s+inschakelen$',

    # Account-related captions
    r'^gegevens\s+wijzigen$',
    r'^onthouden\s+inloggen$',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 4: Generic Section Headers
|
|
# =============================================================================
|
|
# Regex sources for generic page/section headings. Every entry is anchored at
# both ends, so only the complete heading text matches.
SECTION_HEADER_PATTERNS = [
    # News / about / organisation headings
    r'^laatste\s+nieuws$',
    r'^over\s+(ons|deze)$',
    r'^missie\s+en\s+visie$',
    r'^het\s+bestuur$',
    r'^de\s+stichting$',
    r'^de\s+vereniging$',

    # Contact and visiting information
    r'^contact$',
    r'^contactgegevens$',
    r'^bereikbaarheid$',
    r'^openingstijden$',
    r'^parkeren$',
    r'^bezoekadres$',
    r'^postadres$',

    # Legal / site-meta pages
    r'^privacybeleid$',
    r'^disclaimer$',
    r'^colofon$',
    r'^sitemap$',
    r'^veelgestelde\s+vragen$',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 5: Organization Names (not persons)
|
|
# Organizations are identified by specific patterns that combine:
|
|
# - Article (De/Het) + place/descriptor + organization type
|
|
# - Place name + organization type
|
|
# =============================================================================
|
|
# Regex sources for organisation, institution and building names that were
# extracted as if they were persons.
ORGANIZATION_PATTERNS = [
    # Historical / local-heritage societies, e.g. "Historische Vereniging [Place]"
    r'^historische\s+vereniging\b',
    r'^heemkundige\s+kring\b',
    r'^heemkunde\s*kring\b',
    r'^heemkunde\s+werkgroep\b',
    r'^historische\s+werkgroep\b',
    r'^oudheidkundige?\s+(kring|vereniging)\b',
    r'^heemkundevereniging\b',

    # Organisation type preceded by a single modifier word ending in -s/-se
    # or -er (typically a geographic adjective)
    r'^\w+se?\s+(historische\s+)?(vereniging|kring|werkgroep|stichting|genootschap)$',
    r'^\w+er\s+handelsvereniging$',  # Meppeler Handelsvereniging

    # Municipalities: "Gemeente [Place]"
    r'^gemeente\s+[\w-]+$',
    r'^gemeentehuis\s+[\w-]+$',
    r'^gemeente\s+archieven$',

    # Libraries whose name is "bibliotheek" + a word starting with a
    # compass direction (noord/zuid/oost/west)
    r'^(de\s+)?bibliotheek\s+(noord|zuid|oost|west)[\w-]+$',

    # Museums, archives and libraries
    r'^(het|de)\s+\w+\s*(museum|archief|bibliotheek)$',
    r'^\w+\s+\w+\s+museum$',  # three words ending in "museum": "Pieter Vermeulen Museum"
    # qualifier + (compound) museum/archive: "Nederlands Graanmuseum"
    r'^(nationaal|nederlands|nederlandse|oudheidkundig|virtueel)\s+\w*(museum|archief)$',

    # Regional / city / municipal archives
    r'^regionaal\s+archief\b',
    r'^stadsarchief\b',
    r'^gemeentearchief\b',

    # Buildings and institutions
    r'^bureau\s+\w+$',  # Bureau Stadsnatuur
    r'^bouwbedrijf\s+\w+$',
    r'^buurthuis\s+\w+$',
    r'^hoeve\s+(de\s+)?\w+$',
    r'^herberg\s+(de\s+)?\w+$',
    r'^huis\s+(ten|van|de)\s+\w+$',  # Huis ten Bosch, Huis van Oud
    r'^botanische\s+tuin\b',
    r'^aula\s+\w+$',

    # Contact / info points followed by one word
    r'^(contactpersoon|informatie|infopunt|informatiepunt)\s+\w+$',

    # Publications and products
    r'^jaarboek(en)?\s+\w+$',
    r'^uitgaven\s+\w+$',
    r'^carillon\s+\w+$',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 6: Dutch Phrase Patterns (definitively not names)
|
|
# =============================================================================
|
|
# Regex sources for Dutch (and a few English/German/other) phrases, section
# titles, place names and organisation names that are never person names.
#
# Order matters only for the reported reason: the first matching entry wins.
# Entries that appeared twice verbatim have been reduced to their first
# occurrence, so matching results and reported reasons are unchanged.
PHRASE_PATTERNS = [
    # "[Noun] en [noun]" patterns - things connected by "en" that aren't names
    r'^(feesten|geboorte|groen|foto|inkoop|eten|zien|beeld|groepen|genealogie|wonen)\s+en\s+\w+$',
    r'^\w+\s+en\s+(gebruiken|overlijden|onderhoud|film|aanbesteden|drinken|doen|geluid|bidprentjes|rondleidingen|leefomgeving|ontginning|links)$',
    r'^gezocht\s+en\s+\w+$',
    r'^filmpjes\s+en\s+links$',

    # "Het/De [X] van [Y]" publication/place patterns
    r'^het\s+(geheugen|geheim|ontstaan|kantoor|heemhuis|lichtruim|natuurhistorisch|nevelhorstmeer|olieslaan|schip|stift|veenkloosterbos)\b',
    r'^de\s+(dorpsdokter|drie|egeling|japanse|klinker|oude)\s+',

    # Common website content phrases
    r'^lees\s+meer\b',
    r'^bekijk\s+(de|het|alle|meer)\b',
    r'^download\s+(de|het)\b',
    r'^meer\s+info(rmatie)?$',
    r'^handige\s+(info|links)$',
    r'^gesloten\s+op\b',
    r'^gratis\s+(toegang|qr)\b',
    r'^hulp\s+bij\b',
    r'^opening\s+museum$',
    r'^over\s+(de|het|dekema)\s+',
    r'^renovatie\s+\w+$',
    r'^verhuizing\s+naar\b',
    r'^home\s+contact\b',
    r'^in\s+(buurthuis|de\s+(laar|stad))\b',

    # Government website patterns
    r'^gedeputeerde\s+staten$',
    r'^provinciale\s+staten$',
    r'^burgemeester\s+en\s+wethouders$',
    r'^commissaris\s+van\s+de\s+koning$',
    r'^raad\s+van\s+state$',

    # Actions/instructions
    r'^meld\s+(je|u|een)\b',
    r'^geef\s+(je|uw)\b',
    r'^word[t]?\s+(lid|vriend|abonnee)$',
    r'^steun\s+(het|de|ons)\b',
    r'^huur\s+(een|het|de|eigendom)\b',

    # Water board / government service descriptions (not persons!)
    r'^(legger|peilbesluit|proefsluiting|vervanging|vernieuwen|onderhoud|metingen|bediening)\s+',
    r'^(vier|zes)\s+(typen|kernen)\b',
    r'^werken\s+bij\b',
    r'^wat\s+doen\s+we$',
    r'^waterschapsbelasting\b',
    r'^ons\s+gebied\b',
    r'^handhavingsverzoek\b',
    r'^sponsoring\s+aanvragen$',
    r'^tijdelijke\s+verkeersmaatregelen$',
    r'^green\s+team$',
    r'^stages\s+en\s+afstuderen$',
    r'^serie\s+\w+$',

    # Historical/heritage website content
    r'^historie\s+(van\s+)?\w+$',
    r'^historisch(e)?\s+(coevorden|spektakel|avond|fietsroute|geografie|groenten|projecten|wandeling)\b',
    r'^gevelstenen\s+in\b',
    r'^grafvondst\s+bij\b',
    r'^genieten\s+van\b',

    # Governance/committee references (not persons)
    r'^committee\s+van\b',
    r'^governance\s+code\b',
    r'^coordinated\s+vulnerability\b',

    # Products/publications/services
    r'^foto\s+(actief|herkenning|inzenden|album)$',
    r'^foto\s+kenneth\s+stamp$',
    r'^comics\s+plus$',
    r'^canon\s+production\b',
    r'^cultuurimpuls\b',
    r'^dromen\s+denken\s+doen$',
    r'^erfgoedcollecties\s+van\b',
    r'^een\s+australische\b',
    r'^edmond\s+\w+\s+penning$',
    r'^eigen\s+uitgaven$',
    r'^flickr\s+fotoalbum$',
    r'^founding\s+fathers$',

    # Place references (not persons)
    r'^bemmel\s+ressen\b',
    r'^dekema\s+state\b',
    r'^brabants\s+heem$',
    r'^buurt\s+battle$',
    r'^middengebied\b',
    r'^zwolse\s+parken$',
    r'^zandeind\s+in\b',
    r'^zakelijke\s+bijeenkomsten$',
    r'^zelf\s+bewaren$',
    r'^zeldzame\s+voorwerpen$',
    r'^woldzigt\s+agenda$',
    r'^acht\s+van\s+chaam$',
    r'^bij\s+de\s+barones$',
    r'^boek\s+elle\s+klop$',

    # Courses and training
    r'^cursus\s+\w+$',

    # Schedule/time references (not persons)
    r'^dag\s+tijden$',
    r'^\w+dag\s+gesloten$',  # Dinsdag Gesloten, Maandag Gesloten, etc.
    r'^goede\s+vrijdag$',

    # Technical/business terms
    r'^form\s+submissions$',
    r'^financial\s+controller$',
    r'^global\s+websites$',
    r'^google\s+maps$',
    r'^gebruik\s+google\s+maps$',
    r'^fiscaal\s+nummer$',
    r'^financiele\s+verantwoording$',

    # Heritage organizations (not persons)
    r'^erfgoed\s+(brabant|gelderland|zeeland|limburg|utrecht|friesland|drenthe|overijssel|flevoland|groningen)$',
    r'^flevolands\s+geheugen$',
    r'^fryske\s+akademy$',
    r'^gelderse\s+kerken$',
    r'^groninger\s+waddenmusea$',

    # Studios and institutions (not persons)
    r'^dansstudio\s+\w+$',

    # Geographic/place patterns - "De [X]" where X is clearly a place/thing
    r'^de\s+(atlantikwall|basis|bilt|bongard|buffer|haarslag|klok|kring|lindenhoeve|mansjes|mariahoeve|nestbouwers|noodwoning|omgevingswet|quiz|skriemer|vlotter|wazerweijen|werf|zoolstede)$',

    # Newspaper/publication names
    r'^dedemsvaartse\s+courant$',

    # Fort/building names (places, not persons)
    r'^fort\s+\w+$',
    r'^grafheuvel\s+\w+$',

    # Explicit place references
    r'^aula\s+oude\s+begraafplaats$',

    # Generic section/group names
    r'^familie\s+(bindels|janssen)$',  # Family name sections (not specific persons)

    # Activity/event descriptions
    r'^groepen\s+aanmelden$',
    r'^groenblauw\s+buurt',
    r'^grote\s+waternavel$',
    r'^gezonde\s+visstand$',
    r'^geveltjes\s+lezen$',
    r'^gesteunde\s+projecten$',
    r'^grensoverschrijdend\s+gedrag$',
    r'^genealogische\s+begrippen$',
    r'^gemeentelijke\s+(bekendmakingen|belastingen)$',
    r'^gemakkelijk\s+starten$',
    r'^gemeenlandshuis\s+\w+$',

    # Libraries as places (not persons)
    r'^kinderbibliotheek\s+\w+$',

    # Complaints/reports
    r'^klacht\s+\w+$',
    r'^klachten\s+\w+$',
    r'^meldingen\s+(en|zonder|over)\b',
    r'^meld\s+(direct|het|overlast)$',
    r'^incident\s+melden$',
    r'^storing\s+melden$',
    r'^schade\s+\w+\s+melden$',

    # Place names with Klein/Groot
    r'^klein\s+(amerika|rome|zundert)$',

    # Call to action patterns
    r'^kom\s+(in|verder)$',
    r'^klik\s+voor\b',

    # Religious monuments
    r'^kruisen\s+en\s+\w+$',
    r'^kruisbeeld\s+op\b',
    r'^mariakapel\s+\w+$',
    r'^sint\s+\w+(kerk|gebouw)$',
    r'^protestantse\s+(kerk|pastorie)$',
    r'^kapel\s+van\b',

    # Estates/landgoed
    r'^landgoed\s+\w+$',
    r'^landgoedrondleiding\b',
    r'^landschapspark\s+\w+$',
    r'^landkaart\s+\w+$',

    # Membership sections
    r'^leden\s+(administratie|en\s+lidmaatschap)$',
    r'^lid\s+(worden|worden\s+inloggen)$',
    r'^lidmaatschap\s+\w+$',
    r'^soort\s+lidmaatschap$',

    # Events/lectures
    r'^lezingen\s+en\s+\w+$',

    # Links
    r'^link\s+naar\b',
    r'^links\s+\w+$',
    r'^interessante\s+links$',
    r'^partner\s+links$',
    r'^nuttige\s+websites$',

    # Locations
    r'^locatie\s+\w+$',
    r'^locaties\s+\w+$',
    r'^overige\s+locaties$',
    r'^vestiging\s+\w+$',

    # More/most sections
    r'^meer\s+(fers|meldingen|natuurmusea|over|telefoonnummers|weten)$',
    r'^meest\s+(bekeken|recente)\b',

    # Menu navigation
    r'^menu\s+overslaan$',

    # Monuments
    r'^monument(en)?\s+(in|didam|loil|nieuw-dijk|oud-dijk|buurtschap)\b',
    r'^monumentencommissie\s+\w+$',
    r'^nationaal\s+monument$',

    # Form fields
    r'^naam\s+(specialisme|en\s+voornaam|omschrijving)$',
    r'^voornaam\s+tussenvoegsel\b',
    r'^je\s+naam$',
    r'^ik\s+ben$',

    # Navigation "Naar X"
    r'^naar\s+(google\s+maps|bestuurspagina|boven\s+scrollen|veelgestelde\s+vragen)$',

    # National organizations/places
    r'^nationaal\s+(fietsmuseum|glasmuseum|scheepsarcheologisch)\b',
    r'^nationale\s+ombudsman$',
    r'^nederlands(e)?\s+(college|hervormde|genealogische)\b',

    # Technical/website
    r'^no\s+events$',
    r'^recent\s+(comments|posts)$',
    r'^search\s+submit\b',
    r'^shopping\s+cart$',
    r'^share\s+this$',
    r'^statistics\s+statistics$',
    r'^strictly\s+necessary$',
    r'^system\s+management$',
    r'^my\s+account$',
    r'^other\s+languages$',
    r'^product\s+families$',

    # Object sections
    r'^object\s+van\s+de\s+maand$',
    r'^objecten\s+\w+$',

    # Online services
    r'^online\s+(afspraak|betalen|doneren|exposities|platform|reserveren|vraag)$',

    # "Ons X" sections
    r'^ons\s+(adres|bestuur|huisblad|kantoor|team|werkgebied)$',

    # "Ontdek X" sections
    r'^ontdek\s+(de|jouw|ons)$',

    # "Ontstaan X" sections
    r'^ontstaan\s+\w+$',

    # "Over X" sections (about pages)
    r'^over\s+(batavialand|bergh|haaksbergen|heemskerk|lkca|laren|museumpark|numaga|nuwelant|rijnland|roosendaal|rozet|ruurd|onze\s+website)$',

    # "Overige X" sections
    r'^overige\s+(uitgaven|documenten|locaties|organisaties|vrijwilligers)$',

    # "Overzicht X" sections
    r'^overzicht\s+(rijksmonumenten|skriemers|archeologische|bouwlocaties|exposities|formulieren|tijdschriften)$',

    # Plans and routes
    r'^plan\s+uw\b',
    r'^route\s+(en|per)$',

    # Portals
    r'^portaal\s+\w+$',

    # Post/email
    r'^post\s+en\s+e-mail$',

    # Privacy
    r'^privacyverklaring\s+\w+$',
    r'^wijziging\s+privacyverklaring$',

    # Profiel
    r'^profiel\s+wijzigen$',

    # Program
    r'^programma\s+voor\s+groepen$',

    # Publicaties
    r'^publicaties\s+\w+$',

    # Raad
    r'^raad\s+van\s+(toezicht|state)$',

    # Recent
    r'^recente\s+berichten$',
    r'^laatst(e)?\s+(verschenen|update|nieuws)$',

    # Reserveer
    r'^reserveer\s+een\b',

    # Resultaat
    r'^resultaat\s+\w+$',

    # Reviews
    r'^reviews\s+op\b',

    # Richtlijnen
    r'^richtlijnen\s+en\b',

    # Rondleiding
    r'^rondleiding\s+\w+$',
    r'^rondleidingen\s+en\b',
    r'^rondwandeling\s+door\b',

    # Scans/schade
    r'^scans\s+aanvragen$',

    # Scholen
    r'^scholen\s+\w+$',

    # Schrijf
    r'^schrijf\s+(je|ons)$',

    # Scroll
    r'^scroll\s+naar\b',

    # Senior roles (not specific persons)
    r'^senior\s+(applicatiebeheerder|systeembeheerder)$',

    # Service
    r'^service\s+contact$',
    r'^servicepunt\s+\w+$',

    # Sich einschreiben (German)
    r'^sich\s+einschreiben$',

    # Sponsors
    r'^sponsors\s+en\b',
    r'^sponsoring\s+en\b',

    # Steun
    r'^steun\s+(orientalis|structureel)$',
    r'^structurele\s+ondersteuning$',

    # Stoomtrein
    r'^stoomtrein\s+\w+$',
    r'^kantoor\s+stoomtrein\b',

    # Straten
    r'^straten\s+in\b',

    # Streekmuseum/centrum
    r'^streekmuseum\s+\w+$',
    r'^streekhistorisch\s+centrum\b',

    # Studie
    r'^studie\s+hoek$',

    # Stuur
    r'^stuur\s+(een|foto)$',

    # Subsidie
    r'^subsidies\s+en\b',
    r'^subsidieverstrekkers\b',

    # Suggesties
    r'^suggesties\s+en\b',

    # Supporter
    r'^supporter\s+\w+$',

    # Tarieven
    r'^tarieven\s+en\b',

    # Te melden
    r'^te\s+melden\b',

    # Tegels
    r'^tegels\s+blief\b',

    # Theater
    r'^theater\s+(de|het|idea)$',
    r'^theaterschool\s+\w+$',

    # Thema
    r'^thema\s+avonden$',

    # Tickets
    r'^tickets\s+(contact|en|kopen)$',

    # Tijd
    r'^tijd\s+geconstateerd$',
    r'^tijdlijn\s+vondsten$',

    # Toegankelijk
    r'^toegankelijk\s+voor\b',

    # Toelichting
    r'^toelichting\s+beeldbank$',

    # Toen en nu
    r'^toen\s+en\s+nu$',

    # Toetsing/Toezicht
    r'^toetsing\s+\w+$',
    r'^toezicht\s+en\b',

    # Toneelvereniging
    r'^toneelvereniging\s+\w+$',

    # Traditiekamer
    r'^traditiekamer\s+\w+$',

    # Trein en Spoor
    r'^trein\s+en\s+spoor$',

    # Trouwen
    r'^trouwen\s+in\b',

    # Tweede (holidays)
    r'^tweede\s+(paasdag|pinksterdag)$',

    # Uit in
    r'^uit\s+in\b',

    # Uitgaven
    r'^uitgaven\s+\w+$',
    r'^uitgezonderd\b',

    # Uitleenpunt
    r'^uitleenpunt\s+\w+$',

    # Uittreksels
    r'^uittreksels\s+en\b',

    # Uitwisselen
    r'^uitwisselen\s+van\b',

    # Unieke
    r'^unieke\s+combinatie$',

    # Utrecht/buildings
    r'^utrecht\s+house\b',

    # Vacature
    r'^vacature\s+\w+$',

    # Vakantie
    r'^vakantie\s+in\b',

    # Van X (about sections, not persons)
    r'^van\s+(nieuwegeinse\s+bodem|noord|wirskaante|de\s+(bestuurstafel|voorzitter))$',

    # Vandaag/vanuit
    r'^vandaag\s+gesloten$',
    r'^vanuit\s+\w+$',

    # Varen
    r'^varen\s+in\b',

    # Veel gestelde vragen
    r'^veel\s+gestelde\s+vragen$',

    # Veilig
    r'^veilig\s+mailen$',

    # Verbonden
    r'^verbonden\s+partijen$',

    # Verdwenen
    r'^verdwenen\s+\w+$',

    # Verenigingsblad
    r'^verenigingsblad\s+\w+$',
    r'^verenigingsorgaan\s+\w+$',

    # Vergunningen
    r'^vergunningen\s+en\b',

    # Verhaal
    r'^verhalend\s+ontwerpen$',

    # Verhildersum
    r'^verhildersum\s+to\s+go$',

    # Verhuizen
    r'^verhuizen\s+en\b',

    # Verleden
    r'^verleden\s+tijdschrift$',

    # Vernieuwing
    r'^vernieuwing\s+museum\b',

    # Verslagen
    r'^verslagen\s+van\b',

    # Verstuur
    r'^verstuur\s+bericht$',

    # Verzonden
    r'^verzonden\s+nieuwsbrieven$',

    # Vind ons
    r'^vind\s+ons$',

    # Virtuele
    r'^virtuele\s+tour$',

    # Visie
    r'^visie\s+en\s+missie$',
    r'^missie\s+en\s+doelen$',

    # Voerman
    r'^voerman\s+verwondert$',

    # Voldoende
    r'^voldoende\s+water$',

    # Vondsten
    r'^vondsten\s+in\b',
    r'^lokale\s+vondsten$',

    # Voor X (sections)
    r'^voor\s+(wo\s+ii|bezoekers|de\s+(jeugd|media|pers)|het\s+onderwijs|onderwijsinstellingen|professionals)$',

    # Voorlopige
    r'^voorlopige\s+voorziening$',

    # Voorouders
    r'^voorouders\s+op\b',
    r'^larense\s+voorouders$',

    # Voorschoolse
    r'^voorschoolse\s+\w+$',

    # Voortgang
    r'^voortgang\s+procedure$',

    # Voorwaarden
    r'^voorwaarden\s+(en|zonder)$',

    # Vorige
    r'^vorige\s+volgende\b',

    # Vraag
    r'^vraag\s+(en|of|stellen|afvalpas)$',
    r'^vragen\s+(en|staat|over)\b',

    # Vrij zoeken
    r'^vrij\s+zoeken$',

    # Vrijwilligers
    r'^vrijwilligersuitje\s+\w+$',
    r'^welkom\s+nieuwe\s+vrijwilliger$',

    # Vrouwelijke
    r'^vrouwelijke\s+engelandvaarders$',

    # Vroegere
    r'^vroegere\s+kringactiviteiten$',

    # Wandel
    r'^wandel\s+en\b',
    r'^wandelapp\s+\w+$',
    r'^wandelen\s+en\s+fietsen$',
    r'^wandelkaart\s+\w+$',
    r'^struinpad\s+wandelingen$',

    # Wapen
    r'^wapen\s+van\b',

    # Wat X (sections)
    r'^wat\s+(doen|doet|we|wij)$',

    # Water
    r'^waterbeheer\s+en\b',
    r'^waterkwaliteit\s+\w+$',
    r'^waterschap\s+\w+$',
    r'^waterschapsverordening\s+en\b',
    r'^waterpeil\s+en\b',
    r'^natuur\s+en\s+waterkwaliteit$',
    r'^recreatie\s+rondom\s+water$',
    r'^landbouw\s+en\s+water(kwaliteit)?$',
    r'^klimaat\s+en\s+veiligheid$',
    r'^kaderrichtlijn\s+water$',
    r'^meten\s+van\s+de\s+waterkwaliteit$',

    # Webdesign/website
    r'^webdesign\s+bureau\b',
    r'^website\s+(beheer|gemeenteraad)$',

    # Weg
    r'^wegkruisenwandelboekje\s+\w+$',
    r'^wegwerkzaamheden\s+en\b',

    # Welkom
    r'^welkom\s+terug$',

    # Wensen
    r'^wensen\s+rondleiding$',

    # Wereld
    r'^wereld\s+van\b',

    # Werk
    r'^werk\s+in\s+uitvoering$',
    r'^werken\s+met\s+\w+$',
    r'^werkgebied\s+\w+$',
    r'^werkplaats\s+\w+$',
    r'^werkplein\s+\w+$',

    # Wet
    r'^wet\s+open\s+overheid$',

    # Wie
    r'^wie\s+(we|wij)\s+zijn$',

    # Wikipedia
    r'^wikipedia\s+\w+$',

    # Windmolens
    r'^windmolens\s+en\b',

    # Winkelwagen
    r'^winkelwagen\s+\w+$',

    # Winter
    r'^winter\s+in\b',

    # German/foreign navigation
    r'^mit\s+dem\s+(auto|fahrrad)$',
    r'^kontakt\s+und\b',
    r'^polskie\s+informacje$',
    r'^preparez\s+votre\b',

    # Miscellaneous section headers
    r'^komende\s+activiteiten$',
    r'^korte\s+lijnen$',
    r'^kunstenaars\s+in\b',
    r'^huidige\s+aanbod$',
    r'^iets\s+vragen$',
    r'^in\s+engeland$',
    r'^inkomende\s+telefoongesprekken$',
    r'^inhoud\s+website$',
    r'^inhoudsopgave\s+inhoudsopgave$',
    r'^inleiding\s+\w+$',
    r'^inloggen\s+leden$',
    r'^inloop\s+\w+$',
    r'^internationale\s+samenwerking$',
    r'^international\s+visitors$',
    r'^informatiebrochures\s+molens$',
    r'^informatiecentrum\s+\w+$',
    r'^infopunt\s+\w+$',
    r'^informatiepunt\s+\w+$',
    r'^info\s+borden$',
    r'^index\s+\w+$',

    # Specific place/building names
    r'^hof\s+loil$',
    r'^hoolten\s+klinte$',
    r'^hoofdkantoor\s+\w+$',
    r'^huize\s+\w+$',
    r'^kasteel\s+\w+$',
    r'^kastelen\s+(en|in)\b',
    r'^kamp\s+vught$',
    r'^kalkoven\s+\w+$',
    r'^pinetum\s+\w+$',
    r'^poppodium\s+\w+$',
    r'^polderdistrict\s+\w+$',
    r'^oudheidkamer\s+\w+$',
    r'^oudheidkundig\s+streekmuseum\b',
    r'^oude\s+raadhuis\b',
    r'^oude\s+(ansichtkaarten|films|kerkhof)$',
    r'^restaurant\s+ons\b',
    r'^rabobank\s+\w+$',
    r'^raethuys\s+\w+$',
    r'^milieuterrein\s+\w+$',
    r'^middelbaar\s+beroepsonderwijs$',
    r'^scheepswerf\s+\w+$',
    r'^schijnvliegveld\s+\w+$',
    r'^schutterij\s+\w+$',
    r'^schuttersgilde\s+\w+$',

    # Publications
    # (the second, verbatim copy of r'^verleden\s+tijdschrift$' that used to
    # close this group was dropped; the "Verleden" entry above covers it)
    r'^jaarboek(en)?\s+aover\s+diem$',
    r'^myerlese\s+koerier$',
    r'^nijmeegs\s+katern$',
    r'^old\s+ni-js(\s+edities)?$',
    r'^oud\s+(bestuursleden|neis|ommen)$',
    r'^reeuwijkse\s+(bronnen|reeks)$',
    r'^regio\s+(historie|nieuws)$',
    r'^roggels\s+blaadje$',
    r'^suetan\s+kwartaalbladen$',
    r'^tusken\s+de\s+marren$',

    # Activities
    r'^jeugd\s+en\s+onderwijs$',
    r'^jonge\s+(kunstenaars|stadsdichter)$',
    r'^jubileum\s+fietsroute$',
    r'^jumelage\s+\w+$',
    r'^kids\s+academy$',
    r'^kijk\s+en\s+beleef$',
    r'^katolieke\s+emancipatie$',  # misspelled variant; the correct spelling is listed further down
    r'^keur\s+van\s+grafstenen$',

    # Kaart
    r'^kaart\s+kernen$',

    # Molens/mills
    r'^molens\s+(in|loil)$',
    r'^molukse\s+(graven|muziek)$',

    # Management
    r'^management\s+team$',

    # Methode
    r'^methode\s+van\b',

    # Mierlo
    r'^mierlo\s+puzzel$',

    # Militaire
    r'^militaire\s+historie$',

    # Minder
    r'^minder\s+valide$',

    # Morgen
    r'^morgen\s+gesloten$',

    # Na WO II
    r'^na\s+wo\s+ii$',

    # Namenlijst
    r'^namenlijst\s+\w+$',

    # Natuur
    r'^natuurvriendelijke\s+oever$',

    # Nieuwe
    r'^nieuwe\s+(aanwinsten|zaak\s+starten)$',
    r'^nieuw\s+wachtwoord\s+aanvragen$',
    r'^nieuwegein\s+lokaal$',
    r'^nieuwjaarke\s+zingen$',
    r'^nieuwveense\s+landen$',

    # Notariele
    r'^notariele\s+archieven$',

    # Numaga
    r'^numaga\s+(excursies|jaarboek)$',

    # Oevers
    r'^oevers\s+\w+$',

    # Of zocht u
    r'^of\s+zocht\s+u$',

    # Oijen
    r'^oijen\s+en\s+teeffelen$',

    # Ommetje
    r'^ommetje\s+\w+$',

    # Omschrijving
    r'^omschrijving\s+van\b',

    # Ondernemen/ondernemers/onderwijs/onderzoek
    r'^ondernemen(d)?\s+(in|nijeveen)$',
    r'^ondernemers\s+kunnen\s+contact$',
    r'^onderwerpen\s+onderwerpen$',
    r'^onderwijs\s+en\s+jeugd$',
    r'^onderzoeksresultaten\s+\w+$',

    # Ook
    r'^ook\s+(aanwezig|gesloten)$',

    # Oorsprong
    r'^oorsprong\s+\w+$',

    # Op de
    r'^op\s+(de|het)\s+\w+$',

    # Openbare
    r'^openbare\s+inschrijving$',
    r'^openstelling\s+en\b',

    # Opgewekte
    r'^opgewekte\s+geschiedenissen$',

    # Opgraving
    r'^opgraving\s+\w+$',

    # Ozosnel
    r'^ozosnel\s+fandagen$',

    # Pagina
    r'^pagina\s+voor\b',

    # Pakje
    r'^pakje\s+kunst$',

    # Panorama
    r'^panorama\s+van\b',

    # Partner
    r'^partner\s+webshop$',

    # Pelt
    r'^pelt\s+als\s+architect$',

    # Per
    r'^per\s+(auto|boot)$',

    # Pers
    r'^pers\s+toolkit$',

    # Personenbestand
    r'^personenbestand\s+\w+$',

    # Persoonlijk
    r'^persoonlijk\s+contact$',

    # Plaatselijk
    r'^plaatselijk\s+belang\b',

    # Planten
    r'^planten\s+en\s+dieren$',

    # Poortinstructie
    r'^poortinstructie\s+voor\b',

    # Positieve
    r'^positieve\s+gezondheid$',

    # Praat
    r'^praat\s+mar\s+frysk$',

    # Rabo
    r'^rabo\s+clubsupport$',

    # Recht
    r'^recht\s+van\s+opstal$',

    # "Renee" is deliberately kept as a name; the entries below are section headers
    r'^rijwielvordering\s+wo\s+ii$',
    r'^rijnlands\s+vastgoed$',
    r'^rijnlandse\s+mascottes$',
    r'^rietwijk\s+of\s+reewijk$',
    r'^roggel\s+(leef|en\s+omgeving)$',
    r'^roggelse\s+verenigingen$',
    r'^rozet\s+voor\s+jou$',

    # Samenwerking
    r'^samenwerking\s+met$',

    # Schilderijen
    r'^schilderijen\s+kunstschilders$',

    # Spijkerserve
    r'^spijkerserve\s+\w+$',

    # Spoedeisende
    r'^spoedeisende\s+meldingen$',

    # Stadsregio
    r'^stadsregio\s+\w+$',
    r'^stadsontwikkeling\s+\w+$',

    # Stalpers
    r'^stalpers\s+opleidingen\b',

    # Taalbrigade
    r'^taalbrigade\s+kids$',

    # Specific organizations that are clearly not persons
    r'^heemskerker\s+ezels$',
    r'^jolly\s+duck$',
    r'^maria\s+kleuterschool$',
    r'^puttens\s+historisch\s+genootschap$',
    r'^waterlandsmuseum\s+de\s+speeltoren$',

    # Additional patterns found in second pass
    # (verbatim repeats of r'^schuttersgilde\s+\w+$' and r'^winkelwagen\s+\w+$'
    # were dropped from this group; both are already listed earlier)
    r'^foto\s+actief\s+media$',
    r'^gebiedsbeheerders\s+en\b',
    r'^het\s+(broederpad|jaarboek|museum)$',
    r'^historische\s+(wandeltocht|kaarten|organisaties|panden|routes|zusterverenigingen)$',
    r'^(infopunt|informatiepunt)\s+digitale\s+overheid$',
    r'^jouw\s+(bezoek|eigen\s+trein)$',
    r'^jaarlijkse\s+bijdrage$',
    r'^kasteel\s+oud\s+haarlem$',
    r'^kenniscentrum\s+\w+$',
    r'^kinderbibliotheek\s+korte\s+akkeren$',
    r'^kinderen\s+bij\b',
    r'^kom\s+in\s+contact$',
    r'^laatst\s+verschenen\s+editie$',
    r'^landgoed\s+(borg|de)\s+\w+$',
    r'^links\s+fotoalbums\b',
    r'^martinus\s+begraafplaats$',
    r'^medewerkers\s+zoeken$',
    r'^meer\s+over\s+(calamiteiten|ons)$',
    r'^meierijse\s+schoutsrekeningen$',
    r'^meld\s+overlast\s+anoniem$',
    r'^met\s+het\s+ov$',
    r'^meteen\s+een\s+reactie$',
    r'^molens\s+in\s+riel$',
    r'^molukse\s+graven\b',
    r'^naam\s+specialisme\s+e-mail$',
    r'^ondernemen\s+in\s+meppel$',
    r'^onderzoeksresultaten\s+digitale\b',
    r'^online\s+(afspraak|vraag)\s+\w+$',
    r'^ontdek\s+(de|jouw)\s+\w+$',
    r'^ontstaan\s+(oud\s+verlaat|van\s+roggel)$',
    r'^ontwikkeling\s+noord\b',
    r'^ook\s+(aanwezig|gesloten)\s+op$',
    r'^oude\s+kerkhof\b',
    r'^over\s+kunstenhuis\b',
    r'^over\s+batavialand\s+contact$',
    r'^over\s+museumpark\b',
    r'^overzicht\s+archeologische\b',
    r'^personenbestand\s+rode\s+boeken$',
    r'^polderdistrict\s+over-betuwe$',
    r'^poppodium\s+de\s+peppel$',
    r'^publicaties\s+hk\b',
    r'^raadhuizen\s+van\b',
    r'^raethuys\s+heemkundekring\b',
    r'^rondleiding\s+zeesluis\b',
    r'^route\s+(en\s+parkeren|per\s+auto)$',
    r'^schenk\s+een\s+bank$',
    r'^schijnvliegveld\s+de\s+kiek$',
    r'^schrijf\s+je\s+in$',
    r'^schutterij\s+de\s+\w+$',
    r'^senior\s+applicatiebeheerder\b',
    r'^sint\s+nicolaas$',
    r'^stadsregio\s+arnhem\b',
    r'^stoomtrein\s+goes-borsele$',
    r'^streekmuseum\s+jan\s+anderson$',
    r'^stuur\s+(een\s+terugbelverzoek|foto\s+digitaal)$',
    r'^supporter\s+cittaslow\b',
    r'^theater\s+(de|het)\s+\w+$',
    r'^tickets\s+en\s+tarieven$',
    r'^uitgaven\s+(frans|jos)\s+\w+$',
    r'^vanuit\s+venlo-eindhoven$',
    r'^veluws\s+schoon$',
    r'^verenigingsorgaan\s+de\s+bongard$',
    r'^voorwaarden\s+(en\s+condities|zonder\s+vergunning)$',
    r'^vraag\s+(en\s+aanbod|of\s+opmerking|afvalpas\s+aan)$',
    r'^wat\s+(doen|doet|we|wij)\s+\w*$',
    r'^werkplein\s+drentsche\b',
    r'^wikipedia\s+kasteel\b',
    r'^jumelage\s+den\s+dungen\b',
    r'^katholieke\s+emancipatie$',
    r'^klacht\s+windpark\b',
    r'^klachten\s+over\s+medewerkers$',
    r'^mariakapel\s+nieuw-dijk$',

    # Additional false positives found during cleanup
    r'^winkelwagen\s+\w(\s+\w)*$',  # "Winkelwagen A A" - shopping cart with single chars
    r'^bientien\s+over\b',  # Room name "Bientien Over zaal"
    r'^tonnie\s+en\s+kee\b',  # Show/puppet characters from webpage title
    r'^den\s+brouwer$',  # Place/building name from alt-text
    r'^schuttersgilde\s+[\w-]+$',  # Adds to the plain \w+ entry: also allows hyphens in guild names
    r'^schuttersvereniging\s+[\w-]+$',  # Shooting clubs
    r'\bop\s+gastenboek\b',  # Navigation text "op Gastenboek" (on guestbook)
    r'^rondleiding\s+\w+',  # Tour/rondleiding entries (not persons); no end anchor
    r'^de\s+laar$',  # Place name, not person
    r'^serie\s+droge\s+voeten$',  # Publication series name
    r'^aold\s+hoksebarge$',  # Dialect place/organization name (Tweants for "Oud Hoksebarge")
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 7: Famous Historical Figures (references, not contacts)
|
|
# =============================================================================
|
|
# Regex sources for famous painters whose names appear on pages as references
# rather than as contact persons. All entries match the complete value only.
HISTORICAL_FIGURE_PATTERNS = [
    r'^vincent\s+van\s+gogh$',
    r'^rembrandt(\s+van\s+rijn)?$',   # "Rembrandt" or "Rembrandt van Rijn"
    r'^johannes\s+vermeer$',
    r'^vermeer$',
]
|
|
|
|
# =============================================================================
|
|
# CATEGORY 8: Single-word false positives (exact matches)
|
|
# =============================================================================
|
|
# Lower-case words that are rejected when they equal the whole (stripped,
# lower-cased) name. Checked by exact set membership, not by regex.
SINGLE_WORD_FALSE_POSITIVES = {
    # Site chrome and account actions
    'admin', 'home', 'menu', 'help', 'info',
    'zoeken', 'search',
    'login', 'inloggen', 'registreren',
    # Content sections
    'contact', 'nieuws', 'agenda', 'kalender',
    'archief', 'collectie', 'beeldbank', 'bronnen', 'links',
    # Support / membership
    'partners', 'sponsors', 'doneren', 'lidmaatschap', 'vacatures',
    # Legal and site-meta pages
    'privacy', 'disclaimer', 'sitemap', 'colofon', 'cookies',
}
|
|
|
|
|
|
def compile_all_patterns() -> List[re.Pattern]:
    """Compile every regex category into one flat, ordered list.

    The categories are processed in a fixed order (URL/technical, navigation,
    form/button, section header, organization, phrase, historical figure) and
    each source string is compiled with ``re.IGNORECASE``.
    ``SINGLE_WORD_FALSE_POSITIVES`` is not part of the result; it is a plain
    set, not a list of regexes.

    Returns:
        One compiled pattern per source string, in category order.
    """
    pattern_groups = (
        URL_TECHNICAL_PATTERNS,
        NAVIGATION_PATTERNS,
        FORM_BUTTON_PATTERNS,
        SECTION_HEADER_PATTERNS,
        ORGANIZATION_PATTERNS,
        PHRASE_PATTERNS,
        HISTORICAL_FIGURE_PATTERNS,
    )

    compiled: List[re.Pattern] = []
    for group in pattern_groups:
        for source in group:
            compiled.append(re.compile(source, re.IGNORECASE))
    return compiled
|
|
|
|
|
|
def is_false_positive(name: str, patterns: List[re.Pattern]) -> Tuple[bool, str]:
    """
    Check if a name is a false positive.

    The checks run in the order below and the first one that hits decides
    the reason:

    1. ``empty_or_invalid`` - not a string, empty, or whitespace only.
    2. ``single_word_match`` - the stripped, lowercased name is a member of
       SINGLE_WORD_FALSE_POSITIVES.
    3. ``too_long`` - the stripped name has more than 80 characters.
    4. ``special_chars`` - more than two URL/path/ID style characters.
    5. ``pattern:<regex>`` - one of ``patterns`` matches anywhere in the
       stripped name; the reason carries the first 40 characters of the
       regex source.

    Args:
        name: Candidate person name; any non-string value is treated as
            invalid rather than raising.
        patterns: Compiled regexes to search the stripped name with.

    Returns: (is_false_positive: bool, reason: str); the reason is the empty
    string when the name is not a false positive.
    """
    if not isinstance(name, str):
        return True, "empty_or_invalid"

    name = name.strip()
    if not name:
        # Covers both the empty string and whitespace-only input: once
        # stripped there is nothing left that could be a person name.
        return True, "empty_or_invalid"

    name_lower = name.lower()

    # Check single-word exact matches
    if name_lower in SINGLE_WORD_FALSE_POSITIVES:
        return True, "single_word_match"

    # Check if name is too long (likely a sentence or URL)
    if len(name) > 80:
        return True, "too_long"

    # Check for excessive special characters (URLs, paths, IDs)
    special_chars = sum(1 for c in name if c in '/_\\=&?#@[]{}()<>|')
    if special_chars > 2:
        return True, "special_chars"

    # Check against all compiled patterns
    for pattern in patterns:
        if pattern.search(name):
            return True, f"pattern:{pattern.pattern[:40]}"

    return False, ""
|
|
|
|
|
|
def cleanup_file(filepath: Path, patterns: List[re.Pattern], dry_run: bool = False) -> dict:
    """Clean up false positives from a single YAML file.

    This is the same operation as cleanup_file_v2() without the list of kept
    names, so it delegates to that function instead of keeping a second copy
    of the read/filter/write logic.

    Args:
        filepath: YAML file to clean.
        patterns: Compiled regexes used to classify each person name.
        dry_run: When true, the file is never written.

    Returns:
        A dict with the keys ``removed`` (count), ``kept`` (count) and
        ``removed_entries`` (list of ``{'name', 'reason'}`` dicts).
    """
    stats = cleanup_file_v2(filepath, patterns, dry_run=dry_run)
    # Callers of this function only ever received the three keys above.
    stats.pop('kept_names', None)
    return stats
|
|
|
|
|
|
def cleanup_file_v2(filepath: Path, patterns: List[re.Pattern], dry_run: bool = False) -> dict:
    """Clean up false positives from a single YAML file.

    Optimized version that also returns kept names.

    Each entry of ``web_contact_data.persons`` is classified with
    is_false_positive(); flagged entries are dropped. The file is rewritten
    only when at least one entry was removed and ``dry_run`` is false, in
    which case ``cleanup_date`` (current UTC time, ISO format) and
    ``cleanup_v2_removed`` are stored next to the persons list.

    Args:
        filepath: YAML file to clean.
        patterns: Compiled regexes passed through to is_false_positive().
        dry_run: When true, the statistics are collected but nothing is
            written.

    Returns:
        A dict with ``removed`` and ``kept`` counts, ``removed_entries``
        (list of ``{'name', 'reason'}`` dicts) and ``kept_names``. All are
        zero/empty when the file cannot be read or holds no persons list.
    """
    stats = {
        'removed': 0,
        'kept': 0,
        'removed_entries': [],
        'kept_names': []
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Per-file boundary: report the problem and let the caller carry on
        # with the remaining files.
        print(f"Error reading {filepath}: {e}")
        return stats

    # An empty `web_contact_data:` key loads as None, and a document root or
    # persons value may be of an unexpected type; none of those hold a
    # persons list to clean, so they are skipped instead of raising.
    if not isinstance(data, dict):
        return stats

    contact_data = data.get('web_contact_data')
    if not isinstance(contact_data, dict):
        return stats

    persons = contact_data.get('persons')
    if not isinstance(persons, list) or not persons:
        return stats

    cleaned_persons = []

    for person in persons:
        name = person.get('name', '')
        is_fp, reason = is_false_positive(name, patterns)

        if is_fp:
            stats['removed'] += 1
            stats['removed_entries'].append({'name': name, 'reason': reason})
        else:
            cleaned_persons.append(person)
            stats['kept'] += 1
            stats['kept_names'].append(name)

    if stats['removed'] > 0 and not dry_run:
        contact_data['persons'] = cleaned_persons
        contact_data['cleanup_date'] = datetime.now(timezone.utc).isoformat()
        contact_data['cleanup_v2_removed'] = stats['removed']

        # Serialise before opening the file for writing: opening with 'w'
        # truncates immediately, so a failure inside yaml.dump would
        # otherwise leave an empty or partial file behind.
        serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Compiles the patterns, selects the files to process (a single ``--file``
    or the ``NL-*.yaml`` files of the custodian directory), runs
    cleanup_file_v2() on each and prints a summary. ``--dry-run`` reports
    without writing, ``--verbose`` lists removed entries and a breakdown of
    removal reasons, ``--show-kept`` lists the unique names that were kept.
    """
    import argparse
    import subprocess
    from collections import Counter

    parser = argparse.ArgumentParser(description='Clean up false positives from web_contact_data (v2)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be removed without making changes')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show details of removed entries')
    parser.add_argument('--file', type=str, help='Process a single file instead of all files')
    parser.add_argument('--show-kept', action='store_true', help='Show names that would be kept (for validation)')
    parser.add_argument('--custodian-dir', type=str, default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory containing the NL-*.yaml files (ignored when --file is given)')
    args = parser.parse_args()

    patterns = compile_all_patterns()
    print(f"Compiled {len(patterns)} patterns")

    custodian_dir = Path(args.custodian_dir)

    if args.file:
        files = [Path(args.file)]
    else:
        # Optimization: Only process files that have persons (role_type indicates person entries)
        print("Finding files with person entries...")
        candidates = sorted(custodian_dir.glob('NL-*.yaml'))
        # Fallback to all files unless grep narrows the list down.
        files = candidates
        # With no file arguments grep would read stdin and block, so it is
        # only invoked when there is something to search.
        if candidates:
            try:
                result = subprocess.run(
                    ['grep', '-l', 'role_type:', *[str(f) for f in candidates]],
                    capture_output=True, text=True, stdin=subprocess.DEVNULL
                )
            except OSError:
                # grep is not installed or the argument list is too long for
                # the OS; the pre-filter is only an optimization.
                result = None
            if result is not None and result.returncode == 0 and result.stdout.strip():
                files = sorted(Path(line) for line in result.stdout.splitlines() if line)
        print(f"Found {len(files)} files with person entries")

    total_removed = 0
    total_kept = 0
    files_modified = 0
    all_removed = []
    all_kept_names = []

    for i, filepath in enumerate(files, 1):
        if i % 50 == 0:
            print(f" Processing file {i}/{len(files)}...")

        stats = cleanup_file_v2(filepath, patterns, dry_run=args.dry_run)

        if stats['removed'] > 0:
            files_modified += 1
            total_removed += stats['removed']
            all_removed.extend(stats['removed_entries'])

            if args.verbose:
                print(f"\n{filepath.name}: removed {stats['removed']}, kept {stats['kept']}")
                for entry in stats['removed_entries']:
                    # Entries removed as "empty_or_invalid" can carry a
                    # non-string name (e.g. None), which cannot be sliced.
                    print(f" - [{entry['reason']}] {str(entry['name'])[:60]}")

        total_kept += stats['kept']

        # Collect kept names (no need to re-read file)
        if args.show_kept:
            all_kept_names.extend(stats['kept_names'])

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" Files processed: {len(files)}")
    print(f" Files modified: {files_modified}")
    print(f" Entries removed: {total_removed}")
    print(f" Entries kept: {total_kept}")

    if args.verbose and all_removed:
        print("\nRemoval reasons breakdown:")
        reason_counts = Counter(e['reason'] for e in all_removed)
        for reason, count in reason_counts.most_common(20):
            print(f" {count}x: {reason}")

    if args.show_kept:
        unique_kept = sorted(set(all_kept_names))
        print(f"\n--- Names that would be KEPT ({len(unique_kept)} unique) ---")
        for name in unique_kept:
            print(f" {name}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
|