715 lines
20 KiB
Python
715 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
Cleanup false positives from web_contact_data extraction.

This script removes entries that are clearly not person names:
- Navigation menu items
- Section headers
- Form labels
- URL fragments
- Technical strings (places/photos)
"""
|
|
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Regex sources that mark an extracted "name" as a false positive.
# They are compiled with re.IGNORECASE by compile_patterns() and applied with
# pattern.search(), so a pattern is only anchored where it says so (^ / $).
FALSE_POSITIVE_PATTERNS = [
    # Navigation and menu items
    r'^menu\s+schakelen$',
    r'^go\s+to\s+top$',
    r'^page\s+load\s+link$',
    r'^terug\s+naar\b',
    r'^ga\s+(verder|terug)',
    r'^volg\s+ons',
    r'^bel\s+ons',
    r'^stuur\s+een\s+(brief|bericht)',
    r'^klik\s+hier',
    r'^lees\s+meer',
    r'^bekijk\s+',
    r'^download\s+',
    r'^naar\s+de\s+',
    r'^meer\s+info',

    # Section headers and titles
    r'^laatste\s+nieuws',
    r'^over\s+(ons|deze)',
    r'^missie\s+en\s+visie',
    r'^het\s+bestuur$',
    r'^de\s+stichting$',
    r'^nieuws\b',
    r'^contact\b',
    r'^collectie[s]?\b',
    r'^beeldbank\b',
    r'^bronnen\b',
    r'^archief\b',

    # Place names and geographical references (not person names)
    r'^rondom\s+\w+',  # "Rondom Dalfsen" etc.
    r'^\w+\s+dialect$',  # "Dalfser Dialect" etc.
    r'^oet\s+dorp',  # "Oet Dorp en Marke"
    r'^provincie\s+',

    # Famous historical figures (likely references, not contacts)
    r'^vincent\s+van\s+gogh$',
    r'^sir\s+lawrence\s+alma-tadema$',
    r'^sientje\s+mesdag',
    r'^prof\.?\s+dr\.?',  # Academic titles often indicate references

    # Form labels and placeholders
    r'^typ\s+hier',
    r'^vul\s+in',
    r'^selecteer',
    r'^kies\s+',
    r'^zoek(en)?(\s+in)?',
    r'^tips\s+tijdens',
    r'^zo\s+werkt',

    # Technical/URL patterns
    r'^places/',
    r'^admin$',
    r'^http',
    r'^www\.',
    r'\.html$',
    r'\.php$',
    r'/photos/',

    # Common non-name phrases
    r'^altijd\s+ingeschakeld',
    r'^afspraak\s+studiezaal',
    r'^wandelroutes\b',
    r'^fietsroute[s]?\b',
    r'^vrijwilliger[s]?\s+gezocht',
    r'^vriend\s+(worden|van)',
    r'^steun\s+(het|de|ons)',
    r'^word[t]?\s+(lid|vriend|abonnee)',
    r'^sponsoren\b',
    r'^responsible\s+disclosure',
    r'^privacybeleid',
    r'^ondersteund\s+door',
    r'^met\s+dank\s+aan',
    r'^correspondentie\s+adres',
    r'^strikt\s+noodzakelijk',
    r'^statut(en|aire)',
    r'^primair\s+onderwijs',
    r'^secundair\s+onderwijs',
    r'^escape\s+spel',
    r'^canon\s+van',
    r'^dialect\b',
    r'^documentatie\b',
    r'^stamboom\b',
    r'^jaarverslag',
    r'^uitleg\b',
    r'^uitgebreid\b',
    r'^uitdrukkingen',
    r'^interactieve',
    r'^in\s+de\s+media',
    r'^acten\s+van',
    r'^gebouwen\s+en',
    r'^verkeer\s+en',
    r'^vaste\s+collectie',
    r'^gidsen\b',
    r'^schenken\s+en',
    r'^vacatures',
    r'^vestigingen',
    r'^parkeren',
    r'^met\s+de\s+auto',
    r'^openingstijden',
    r'^bereikbaarheid',
    r'^kaart\s+met',
    r'^to\s+the\s+museum',
    r'^zien\s+en\s+doen',
    r'^verhalen\b',
    r'^verkoop\b',
    r'^te\s+koop',
    r'^toekomst\s+voor',
    r'^samen\s+(werkt|staan)',
    r'^provinciale?\s+staten',
    r'^postadres',
    r'^prijsuitreiking',
    r'^plan\s+een\s+afspraak',
    r'^vraag\s+de\s+',
    r'^blijf\s+op\s+de\s+hoogte',

    # Social media references
    r'^facebook\b',
    r'^instagram\b',
    r'^twitter\b',
    r'^linkedin\b',
    r'^youtube\b',

    # Workgroups and organizations (not persons)
    r'^historische\s+werkgroep',
    r'^redactie\b',
    r'^studiezaal\b',

    # Museum/collection references
    r'^(rijks)?museum\b',
    r'^bibliotheek\b',
    r'openbaar\s+vervoer\s+museum',
    r'^rotterdams\b',

    # Cookies and technical
    r'cookies?$',

    # Additional false positives found in data
    r'^alle\s+contactgegevens',
    r'^inschrijven\s+nieuwsbrief',
    r'^ons\s+verhaal',
    r'^oude\s+nummers\s+bestellen',
    r'^aanvullende\s+verhalen',
    r'^de\s+\w+\s+app$',  # "De SHSEL App" etc.
    r'^het\s+\w+\s+wigbold',
    r'^marken\s+en\s+het',
    r'^begraafplaatsen\s+in',
    r'^historische\s+kring\b',
    r'^alles\s+inschakelen',
    r'^instellingen\s+opslaan',
    r'^commissaris\s+van',
    r'^gedeputeerde\s+staten',
    r'^burgemeester\s+en\s+wethouders',

    # Government website navigation
    r'^kwaliteit\s+openbaar',
    r'^bevoegdheden\s+van',
    r'^ook\s+interessant',
    r'^gepubliceerde\s+',
    r'^grond\s+en\s+gebouwen',
    r'^natuur\s+en\s+landschap',
    r'^loket\s+',
    r'^inspreken\s+in',
    r'^aanmeldformulier',
    r'^kom\s+inspreken',
    r'^over\s+het\s+',
    r'^benader\s+',
    r'^nodig\s+\w+\s+uit',
    r'^ga\s+stemmen',
    r'^overijssel\s+loket',
    r'^footer\s+',
    r'^bescherming\s+',
    r'^contactgegevens\s+',
    r'^bezoek\s+',
    r'^culturele\s+anbi',
    r'^digitale\s+toegankelijkheid',
    r'^\w+\s+navigatie$',  # "Footer navigatie" etc.
    r'^woordvoerders$',
    r'^statenleden\b',

    # More website navigation
    r'^informatie\s+over\b',
    r'^naar\s+het\s+museum',
    r'^digitale\s+collectie',
    r'^anfahrt\s+',  # German
    r'^opening\s+hours',  # English
    r'^alle\s+vacatures',
    r'^doe\s+een\s+melding',
    r'^ga\s+naar\s+de\s+',
    r'^in\s+een\s+bijeenkomst',
    r'^in\s+memoriam',
    r'^hoe\s+wij\s+',
    r'^overige\s+externe',
    r'^naar\s+inhoud',
    r'^naar\s+het\s+menu',
    r'^skip\s+to\b',
    r'^jump\s+to\b',

    # Forms and actions
    r'^aanmeld(en|ing)\b',
    r'^afmeld(en|ing)\b',
    r'^aanvra(gen|ag)\b',
    r'^reserv(eren|ering)',
    r'^meld(en|ing)\b',
    r'^delen\s+op\b',
    r'^accepteer\b',
    r'^instell(en|ingen)',
    r'^verzend(en)?\b',

    # Website sections and navigation continued
    r'^pers\s+en\s+',
    r'^oudere\s+berichten',
    r'^nieuwste\s+berichten',
    r'^berichten\s+archief',
    r'^handige\s+links',
    r'^veelgestelde\s+vragen',
    r'^huishoudelijk\s+reglement',
    r'^historisch\s+onderzoek',
    r'^museale\s+voorwerpen',
    r'^meer\s+contactgegevens',
    r'^kwetsbaarheid\s+melden',
    r'^opmerking\s+over',
    r'^opvang\s+',
    r'^oorlog\s+en\s+',
    r'^nieuwsflitsen\b',
    r'^informatie\s+plaatsen',
    r'^heemkundekring\s+en\b',
    r'^archiefstukken\b',
    r'^adres\s+(en\s+contact|heemkundekring)',
    r'^advies\s+en\s+contact',
    r'^contactformulier\b',
    r'^contactpagina\b',
    r'^onthouden\s+inloggen',
    r'^zelf\s+(regelen|instellen)',
    r'^zorgen?\s+voor\b',
    r'^zorg\s+(en|voor)',
    r'^alle\s+veelgestelde',

    # Provincial/municipal website patterns
    r'^zwem\s+veilig',
    r'^zuiver(en|fabriek)',
    r'^vaarvergunning',
    r'^afvalpas\b',
    r'^overlast\s+melden',
    r'^oude\s+uitspraken',

    # Organization names (not persons)
    r'^historische\s+vereniging\b',
    r'^heemkundige\s+kring\b',
    r'^stichting\b',
    r'^adviesraad\b',
    r'archief$',
    r'^regionaal\s+archief',
    r'^gemeentearchief\b',
    r'^stadsarchief\b',

    # Magazine/publication names
    r'^magazine\b',
    r'^tijdschrift\b',
    r'^nieuwsbrief\b',
    r'^bulletin\b',
    r'\bmagazine$',
    r'\bnieuwsbrief$',

    # Place/attraction names
    r'^attracties\b',
    r'^atlas\b',
    r'\bwandelroute$',
    r'\bwindmotor$',
    r'\bhelden$',
    r'\bkookboek',

    # Article/content references
    r'^artikelen\b',
    r'^diverse\s+artikelen',
    r'^overige\s+artikelen',
    r'^foto\s+archief',

    # Button/form text
    r'^afwijzen\s+',
    r'^akkoord\b',

    # Organizational things
    r'^agrarisch\b',

    # More website content patterns
    r'^archieven\s+voormalige',
    r'^automatische\s+incasso',
    r'^bagger(en)?\b',
    r'^bedevaart\b',
    r'^beeld\s+en\s+geluid',
    r'^beelden\s+uit\b',
    r'^beeldende\s+kunst',
    r'^bekendmakingen\b',
    r'^belastingen\b',
    r'^beleef\b',
    r'^beleid\b',
    r'^bellen\b',
    r'^bemmel\s+in\b',
    r'^bemmels\b',
    r'^bemmelse\b',
    r'^beschermde?\b',
    r'^beschrijving\b',
    r'^besluit\b',
    r'^beste\s+(bezoekers|lezers)',
    r'^bestel\b',
    r'^bestelling\b',
    r'^betaald\b',
    r'^beveiligen\b',
    r'^bezoekadres\b',
    r'^bezwaar\b',
    r'^bibliotheekabonnement\b',
    r'^bidprentjes\b',
    r'als\s+ANBI$',
    r'\s+in\s+(oorlog|prenten)$',
    r'\s+volkslied$',
    r'\s+uiterwaard$',
    r'^incasso\b',
    r'collectie$',

    # Even more website content patterns
    r'^bijlage\b',
    r'^bijzondere\b',
    r'^bloemrijke\b',
    r'^boek\s+(een|je|uw|gevelstenen)',
    r'^boeken\s+(bestellen|en|over)',
    r'^boekentips\b',
    r'^boerderijen\b',
    r'^bombardement\b',
    r'^botanisch\b',
    r'^boter\b',
    r'^bouwen\b',
    r'^brieven\b',
    r'^burgerlijke\b',
    r'^carnaval\b',
    r'^chaamse\b',
    r'^communicatie\b',
    r'^complete\s+agenda',
    r'^controle\b',
    r'^cookiebar\b',
    r'^correcties\b',
    r'^criminaliteit\b',
    r'^cultuur\b',
    r'^cursusbureau\b',
    r'^cursussen\b',
    r'^datum\b',
    r'^de\s+(bedrijven|bewoners|blauwe|gemeente|laatste|mangel|naam|online|oprichting)',
    r'\s+en\s+(DVD|kranten|veldnamen|vergunningen|PR|aanvullingen|ondermijning|recreatie|sport|workshops|correspondentie)$',
    r'\s+in\s+(alphen|chaam|midden)$',
    r'\s+erfgoed$',
    r'^hoen$',

    # Two-word false positives (First word is a common word, not a first name)
    r'^alle\s+',
    r'^albert\s+\w+prijs',  # Awards
    r'^bekende\b',
    r'^bedrijfsinformatie\b',
    r'^belastingsamenwerking\b',
    r'^beleven\b',
    r'^berghse\b',
    r'^bloeizone\b',
    r'^boerderij\b',
    r'\bkroniek$',
    r'\bcanon$',
    r'^avereester\b',
    r'^bakker\s+brandts',  # Historical name reference

    # More false positive patterns (De/Het/Een + noun)
    r'^de\s+(pareltjes|sterren|tuin|vereniging|werkgroepen)',
    r'^deel\s+(deze|link)',
    r'^delen\b',
    r'^didamse\b',
    r'^dien\s+(bezwaar|klacht|melding|verzoek)',
    r'^digitaal\b',
    r'^digitale\b',
    r'^direct\s+inschrijven',
    r'^documenten\b',
    r'^doe\s+mee',
    r'^doelstellingen\b',
    r'^donaties\b',
    r'^doneer\b',
    r'^doneren\b',
    r'^dorps',
    r'^downloads\b',
    r'^een\s+(greep|melding|sloot|werkplek)',
    r'^eerdere\b',
    r'^eigendommen\b',
    r'^einde\b',
    r'^elke\s+(dinsdagmiddag|woensdag)',
    r'^en\s+voiture',
    r'^english\b',
    r'^ereleden\b',
    r'^escape\b',
    r'^eten\b',
    r'^eventueel\b',
    r'^excursie\b',
    r'^exotische\b',
    r'^externe\b',
    r'^extra\s+gegevens',
    r'^familieber',
    r'^families\b',
    r'^feestelijke\b',
    r'^dansschool\b',

    # More patterns (F-H)
    r'^feesten\b',
    r'^filmpjes\b',
    r'^financiele\b',
    r'^fiscaal\b',
    r'^follow\b',
    r'^foto\s+(album|en|herkenning|inzenden)',
    r'^fotografie\b',
    r'^ga\s+naar',
    r'^gebiedsbeheerders\b',
    r'^geboorte\b',
    r'^geef\s+je',
    r'^gegevens\s+wijzigen',
    r'^gemakkelijk\b',
    r'^gemeentelijke\b',
    r'^genealogi',
    r'^genieten\b',
    r'^gesloten\b',
    r'^gesteunde\b',
    r'^gevelstenen\b',
    r'^geveltjes\b',
    r'^gezocht\b',
    r'^gezonde\b',
    r'^global\b',
    r'^grafvondst\b',
    r'^gratis\b',
    r'^grensoverschrijdend\b',
    r'^groen\s+en\b',
    r'^groenblauw\b',
    r'^groepen\b',
    r'^grote\s+water',
    r'^handhavingsverzoek\b',
    r'^handige\b',
    r'^het\s+(geheim|kantoor|museum|olieslaan|ontstaan)',
    r'^historische\s+(avond|fietsroute|geografie|groenten|kaarten)',
]
|
|
|
|
# Exact matches (case-insensitive): entries are stored in lowercase and
# compared against the lowercased, stripped name.
FALSE_POSITIVE_EXACT = {
    'admin',
    'contact',
    'home',
    'menu',
    'zoeken',
    'search',
    'login',
    'inloggen',
    'registreren',
    'aanmelden',
    'afmelden',
    'help',
    'info',
    'nieuws',
    'agenda',
    'kalender',
    'archief',
    'collectie',
    'beeldbank',
    'bronnen',
    'links',
    'partners',
    'sponsors',
    'doneren',
    'lidmaatschap',
    'privacy',
    'disclaimer',
    'sitemap',
    'colofon',
    'rembrandt',
    'vermeer',
    'back to top',
    'admin login',
    'log in',
}
|
|
|
|
# Names that contain any of these substrings are false positives.
# The comparison is case-sensitive (plain `substring in name`).
FALSE_POSITIVE_CONTAINS = [
    'Menu schakelen',
    'WordPress',
    'ChIJ',  # Google Place IDs
    'AWn5SU',  # Google photo IDs
    'photos/',
    'places/',
]
|
|
|
|
|
|
def compile_patterns() -> list:
    """Compile every entry of FALSE_POSITIVE_PATTERNS once, case-insensitively.

    Returns:
        A list of compiled regex objects, in the same order as
        FALSE_POSITIVE_PATTERNS, suitable for passing to is_false_positive().
    """
    return [re.compile(p, re.IGNORECASE) for p in FALSE_POSITIVE_PATTERNS]
|
|
|
|
|
|
# Lookup tables for is_false_positive(), built once at import time instead of
# on every call.

# "Firstname [van] [de|den|der|ter|ten|'t] Lastname[-Lastname]" in plain ASCII.
_DUTCH_NAME_RE = re.compile(r'^[A-Z][a-z]+\s+(van\s+)?(de\s+|den\s+|der\s+|ter\s+|ten\s+|\'t\s+)?[A-Z][a-z]+(-[A-Z][a-z]+)?$')

# Surname particles that do not count towards the word limit of a name.
_NAME_PARTICLES = frozenset({'van', 'de', 'den', 'der', 'ter', 'ten', "'t", 'het'})

# Characters that are typical for URLs, paths and markup rather than names.
_URL_LIKE_CHARS = frozenset('/_\\=&?#@[]{}()<>')

# Prepositions, articles, etc. that indicate a phrase (matched as substrings
# of the lowercased name, including the surrounding spaces).
_PHRASE_INDICATORS = (
    ' en ', ' of ', ' voor ', ' met ', ' naar ', ' bij ', ' uit ', ' over ',
    ' tijdens ', ' door ', ' aan ', ' tot ', ' vanaf ', ' binnen ',
    ' op de ', ' in de ', ' van de ', ' het ', ' een ', ' je ', ' uw ',
    ' deze ', ' die ', ' dat ', ' ons ', ' onze ',
)

# Dutch words that are not first names when they open the string.
_NON_NAME_START_WORDS = frozenset({
    'alle', 'andere', 'bekende', 'bekijk', 'beste', 'binnen', 'buiten',
    'complete', 'diverse', 'eigen', 'elk', 'elke', 'enkele', 'extra',
    'feestelijke', 'financiele', 'fiscaal', 'foto', 'ga', 'geen', 'geef',
    'gemeentelijke', 'gesloten', 'gezocht', 'gratis', 'grote', 'handige',
    'het', 'historische', 'hoe', 'huidige', 'juridische', 'kleine', 'kom',
    'lees', 'meer', 'meeste', 'meld', 'mijn', 'nieuwe', 'officieel', 'online',
    'onze', 'open', 'openbare', 'overige', 'primaire', 'recente', 'speciale',
    'standaard', 'totale', 'twee', 'uw', 'veel', 'verdere', 'via', 'volle',
    'voormalige', 'welke', 'wie', 'wij', 'wilt', 'zonder', 'zoek',
})

# Non-name words; matched as SUBSTRINGS of the lowercased name, so compound
# words such as "Streekmuseum" are caught as well.
_NON_NAME_WORDS = (
    'gemeente', 'archief', 'museum', 'bibliotheek', 'kunst', 'beleid',
    'bestuur', 'website', 'formulier', 'informatie', 'nieuws', 'contact',
    'service', 'dienst', 'afdeling', 'kantoor', 'locatie', 'adres', 'bezoek',
    'online', 'digitaal', 'collectie', 'erfgoed', 'aanvragen', 'bekendmaking',
    'vergunning', 'subsidie', 'regeling', 'document', 'download', 'pagina',
    'link', 'kaart', 'route', 'wandeling', 'fietsroute', 'excursie', 'rondleiding',
    'workshop', 'cursus', 'lezing', 'presentatie', 'bijeenkomst', 'vergadering',
    'activiteit', 'evenement', 'agenda', 'kalender', 'programma', 'overzicht',
    'lijst', 'tabel', 'grafiek', 'statistiek', 'rapport', 'verslag', 'jaarverslag',
    'nieuwsbrief', 'magazine', 'tijdschrift', 'bulletin', 'krant', 'artikel',
    'foto', 'video', 'film', 'audio', 'podcast', 'webinar', 'livestream',
)


def is_false_positive(name: str, patterns: list) -> bool:
    """Decide whether an extracted "name" is not a person name.

    The checks run in a fixed order and the first one that fires wins:
    exact matches, substrings, regex patterns, length, URL-like characters,
    word count, lowercase start, the typical-Dutch-name shortcut (which
    returns False), phrase indicators, non-name first words and finally
    non-name words anywhere in the string.

    Args:
        name: The candidate name. Anything that is not a non-blank string is
            treated as a false positive.
        patterns: Compiled regex objects (see compile_patterns()); each is
            applied with ``search``.

    Returns:
        True when the entry should be removed, False when it looks like a
        person name.
    """
    if not name or not isinstance(name, str):
        return True

    name = name.strip()
    if not name:
        # Whitespace-only input: nothing left to inspect. Without this guard
        # the `name[0]` check further down raised IndexError.
        return True

    name_lower = name.lower()

    # Check exact matches
    if name_lower in FALSE_POSITIVE_EXACT:
        return True

    # Check contains patterns (case-sensitive)
    if any(substring in name for substring in FALSE_POSITIVE_CONTAINS):
        return True

    # Check regex patterns
    if any(pattern.search(name) for pattern in patterns):
        return True

    # Check if name is too long (likely a URL or path)
    if len(name) > 100:
        return True

    # Check if name contains too many special characters
    if sum(1 for c in name if c in _URL_LIKE_CHARS) > 2:
        return True

    # Multi-word phrases (4+ words) are rarely person names, unless the extra
    # words are surname particles: "Jan van der Berg" has only two
    # non-particle words and is allowed through.
    words = name.split()
    if len(words) >= 4:
        non_particle_words = [w for w in words if w.lower() not in _NAME_PARTICLES]
        if len(non_particle_words) >= 3:
            return True

    # Names starting with lowercase are likely false positives
    if name[0].islower():
        return True

    # If name matches the typical Dutch name pattern, it's likely valid.
    # This deliberately short-circuits the phrase/word checks below.
    if _DUTCH_NAME_RE.match(name):
        return False

    # Check for phrase indicators (prepositions, articles, etc.)
    if any(indicator in name_lower for indicator in _PHRASE_INDICATORS):
        return True

    # Non-name starting words (Dutch)
    lowered_words = name_lower.split()
    first_word = lowered_words[0] if lowered_words else ''
    if first_word in _NON_NAME_START_WORDS:
        return True

    # Non-name words anywhere in the name (substring match)
    return any(word in name_lower for word in _NON_NAME_WORDS)
|
|
|
|
|
|
def cleanup_file(filepath: Path, patterns: list, dry_run: bool = False) -> dict:
    """Clean up false positives from a single YAML file.

    Loads the file, filters ``web_contact_data.persons`` through
    is_false_positive() and, unless ``dry_run`` is set, writes the file back
    with the filtered list plus ``cleanup_date`` and ``cleanup_removed``
    bookkeeping fields. The file is only rewritten when at least one entry
    was removed.

    Args:
        filepath: YAML file to process.
        patterns: Compiled regex objects from compile_patterns().
        dry_run: When True, only compute the statistics; never write.

    Returns:
        A dict with ``removed`` (int), ``kept`` (int) and ``removed_names``
        (the ``name`` values of the removed entries, as found in the file).
        All counters stay at zero when the file cannot be read or has no
        usable ``web_contact_data.persons`` list.
    """
    stats = {
        'removed': 0,
        'kept': 0,
        'removed_names': [],
    }

    # Best-effort on purpose: one unreadable file must not abort a batch run.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return stats

    # Tolerate files whose structure is not what we expect (empty documents,
    # top-level lists/scalars, `web_contact_data:` or `persons:` left empty).
    if not isinstance(data, dict):
        return stats
    contact_data = data.get('web_contact_data')
    if not isinstance(contact_data, dict):
        return stats
    persons = contact_data.get('persons')
    if not isinstance(persons, list) or not persons:
        return stats

    cleaned_persons = []
    for person in persons:
        # Entries are normally mappings with a `name` key; a bare scalar
        # entry is judged as if it were the name itself.
        name = person.get('name', '') if isinstance(person, dict) else person
        if is_false_positive(name, patterns):
            stats['removed'] += 1
            stats['removed_names'].append(name)
        else:
            cleaned_persons.append(person)
            stats['kept'] += 1

    if stats['removed'] > 0 and not dry_run:
        contact_data['persons'] = cleaned_persons
        contact_data['cleanup_date'] = datetime.now(timezone.utc).isoformat()
        # `or 0` also covers an existing `cleanup_removed: null`.
        previously_removed = contact_data.get('cleanup_removed') or 0
        contact_data['cleanup_removed'] = previously_removed + stats['removed']

        # Serialize BEFORE opening the file for writing: opening with 'w'
        # truncates immediately, so a failure inside yaml.dump would
        # otherwise leave an empty or half-written file behind.
        serialized = yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: clean one file or every NL-*.yaml in a directory.

    Options:
        --dry-run      report what would be removed without writing anything
        --verbose/-v   list removed entries per file and the most common ones
        --file PATH    process a single file instead of the whole directory
        --dir PATH     directory to scan for NL-*.yaml files (defaults to the
                       previously hard-coded custodian directory)
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description='Clean up false positives from web_contact_data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be removed without making changes')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show details of removed entries')
    parser.add_argument('--file', type=str, help='Process a single file instead of all files')
    parser.add_argument(
        '--dir',
        type=str,
        default='/Users/kempersc/apps/glam/data/custodian',
        help='Directory containing the NL-*.yaml files (ignored when --file is given)',
    )
    args = parser.parse_args()

    patterns = compile_patterns()

    custodian_dir = Path(args.dir)

    if args.file:
        files = [Path(args.file)]
    else:
        # Sorted so that runs (and their verbose output) are reproducible.
        files = sorted(custodian_dir.glob('NL-*.yaml'))

    total_removed = 0
    total_kept = 0
    files_modified = 0
    all_removed_names = []

    for filepath in files:
        stats = cleanup_file(filepath, patterns, dry_run=args.dry_run)

        if stats['removed'] > 0:
            files_modified += 1
            total_removed += stats['removed']

            # Removed entries may carry a missing or non-string name (that is
            # one of the reasons they are removed); make them printable and
            # hashable before slicing or counting them.
            removed_names = [
                name if isinstance(name, str) else repr(name)
                for name in stats['removed_names']
            ]
            all_removed_names.extend(removed_names)

            if args.verbose:
                print(f"\n{filepath.name}: removed {stats['removed']}, kept {stats['kept']}")
                for name in removed_names:
                    # Only mark names that were actually cut off.
                    shown = name if len(name) <= 80 else f"{name[:80]}..."
                    print(f" - {shown}")

        total_kept += stats['kept']

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" Files processed: {len(files)}")
    print(f" Files modified: {files_modified}")
    print(f" Entries removed: {total_removed}")
    print(f" Entries kept: {total_kept}")

    if args.verbose and all_removed_names:
        print("\nMost common removed entries:")
        for name, count in Counter(all_removed_names).most_common(30):
            print(f" {count}x: {name[:60]}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|