#!/usr/bin/env python3
"""
Scrape KIEN profile pages to extract location/address information.

KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) profile pages often contain
contact information including addresses. This script scrapes those pages to
find location data for KIEN entries that don't have locations yet.

The KIEN profile pages are at:
https://www.immaterieelerfgoed.nl/nl/page/{id}/{slug}

This script:
1. Finds all KIEN entries without locations
2. Scrapes their KIEN profile pages
3. Extracts address/location information
4. Updates the entry files with the discovered locations
"""
import argparse
import re
import sqlite3
import time
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from urllib.parse import urlparse

# Paths
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# Reverse mapping - admin1 code to province code
# Maps GeoNames NL admin1 codes to two-letter Dutch province abbreviations;
# lookup_city_in_geonames() falls back to 'XX' for unmapped codes.
ADMIN1_TO_PROVINCE = {
    '01': 'DR',
    '02': 'FR',
    '03': 'GE',
    '04': 'GR',
    '05': 'LI',
    '06': 'NB',
    '07': 'NH',
    '09': 'UT',
    '10': 'ZE',
    '11': 'ZH',
    '15': 'OV',
    '16': 'FL',
}


def get_kien_entries_without_locations() -> List[Dict[str, Any]]:
    """Find all KIEN entries that don't have location data.

    Scans entry YAML files under ENTRIES_DIR, keeps only entries that look
    KIEN-sourced, and returns those lacking both a ``locations`` list and a
    current GHCID.

    Returns:
        List of dicts with keys 'path', 'entry_index', 'org_name',
        'kien_url', 'website', and the full parsed 'entry'.
    """
    entries: List[Dict[str, Any]] = []

    # KIEN entries are in the 17xx and 18xx range
    for pattern in ['17*.yaml', '18*.yaml']:
        for entry_path in ENTRIES_DIR.glob(pattern):
            with open(entry_path, 'r', encoding='utf-8') as f:
                try:
                    entry = yaml.safe_load(f)
                except yaml.YAMLError:
                    # Unparseable file: skip silently rather than abort the scan.
                    continue

            # Check if it's a KIEN entry
            if not entry:
                continue

            provenance = entry.get('provenance', {})
            sources = provenance.get('sources', {})
            # KIEN provenance is signalled either by a 'kien' source key or by
            # the original entry's 'systeem' field mentioning KIEN / its domain.
            is_kien = 'kien' in sources or any(
                'kien' in str(s).lower() or 'immaterieelerfgoed' in str(s).lower()
                for s in [entry.get('original_entry', {}).get('systeem', '')]
            )
            if not is_kien:
                continue

            # Check if already has locations or GHCID
            has_location = bool(entry.get('locations'))
            has_ghcid = bool(entry.get('ghcid', {}).get('ghcid_current'))

            if not has_location and not has_ghcid:
                kien_url = entry.get('kien_enrichment', {}).get('kien_url', '')
                org_name = entry.get('original_entry', {}).get('organisatie', '')
                # Prefer the curated contact website; fall back to the raw
                # original-entry web address.
                website = entry.get('contact', {}).get('website') or entry.get('original_entry', {}).get('webadres_organisatie')
                entries.append({
                    'path': entry_path,
                    'entry_index': entry.get('entry_index'),
                    'org_name': org_name,
                    'kien_url': kien_url,
                    'website': website,
                    'entry': entry,
                })

    return entries


def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames database.

    Case-insensitive exact match on name or ascii_name, restricted to
    populated-place feature codes; when several rows match, the most
    populous one wins.

    NOTE(review): this opens and closes a fresh sqlite connection per call,
    and extract_address_from_html() calls it once per candidate string —
    consider reusing one connection if this ever becomes a bottleneck.

    Returns:
        Dict of GeoNames fields plus a derived 'region_code', or None when
        the DB is missing or no row matches.
    """
    if not GEONAMES_DB.exists():
        return None

    conn = sqlite3.connect(GEONAMES_DB)
    cursor = conn.cursor()

    # Try exact match first
    cursor.execute("""
        SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude, population, feature_code
        FROM cities
        WHERE country_code = ?
        AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
    """, (country_code, city_name, city_name))

    row = cursor.fetchone()
    conn.close()

    if row:
        return {
            'geonames_id': row[0],
            'name': row[1],
            'ascii_name': row[2],
            'admin1_code': row[3],
            'latitude': row[4],
            'longitude': row[5],
            'population': row[6],
            'feature_code': row[7],
            'region_code': ADMIN1_TO_PROVINCE.get(row[3], 'XX'),
        }
    return None


def extract_address_from_html(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract address information from KIEN profile page HTML.

    The KIEN profile pages have a specific structure:
    - Organization contact info is in a "Contact" section near the top (under the avatar)
    - KIEN's own contact info is in the footer (should be ignored)

    The organization's location often appears as just a city name in all caps
    (e.g., "KAMPEN") in the contact section.

    Candidates are gathered by two heuristics (org "Contact" section, then
    page text before the footer) and validated against GeoNames; the first
    candidate that resolves to a Dutch city wins, so candidate order matters.

    Returns dict with extracted address components or None.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Collect address candidates (excluding footer)
    address_candidates: List[str] = []

    # Strategy 1: Find the organization's contact section (not footer)
    # The page structure has an avatar image, then h1 with contact name, then h2 with org name
    # Below that is the contact info with city in caps
    # Look for content before the footer
    # The footer contains KIEN's address "Postbus 649, 6800 AP Arnhem"

    # Find all h3 elements with "Contact" heading - there are two: one for org, one for KIEN
    contact_headings = [h for h in soup.find_all('h3') if h.get_text(strip=True) == 'Contact']

    # The first "Contact" section is the organization's
    if contact_headings:
        org_contact = contact_headings[0]
        # Get the parent/siblings that contain the actual contact info
        parent = org_contact.parent
        if parent:
            # Get text from this section
            section_text = parent.get_text(separator='\n', strip=True)
            # Stop at the next section or at "Kenniscentrum" (KIEN's address marker)
            lines = section_text.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Skip if it's KIEN's address
                if 'Kenniscentrum' in line or 'Postbus' in line or '6800 AP' in line:
                    break
                # City names can be in ALL CAPS or Title Case
                # Accept lines that are 3-50 chars and look like place names
                if len(line) > 2 and len(line) < 50:
                    # All caps city (e.g., "KAMPEN")
                    if line.isupper():
                        address_candidates.append(line)
                    # Title case city (e.g., "Hattem", "Den Haag")
                    elif re.match(r'^[A-Z][a-z]+(?:\s+[a-z]+)*$|^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line):
                        # Single word capitalized or multi-word with each word capitalized
                        address_candidates.append(line)
                    # Or postal code + city
                    if re.match(r'\d{4}\s?[A-Z]{2}', line):
                        address_candidates.append(line)

    # Strategy 2: Look for city names near the top of the page
    # (but not in footer/navigation)
    main_content = soup.get_text()
    # Split at "Kenniscentrum Immaterieel Erfgoed" which marks the footer
    parts = main_content.split('Kenniscentrum Immaterieel Erfgoed')
    # NOTE(review): str.split always returns a non-empty list, so this
    # condition is always true; kept as-is (harmless).
    if parts:
        org_section = parts[0]
        # Look for all-caps words that might be city names (3+ chars)
        caps_words = re.findall(r'\b([A-Z]{3,}(?:\s+[A-Z]+)?)\b', org_section)
        for word in caps_words:
            # Skip common non-city words
            if word in ['CONTACT', 'MEDIA', 'IMMATERIEEL', 'ERFGOED', 'WEBSITE', 'NEDERLAND', 'HOME', 'OVER', 'NIEUWS', 'AGENDA', 'ENGLISH', 'DUTCH', 'MENU']:
                continue
            address_candidates.append(word)

        # Also look for title-case words that could be Dutch cities
        # Pattern: Words starting with capital, potentially with lowercase following
        # Look for standalone words that might be city names
        title_words = re.findall(r'\b([A-Z][a-z]{2,}(?:[\s\-][A-Za-z]+)?)\b', org_section)
        for word in title_words:
            # Skip common non-city words
            # NOTE(review): this list is rebuilt every iteration; hoisting it
            # (or making it a module-level frozenset) would be cheaper.
            skip_words = ['Contact', 'Media', 'Website', 'Email', 'Telefoon', 'Facebook', 'Twitter', 'Instagram', 'YouTube', 'LinkedIn', 'Home', 'Over', 'Nieuws', 'Agenda', 'English', 'Dutch', 'Menu', 'Zoeken', 'Stichting', 'Vereniging', 'Nederland', 'Nederlands', 'Meer']
            if word in skip_words:
                continue
            # Add to candidates
            address_candidates.append(word)

    # Try to validate each candidate as a Dutch city
    for candidate in address_candidates:
        # Handle postal code + city pattern
        match = re.search(r'(\d{4}\s?[A-Z]{2})\s+([A-Za-z][A-Za-z\-\'\s]+)', candidate)
        if match:
            postal_code = match.group(1).replace(' ', '')
            city = match.group(2).strip().title()  # Convert to title case
        else:
            # Single city name (possibly in caps)
            postal_code = None
            city = candidate.strip().title()  # Convert KAMPEN to Kampen

        # Clean up city name
        city = re.sub(r'\s+', ' ', city)
        city = city.split(',')[0].strip()

        # Skip if too short
        if len(city) < 3:
            continue

        # Validate city in GeoNames
        geonames = lookup_city_in_geonames(city)
        if geonames:
            result: Dict[str, Any] = {
                'city': geonames['name'],
                'country': 'NL',
                'latitude': geonames['latitude'],
                'longitude': geonames['longitude'],
                'geonames_id': geonames['geonames_id'],
                'region_code': geonames['region_code'],
                'extraction_method': 'KIEN_PROFILE_CONTACT_SECTION',
            }
            if postal_code:
                result['postal_code'] = postal_code
            return result

    return None


def scrape_kien_profile(url: str, page: Any) -> Optional[str]:
    """
    Scrape a KIEN profile page using Playwright.

    Args:
        url: KIEN profile page URL.
        page: an open Playwright Page object (reused across calls).

    Returns HTML content or None on failure.
    """
    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)  # Wait for any dynamic content
        html = page.content()
        return html
    except Exception as e:
        # Distinguish Playwright timeouts (by exception class name, so we
        # don't need a playwright import here) from other failures.
        if 'Timeout' in str(type(e).__name__):
            print(f" Timeout for {url}")
        else:
            print(f" Error scraping {url}: {e}")
        return None


def scrape_org_website(url: str, page: Any) -> Optional[str]:
    """
    Scrape an organization's own website for contact/address info.

    Loads the landing page, then follows the first link whose href or text
    suggests a contact/about page ('contact', 'over', 'about', 'adres',
    'locatie') and returns that page's HTML; otherwise returns the landing
    page HTML.

    Returns HTML content or None on failure.
    """
    if not url or url == 'null':
        return None

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None

    # Normalize URL
    if not url.startswith('http'):
        url = 'https://' + url

    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)

        # Try to find contact/about page
        html = page.content()
        soup = BeautifulSoup(html, 'html.parser')

        # Look for contact links
        contact_links: List[str] = []
        for link in soup.find_all('a', href=True):
            href_attr = link.get('href')
            if href_attr:
                href = str(href_attr).lower()
                text = link.get_text(strip=True).lower()
                if any(term in href or term in text for term in ['contact', 'over', 'about', 'adres', 'locatie']):
                    contact_links.append(str(href_attr))

        # If we find contact page, navigate there
        if contact_links:
            for contact_href in contact_links[:1]:  # Try first contact link
                if not contact_href.startswith('http'):
                    # Make relative URL absolute
                    parsed = urlparse(url)
                    if contact_href.startswith('/'):
                        contact_href = f"{parsed.scheme}://{parsed.netloc}{contact_href}"
                    else:
                        contact_href = f"{url.rstrip('/')}/{contact_href}"
                try:
                    page.goto(contact_href, wait_until='networkidle', timeout=20000)
                    time.sleep(1)
                    return page.content()
                except Exception:
                    # Best-effort: fall back to the landing-page HTML below.
                    pass

        return html
    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None


def update_entry_with_location(entry_path: Path, location: Dict[str, Any], extraction_method: str, source_url: str, dry_run: bool = True) -> bool:
    """Update an entry file with discovered location.

    Re-reads the YAML entry, and (unless dry_run) writes back 'locations',
    'location_resolution', and a provenance note recording the extraction.

    Args:
        entry_path: path to the entry YAML file.
        location: dict as returned by extract_address_from_html().
        extraction_method: method label stored in the resolution metadata.
        source_url: page the location was extracted from.
        dry_run: when True (the default), compute but do not write.

    Returns:
        Always True (the return value is currently unused by callers).
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # Create location structure
    location_data: Dict[str, Any] = {
        'city': location['city'],
        'country': 'NL',
        'latitude': location.get('latitude'),
        'longitude': location.get('longitude'),
    }
    if 'postal_code' in location:
        location_data['postal_code'] = location['postal_code']

    # Create resolution metadata
    resolution = {
        'method': extraction_method,
        'source_url': source_url,
        'geonames_id': location.get('geonames_id'),
        'geonames_name': location.get('city'),
        'region_code': location.get('region_code'),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
    }

    if not dry_run:
        entry['locations'] = [location_data]
        entry['location_resolution'] = resolution

        # Add provenance note
        if 'provenance' not in entry:
            entry['provenance'] = {'notes': []}
        if 'notes' not in entry['provenance']:
            entry['provenance']['notes'] = []
        entry['provenance']['notes'].append(
            f"Location extracted from {extraction_method} - {source_url}"
        )

        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return True


def main() -> None:
    """CLI entry point: find KIEN entries without locations, scrape their
    KIEN profile (and optionally their own website), and update entry files
    with any location that validates against GeoNames."""
    parser = argparse.ArgumentParser(description='Scrape KIEN profiles for location data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, default=10, help='Limit number of entries to process')
    parser.add_argument('--skip-kien', action='store_true', help='Skip KIEN profile scraping')
    parser.add_argument('--skip-website', action='store_true', help='Skip organization website scraping')
    args = parser.parse_args()

    # Playwright is imported lazily so the script can print a helpful
    # install hint instead of crashing on ImportError.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("Error: Playwright not installed. Run: pip install playwright && playwright install chromium")
        return

    # Get entries without locations
    entries = get_kien_entries_without_locations()
    print(f"Found {len(entries)} KIEN entries without locations")

    # NOTE(review): --limit 0 is falsy and therefore means "no limit" here.
    if args.limit:
        entries = entries[:args.limit]
        print(f"Processing first {args.limit} entries")

    extracted: List[Dict[str, Any]] = []
    failed: List[Dict[str, Any]] = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()

        for i, entry_info in enumerate(entries):
            print(f"\n[{i+1}/{len(entries)}] {entry_info['org_name']}")

            location: Optional[Dict[str, Any]] = None
            source_url: Optional[str] = None
            method: Optional[str] = None

            # Try KIEN profile first
            if not args.skip_kien and entry_info['kien_url']:
                print(f" Scraping KIEN profile: {entry_info['kien_url']}")
                html = scrape_kien_profile(entry_info['kien_url'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['kien_url']
                        method = 'KIEN_PROFILE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")

            # Try organization website
            if not location and not args.skip_website and entry_info['website']:
                print(f" Scraping website: {entry_info['website']}")
                html = scrape_org_website(entry_info['website'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['website']
                        method = 'ORG_WEBSITE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")

            if location and source_url and method:
                extracted.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'location': location,
                    'source': source_url,
                    'method': method,
                })

                # Update the entry file
                update_entry_with_location(
                    entry_info['path'],
                    location,
                    method,
                    source_url,
                    dry_run=args.dry_run
                )
            else:
                failed.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'kien_url': entry_info['kien_url'],
                    'website': entry_info['website'],
                })
                print(" ✗ No location found")

            # Rate limiting
            time.sleep(2)

        browser.close()

    # Summary
    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" - Locations extracted: {len(extracted)}")
    print(f" - No location found: {len(failed)}")

    if extracted:
        print("\nExtracted locations:")
        for e in extracted:
            print(f" {e['org_name']} → {e['location']['city']} ({e['method']})")

    if failed:
        print("\nFailed to find location:")
        for f in failed:
            print(f" {f['org_name']}")


if __name__ == '__main__':
    main()