# glam/scripts/scrape_kien_profiles_for_locations.py
# 2025-12-05 15:30:23 +01:00
#
# 500 lines
# 19 KiB
# Python
#!/usr/bin/env python3
"""
Scrape KIEN profile pages to extract location/address information.
KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) profile pages often contain
contact information including addresses. This script scrapes those pages to
find location data for KIEN entries that don't have locations yet.
The KIEN profile pages are at:
https://www.immaterieelerfgoed.nl/nl/page/{id}/{slug}
This script:
1. Finds all KIEN entries without locations
2. Scrapes their KIEN profile pages
3. Extracts address/location information
4. Updates the entry files with the discovered locations
"""
import argparse
import re
import sqlite3
import time
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from urllib.parse import urlparse
# Paths
# NOTE(review): absolute, user-specific paths — consider making these configurable.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')
# Reverse mapping - admin1 code to province code
# Maps GeoNames NL admin1 codes to two-letter province abbreviations
# (presumably e.g. '07' -> 'NH' Noord-Holland — verify against GeoNames admin1 table).
# Codes not listed here fall back to 'XX' in lookup_city_in_geonames.
ADMIN1_TO_PROVINCE = {
    '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR',
    '05': 'LI', '06': 'NB', '07': 'NH', '09': 'UT',
    '10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL',
}
def get_kien_entries_without_locations() -> List[Dict[str, Any]]:
    """Collect KIEN entries lacking both location data and a current GHCID.

    Scans the enriched entry files, keeps only entries that originate from
    KIEN (by provenance source key or by 'kien'/'immaterieelerfgoed' in the
    original 'systeem' field), and returns a summary dict per matching entry.
    """
    results: List[Dict[str, Any]] = []
    # KIEN entries live in the 17xx and 18xx index ranges.
    for glob_pattern in ('17*.yaml', '18*.yaml'):
        for path in ENTRIES_DIR.glob(glob_pattern):
            with open(path, 'r', encoding='utf-8') as handle:
                try:
                    data = yaml.safe_load(handle)
                except yaml.YAMLError:
                    # Unparseable entry file: skip silently.
                    continue
            if not data:
                continue
            # A KIEN entry either lists 'kien' among its provenance sources
            # or mentions kien/immaterieelerfgoed in its source system field.
            sources = data.get('provenance', {}).get('sources', {})
            systeem = str(data.get('original_entry', {}).get('systeem', '')).lower()
            is_kien = ('kien' in sources
                       or 'kien' in systeem
                       or 'immaterieelerfgoed' in systeem)
            if not is_kien:
                continue
            # Skip entries that already have a location or a current GHCID.
            if data.get('locations'):
                continue
            if data.get('ghcid', {}).get('ghcid_current'):
                continue
            site = (data.get('contact', {}).get('website')
                    or data.get('original_entry', {}).get('webadres_organisatie'))
            results.append({
                'path': path,
                'entry_index': data.get('entry_index'),
                'org_name': data.get('original_entry', {}).get('organisatie', ''),
                'kien_url': data.get('kien_enrichment', {}).get('kien_url', ''),
                'website': site,
                'entry': data,
            })
    return results
def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a populated place in the local GeoNames SQLite database.

    Args:
        city_name: City name, matched case-insensitively against both the
            native and the ASCII spelling.
        country_code: ISO country filter (default 'NL').

    Returns:
        Dict with the GeoNames columns plus a derived ``region_code``
        (province abbreviation, 'XX' for unmapped admin1 codes), or None
        when the database file is missing or no populated place matches.
    """
    if not GEONAMES_DB.exists():
        return None
    conn = sqlite3.connect(GEONAMES_DB)
    try:
        cursor = conn.cursor()
        # Most populous match wins — disambiguates duplicate place names.
        cursor.execute("""
        SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
        population, feature_code
        FROM cities
        WHERE country_code = ?
        AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
        """, (country_code, city_name, city_name))
        row = cursor.fetchone()
    finally:
        # Fix: always release the connection, even if the query raises
        # (the original leaked the connection on error).
        conn.close()
    if row is None:
        return None
    keys = ('geonames_id', 'name', 'ascii_name', 'admin1_code', 'latitude',
            'longitude', 'population', 'feature_code')
    result: Dict[str, Any] = dict(zip(keys, row))
    result['region_code'] = ADMIN1_TO_PROVINCE.get(row[3], 'XX')
    return result
# Navigation/UI words that look like all-caps city candidates but never are.
_CAPS_STOPWORDS = frozenset([
    'CONTACT', 'MEDIA', 'IMMATERIEEL', 'ERFGOED', 'WEBSITE', 'NEDERLAND',
    'HOME', 'OVER', 'NIEUWS', 'AGENDA', 'ENGLISH', 'DUTCH', 'MENU',
])
# Title-case words that look like city candidates but never are.
# (Fix: the original rebuilt this list on every loop iteration.)
_TITLE_STOPWORDS = frozenset([
    'Contact', 'Media', 'Website', 'Email', 'Telefoon', 'Facebook',
    'Twitter', 'Instagram', 'YouTube', 'LinkedIn', 'Home', 'Over',
    'Nieuws', 'Agenda', 'English', 'Dutch', 'Menu', 'Zoeken',
    'Stichting', 'Vereniging', 'Nederland', 'Nederlands', 'Meer',
])


def _candidates_from_contact_section(soup: Any) -> List[str]:
    """Candidate city lines from the first 'Contact' <h3> section.

    The first 'Contact' heading belongs to the organization; a second one
    (KIEN's own, in the footer) is ignored. Scanning stops at KIEN's own
    address markers so the footer address is never picked up.
    """
    candidates: List[str] = []
    headings = [h for h in soup.find_all('h3') if h.get_text(strip=True) == 'Contact']
    if not headings:
        return candidates
    parent = headings[0].parent
    if not parent:
        return candidates
    for line in parent.get_text(separator='\n', strip=True).split('\n'):
        line = line.strip()
        if not line:
            continue
        # KIEN's footer address ("Postbus 649, 6800 AP Arnhem") starts here.
        if 'Kenniscentrum' in line or 'Postbus' in line or '6800 AP' in line:
            break
        # Plausible place-name length only.
        if 2 < len(line) < 50:
            if line.isupper():
                # All-caps city, e.g. "KAMPEN"
                candidates.append(line)
            elif re.match(r'^[A-Z][a-z]+(?:\s+[a-z]+)*$|^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line):
                # Title-case city, e.g. "Hattem", "Den Haag"
                candidates.append(line)
            # Postal code + city, e.g. "8261 AB Kampen"
            if re.match(r'\d{4}\s?[A-Z]{2}', line):
                candidates.append(line)
    return candidates


def _candidates_from_page_text(soup: Any) -> List[str]:
    """Candidate city words from all page text above KIEN's footer."""
    # The footer always begins with KIEN's full name; text before it is the org's.
    org_section = soup.get_text().split('Kenniscentrum Immaterieel Erfgoed')[0]
    candidates: List[str] = []
    # All-caps words (3+ chars) that might be city names.
    for word in re.findall(r'\b([A-Z]{3,}(?:\s+[A-Z]+)?)\b', org_section):
        if word not in _CAPS_STOPWORDS:
            candidates.append(word)
    # Title-case words that could be Dutch city names.
    for word in re.findall(r'\b([A-Z][a-z]{2,}(?:[\s\-][A-Za-z]+)?)\b', org_section):
        if word not in _TITLE_STOPWORDS:
            candidates.append(word)
    return candidates


def _resolve_candidate(candidate: str) -> Optional[Dict[str, Any]]:
    """Validate one candidate against GeoNames; return a location dict or None."""
    # Candidate may be "1234 AB City" or just a (possibly all-caps) city name.
    match = re.search(r'(\d{4}\s?[A-Z]{2})\s+([A-Za-z][A-Za-z\-\'\s]+)', candidate)
    if match:
        postal_code: Optional[str] = match.group(1).replace(' ', '')
        city = match.group(2).strip().title()
    else:
        postal_code = None
        city = candidate.strip().title()  # "KAMPEN" -> "Kampen"
    # Normalize whitespace and drop anything after a comma.
    city = re.sub(r'\s+', ' ', city)
    city = city.split(',')[0].strip()
    if len(city) < 3:
        return None
    geonames = lookup_city_in_geonames(city)
    if not geonames:
        return None
    result: Dict[str, Any] = {
        'city': geonames['name'],
        'country': 'NL',
        'latitude': geonames['latitude'],
        'longitude': geonames['longitude'],
        'geonames_id': geonames['geonames_id'],
        'region_code': geonames['region_code'],
        'extraction_method': 'KIEN_PROFILE_CONTACT_SECTION',
    }
    if postal_code:
        result['postal_code'] = postal_code
    return result


def extract_address_from_html(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract address information from KIEN profile page HTML.

    The KIEN profile pages have a specific structure:
    - Organization contact info is in a "Contact" section near the top
    - KIEN's own contact info is in the footer (ignored)
    The organization's location often appears as just a city name in all caps
    (e.g., "KAMPEN") in the contact section.

    Candidates are gathered from the contact section first, then from the
    rest of the page text; the first candidate that validates against
    GeoNames wins. Returns a dict with extracted address components or None.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # Strategy 1 candidates take precedence over strategy 2 candidates.
    candidates = _candidates_from_contact_section(soup)
    candidates += _candidates_from_page_text(soup)
    for candidate in candidates:
        location = _resolve_candidate(candidate)
        if location:
            return location
    return None
def scrape_kien_profile(url: str, page: Any) -> Optional[str]:
    """
    Scrape a KIEN profile page using Playwright.

    Args:
        url: Profile page URL.
        page: Playwright ``Page`` (or compatible) object used for navigation.

    Returns:
        The rendered HTML content, or None on navigation failure.
    """
    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)  # Wait for any dynamic content
        return page.content()
    except Exception as e:
        # Fix: type(e).__name__ is already a str — the original wrapped it in
        # a redundant str() call. Match on the name so Playwright's
        # TimeoutError (any variant) gets the short message.
        if 'Timeout' in type(e).__name__:
            print(f" Timeout for {url}")
        else:
            print(f" Error scraping {url}: {e}")
        return None
def scrape_org_website(url: str, page: Any) -> Optional[str]:
    """
    Scrape an organization's own website for contact/address info.
    Returns HTML content or None on failure.

    Loads the homepage first, then follows the first link whose href or
    text suggests a contact/about page; if that navigation fails, the
    homepage HTML is returned as a fallback.
    """
    # 'null' appears when a missing YAML field was serialized as a literal string.
    if not url or url == 'null':
        return None
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None
    # Normalize URL: stored website addresses often lack a scheme.
    if not url.startswith('http'):
        url = 'https://' + url
    try:
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(1)
        # Try to find contact/about page
        html = page.content()
        soup = BeautifulSoup(html, 'html.parser')
        # Look for contact links (terms matched against both href and link text)
        contact_links: List[str] = []
        for link in soup.find_all('a', href=True):
            href_attr = link.get('href')
            if href_attr:
                href = str(href_attr).lower()
                text = link.get_text(strip=True).lower()
                if any(term in href or term in text for term in ['contact', 'over', 'about', 'adres', 'locatie']):
                    contact_links.append(str(href_attr))
        # If we find contact page, navigate there
        if contact_links:
            for contact_href in contact_links[:1]:  # Try first contact link
                if not contact_href.startswith('http'):
                    # Make relative URL absolute
                    parsed = urlparse(url)
                    if contact_href.startswith('/'):
                        contact_href = f"{parsed.scheme}://{parsed.netloc}{contact_href}"
                    else:
                        contact_href = f"{url.rstrip('/')}/{contact_href}"
                try:
                    page.goto(contact_href, wait_until='networkidle', timeout=20000)
                    time.sleep(1)
                    return page.content()
                except Exception:
                    # Contact page failed to load; fall through to homepage HTML.
                    pass
        return html
    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None
def update_entry_with_location(entry_path: Path, location: Dict[str, Any],
                               extraction_method: str, source_url: str,
                               dry_run: bool = True) -> bool:
    """Write a discovered location plus resolution metadata into an entry file.

    In dry-run mode the entry is loaded but nothing is written back.
    Returns True unconditionally (kept for caller symmetry).
    """
    with open(entry_path, 'r', encoding='utf-8') as handle:
        entry = yaml.safe_load(handle)
    # Minimal location record; postal code only when one was actually found.
    loc_record: Dict[str, Any] = {
        'city': location['city'],
        'country': 'NL',
        'latitude': location.get('latitude'),
        'longitude': location.get('longitude'),
    }
    if 'postal_code' in location:
        loc_record['postal_code'] = location['postal_code']
    # How / where / when the location was resolved — kept for auditing.
    resolution = {
        'method': extraction_method,
        'source_url': source_url,
        'geonames_id': location.get('geonames_id'),
        'geonames_name': location.get('city'),
        'region_code': location.get('region_code'),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
    }
    if dry_run:
        return True
    entry['locations'] = [loc_record]
    entry['location_resolution'] = resolution
    # Ensure provenance.notes exists, then record where the location came from.
    notes = entry.setdefault('provenance', {}).setdefault('notes', [])
    notes.append(f"Location extracted from {extraction_method} - {source_url}")
    with open(entry_path, 'w', encoding='utf-8') as handle:
        yaml.dump(entry, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return True
def main() -> None:
    """CLI entry point: scrape KIEN profiles (and org websites) for locations.

    For each KIEN entry without a location, tries the KIEN profile page
    first, then the organization's own website, and writes any discovered
    location back to the entry file (unless --dry-run). Prints a summary.
    """
    parser = argparse.ArgumentParser(description='Scrape KIEN profiles for location data')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, default=10, help='Limit number of entries to process')
    parser.add_argument('--skip-kien', action='store_true', help='Skip KIEN profile scraping')
    parser.add_argument('--skip-website', action='store_true', help='Skip organization website scraping')
    args = parser.parse_args()
    # Playwright is imported lazily so the script can print a helpful hint.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("Error: Playwright not installed. Run: pip install playwright && playwright install chromium")
        return
    # Get entries without locations
    entries = get_kien_entries_without_locations()
    print(f"Found {len(entries)} KIEN entries without locations")
    if args.limit:
        entries = entries[:args.limit]
        print(f"Processing first {args.limit} entries")
    extracted: List[Dict[str, Any]] = []  # entries where a location was found
    failed: List[Dict[str, Any]] = []     # entries where both strategies failed
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # Desktop user agent: some sites serve different markup to headless UAs.
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        )
        page = context.new_page()
        for i, entry_info in enumerate(entries):
            print(f"\n[{i+1}/{len(entries)}] {entry_info['org_name']}")
            location: Optional[Dict[str, Any]] = None
            source_url: Optional[str] = None
            method: Optional[str] = None
            # Try KIEN profile first
            if not args.skip_kien and entry_info['kien_url']:
                print(f" Scraping KIEN profile: {entry_info['kien_url']}")
                html = scrape_kien_profile(entry_info['kien_url'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['kien_url']
                        method = 'KIEN_PROFILE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")
            # Try organization website (only if KIEN profile yielded nothing)
            if not location and not args.skip_website and entry_info['website']:
                print(f" Scraping website: {entry_info['website']}")
                html = scrape_org_website(entry_info['website'], page)
                if html:
                    location = extract_address_from_html(html)
                    if location:
                        source_url = entry_info['website']
                        method = 'ORG_WEBSITE_SCRAPE'
                        print(f" ✓ Found location: {location['city']}")
            if location and source_url and method:
                extracted.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'location': location,
                    'source': source_url,
                    'method': method,
                })
                # Update the entry file (no-op write when --dry-run)
                update_entry_with_location(
                    entry_info['path'],
                    location,
                    method,
                    source_url,
                    dry_run=args.dry_run
                )
            else:
                failed.append({
                    'entry_index': entry_info['entry_index'],
                    'org_name': entry_info['org_name'],
                    'kien_url': entry_info['kien_url'],
                    'website': entry_info['website'],
                })
                print(" ✗ No location found")
            # Rate limiting: be polite to the scraped sites.
            time.sleep(2)
        browser.close()
    # Summary
    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" - Locations extracted: {len(extracted)}")
    print(f" - No location found: {len(failed)}")
    if extracted:
        print("\nExtracted locations:")
        for e in extracted:
            print(f" {e['org_name']}{e['location']['city']} ({e['method']})")
    if failed:
        print("\nFailed to find location:")
        for f in failed:
            print(f" {f['org_name']}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()