#!/usr/bin/env python3
"""
Scrape KIEN profile pages to extract location/address information.

KIEN (Kenniscentrum Immaterieel Erfgoed Nederland) profile pages often contain
contact information including addresses. This script scrapes those pages to
find location data for KIEN entries that don't have locations yet.

The KIEN profile pages are at:
https://www.immaterieelerfgoed.nl/nl/page/{id}/{slug}

This script:
1. Finds all KIEN entries without locations
2. Scrapes their KIEN profile pages
3. Extracts address/location information
4. Updates the entry files with the discovered locations
"""

import argparse
import re
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

import yaml

# Paths
|
|
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
|
|
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')
|
|
|
|
# Reverse mapping - admin1 code to province code
|
|
ADMIN1_TO_PROVINCE = {
|
|
'01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR',
|
|
'05': 'LI', '06': 'NB', '07': 'NH', '09': 'UT',
|
|
'10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL',
|
|
}
|
|
|
|
|
|
def get_kien_entries_without_locations() -> List[Dict[str, Any]]:
|
|
"""Find all KIEN entries that don't have location data."""
|
|
entries = []
|
|
|
|
# KIEN entries are in the 17xx and 18xx range
|
|
for pattern in ['17*.yaml', '18*.yaml']:
|
|
for entry_path in ENTRIES_DIR.glob(pattern):
|
|
with open(entry_path, 'r', encoding='utf-8') as f:
|
|
try:
|
|
entry = yaml.safe_load(f)
|
|
except yaml.YAMLError:
|
|
continue
|
|
|
|
# Check if it's a KIEN entry
|
|
if not entry:
|
|
continue
|
|
|
|
provenance = entry.get('provenance', {})
|
|
sources = provenance.get('sources', {})
|
|
is_kien = 'kien' in sources or any(
|
|
'kien' in str(s).lower() or 'immaterieelerfgoed' in str(s).lower()
|
|
for s in [entry.get('original_entry', {}).get('systeem', '')]
|
|
)
|
|
|
|
if not is_kien:
|
|
continue
|
|
|
|
# Check if already has locations or GHCID
|
|
has_location = bool(entry.get('locations'))
|
|
has_ghcid = bool(entry.get('ghcid', {}).get('ghcid_current'))
|
|
|
|
if not has_location and not has_ghcid:
|
|
kien_url = entry.get('kien_enrichment', {}).get('kien_url', '')
|
|
org_name = entry.get('original_entry', {}).get('organisatie', '')
|
|
website = entry.get('contact', {}).get('website') or entry.get('original_entry', {}).get('webadres_organisatie')
|
|
|
|
entries.append({
|
|
'path': entry_path,
|
|
'entry_index': entry.get('entry_index'),
|
|
'org_name': org_name,
|
|
'kien_url': kien_url,
|
|
'website': website,
|
|
'entry': entry,
|
|
})
|
|
|
|
return entries
|
|
|
|
|
|
def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
|
|
"""Look up a city in the GeoNames database."""
|
|
if not GEONAMES_DB.exists():
|
|
return None
|
|
|
|
conn = sqlite3.connect(GEONAMES_DB)
|
|
cursor = conn.cursor()
|
|
|
|
# Try exact match first
|
|
cursor.execute("""
|
|
SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
|
|
population, feature_code
|
|
FROM cities
|
|
WHERE country_code = ?
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
""", (country_code, city_name, city_name))
|
|
|
|
row = cursor.fetchone()
|
|
conn.close()
|
|
|
|
if row:
|
|
return {
|
|
'geonames_id': row[0],
|
|
'name': row[1],
|
|
'ascii_name': row[2],
|
|
'admin1_code': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'population': row[6],
|
|
'feature_code': row[7],
|
|
'region_code': ADMIN1_TO_PROVINCE.get(row[3], 'XX'),
|
|
}
|
|
|
|
return None
|
|
|
|
|
|
def extract_address_from_html(html: str) -> Optional[Dict[str, Any]]:
|
|
"""
|
|
Extract address information from KIEN profile page HTML.
|
|
|
|
The KIEN profile pages have a specific structure:
|
|
- Organization contact info is in a "Contact" section near the top (under the avatar)
|
|
- KIEN's own contact info is in the footer (should be ignored)
|
|
|
|
The organization's location often appears as just a city name in all caps
|
|
(e.g., "KAMPEN") in the contact section.
|
|
|
|
Returns dict with extracted address components or None.
|
|
"""
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
|
|
return None
|
|
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
|
|
# Collect address candidates (excluding footer)
|
|
address_candidates: List[str] = []
|
|
|
|
# Strategy 1: Find the organization's contact section (not footer)
|
|
# The page structure has an avatar image, then h1 with contact name, then h2 with org name
|
|
# Below that is the contact info with city in caps
|
|
|
|
# Look for content before the footer
|
|
# The footer contains KIEN's address "Postbus 649, 6800 AP Arnhem"
|
|
|
|
# Find all h3 elements with "Contact" heading - there are two: one for org, one for KIEN
|
|
contact_headings = [h for h in soup.find_all('h3') if h.get_text(strip=True) == 'Contact']
|
|
|
|
# The first "Contact" section is the organization's
|
|
if contact_headings:
|
|
org_contact = contact_headings[0]
|
|
# Get the parent/siblings that contain the actual contact info
|
|
parent = org_contact.parent
|
|
if parent:
|
|
# Get text from this section
|
|
section_text = parent.get_text(separator='\n', strip=True)
|
|
# Stop at the next section or at "Kenniscentrum" (KIEN's address marker)
|
|
lines = section_text.split('\n')
|
|
for line in lines:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
# Skip if it's KIEN's address
|
|
if 'Kenniscentrum' in line or 'Postbus' in line or '6800 AP' in line:
|
|
break
|
|
# City names can be in ALL CAPS or Title Case
|
|
# Accept lines that are 3-50 chars and look like place names
|
|
if len(line) > 2 and len(line) < 50:
|
|
# All caps city (e.g., "KAMPEN")
|
|
if line.isupper():
|
|
address_candidates.append(line)
|
|
# Title case city (e.g., "Hattem", "Den Haag")
|
|
elif re.match(r'^[A-Z][a-z]+(?:\s+[a-z]+)*$|^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line):
|
|
# Single word capitalized or multi-word with each word capitalized
|
|
address_candidates.append(line)
|
|
# Or postal code + city
|
|
if re.match(r'\d{4}\s?[A-Z]{2}', line):
|
|
address_candidates.append(line)
|
|
|
|
# Strategy 2: Look for city names near the top of the page
|
|
# (but not in footer/navigation)
|
|
main_content = soup.get_text()
|
|
|
|
# Split at "Kenniscentrum Immaterieel Erfgoed" which marks the footer
|
|
parts = main_content.split('Kenniscentrum Immaterieel Erfgoed')
|
|
if parts:
|
|
org_section = parts[0]
|
|
|
|
# Look for all-caps words that might be city names (3+ chars)
|
|
caps_words = re.findall(r'\b([A-Z]{3,}(?:\s+[A-Z]+)?)\b', org_section)
|
|
for word in caps_words:
|
|
# Skip common non-city words
|
|
if word in ['CONTACT', 'MEDIA', 'IMMATERIEEL', 'ERFGOED', 'WEBSITE', 'NEDERLAND',
|
|
'HOME', 'OVER', 'NIEUWS', 'AGENDA', 'ENGLISH', 'DUTCH', 'MENU']:
|
|
continue
|
|
address_candidates.append(word)
|
|
|
|
# Also look for title-case words that could be Dutch cities
|
|
# Pattern: Words starting with capital, potentially with lowercase following
|
|
# Look for standalone words that might be city names
|
|
title_words = re.findall(r'\b([A-Z][a-z]{2,}(?:[\s\-][A-Za-z]+)?)\b', org_section)
|
|
for word in title_words:
|
|
# Skip common non-city words
|
|
skip_words = ['Contact', 'Media', 'Website', 'Email', 'Telefoon', 'Facebook',
|
|
'Twitter', 'Instagram', 'YouTube', 'LinkedIn', 'Home', 'Over',
|
|
'Nieuws', 'Agenda', 'English', 'Dutch', 'Menu', 'Zoeken',
|
|
'Stichting', 'Vereniging', 'Nederland', 'Nederlands', 'Meer']
|
|
if word in skip_words:
|
|
continue
|
|
# Add to candidates
|
|
address_candidates.append(word)
|
|
|
|
# Try to validate each candidate as a Dutch city
|
|
for candidate in address_candidates:
|
|
# Handle postal code + city pattern
|
|
match = re.search(r'(\d{4}\s?[A-Z]{2})\s+([A-Za-z][A-Za-z\-\'\s]+)', candidate)
|
|
if match:
|
|
postal_code = match.group(1).replace(' ', '')
|
|
city = match.group(2).strip().title() # Convert to title case
|
|
else:
|
|
# Single city name (possibly in caps)
|
|
postal_code = None
|
|
city = candidate.strip().title() # Convert KAMPEN to Kampen
|
|
|
|
# Clean up city name
|
|
city = re.sub(r'\s+', ' ', city)
|
|
city = city.split(',')[0].strip()
|
|
|
|
# Skip if too short
|
|
if len(city) < 3:
|
|
continue
|
|
|
|
# Validate city in GeoNames
|
|
geonames = lookup_city_in_geonames(city)
|
|
if geonames:
|
|
result: Dict[str, Any] = {
|
|
'city': geonames['name'],
|
|
'country': 'NL',
|
|
'latitude': geonames['latitude'],
|
|
'longitude': geonames['longitude'],
|
|
'geonames_id': geonames['geonames_id'],
|
|
'region_code': geonames['region_code'],
|
|
'extraction_method': 'KIEN_PROFILE_CONTACT_SECTION',
|
|
}
|
|
if postal_code:
|
|
result['postal_code'] = postal_code
|
|
return result
|
|
|
|
return None
|
|
|
|
|
|
def scrape_kien_profile(url: str, page: Any) -> Optional[str]:
|
|
"""
|
|
Scrape a KIEN profile page using Playwright.
|
|
|
|
Returns HTML content or None on failure.
|
|
"""
|
|
try:
|
|
page.goto(url, wait_until='networkidle', timeout=30000)
|
|
time.sleep(1) # Wait for any dynamic content
|
|
html = page.content()
|
|
return html
|
|
except Exception as e:
|
|
if 'Timeout' in str(type(e).__name__):
|
|
print(f" Timeout for {url}")
|
|
else:
|
|
print(f" Error scraping {url}: {e}")
|
|
return None
|
|
|
|
|
|
def scrape_org_website(url: str, page: Any) -> Optional[str]:
|
|
"""
|
|
Scrape an organization's own website for contact/address info.
|
|
|
|
Returns HTML content or None on failure.
|
|
"""
|
|
if not url or url == 'null':
|
|
return None
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
return None
|
|
|
|
# Normalize URL
|
|
if not url.startswith('http'):
|
|
url = 'https://' + url
|
|
|
|
try:
|
|
page.goto(url, wait_until='networkidle', timeout=30000)
|
|
time.sleep(1)
|
|
|
|
# Try to find contact/about page
|
|
html = page.content()
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
|
|
# Look for contact links
|
|
contact_links: List[str] = []
|
|
for link in soup.find_all('a', href=True):
|
|
href_attr = link.get('href')
|
|
if href_attr:
|
|
href = str(href_attr).lower()
|
|
text = link.get_text(strip=True).lower()
|
|
if any(term in href or term in text for term in ['contact', 'over', 'about', 'adres', 'locatie']):
|
|
contact_links.append(str(href_attr))
|
|
|
|
# If we find contact page, navigate there
|
|
if contact_links:
|
|
for contact_href in contact_links[:1]: # Try first contact link
|
|
if not contact_href.startswith('http'):
|
|
# Make relative URL absolute
|
|
parsed = urlparse(url)
|
|
if contact_href.startswith('/'):
|
|
contact_href = f"{parsed.scheme}://{parsed.netloc}{contact_href}"
|
|
else:
|
|
contact_href = f"{url.rstrip('/')}/{contact_href}"
|
|
|
|
try:
|
|
page.goto(contact_href, wait_until='networkidle', timeout=20000)
|
|
time.sleep(1)
|
|
return page.content()
|
|
except Exception:
|
|
pass
|
|
|
|
return html
|
|
|
|
except Exception as e:
|
|
print(f" Error scraping {url}: {e}")
|
|
return None
|
|
|
|
|
|
def update_entry_with_location(entry_path: Path, location: Dict[str, Any],
|
|
extraction_method: str, source_url: str,
|
|
dry_run: bool = True) -> bool:
|
|
"""Update an entry file with discovered location."""
|
|
with open(entry_path, 'r', encoding='utf-8') as f:
|
|
entry = yaml.safe_load(f)
|
|
|
|
# Create location structure
|
|
location_data: Dict[str, Any] = {
|
|
'city': location['city'],
|
|
'country': 'NL',
|
|
'latitude': location.get('latitude'),
|
|
'longitude': location.get('longitude'),
|
|
}
|
|
if 'postal_code' in location:
|
|
location_data['postal_code'] = location['postal_code']
|
|
|
|
# Create resolution metadata
|
|
resolution = {
|
|
'method': extraction_method,
|
|
'source_url': source_url,
|
|
'geonames_id': location.get('geonames_id'),
|
|
'geonames_name': location.get('city'),
|
|
'region_code': location.get('region_code'),
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
}
|
|
|
|
if not dry_run:
|
|
entry['locations'] = [location_data]
|
|
entry['location_resolution'] = resolution
|
|
|
|
# Add provenance note
|
|
if 'provenance' not in entry:
|
|
entry['provenance'] = {'notes': []}
|
|
if 'notes' not in entry['provenance']:
|
|
entry['provenance']['notes'] = []
|
|
entry['provenance']['notes'].append(
|
|
f"Location extracted from {extraction_method} - {source_url}"
|
|
)
|
|
|
|
with open(entry_path, 'w', encoding='utf-8') as f:
|
|
yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
return True
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(description='Scrape KIEN profiles for location data')
|
|
parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
|
|
parser.add_argument('--limit', type=int, default=10, help='Limit number of entries to process')
|
|
parser.add_argument('--skip-kien', action='store_true', help='Skip KIEN profile scraping')
|
|
parser.add_argument('--skip-website', action='store_true', help='Skip organization website scraping')
|
|
args = parser.parse_args()
|
|
|
|
try:
|
|
from playwright.sync_api import sync_playwright
|
|
except ImportError:
|
|
print("Error: Playwright not installed. Run: pip install playwright && playwright install chromium")
|
|
return
|
|
|
|
# Get entries without locations
|
|
entries = get_kien_entries_without_locations()
|
|
print(f"Found {len(entries)} KIEN entries without locations")
|
|
|
|
if args.limit:
|
|
entries = entries[:args.limit]
|
|
print(f"Processing first {args.limit} entries")
|
|
|
|
extracted: List[Dict[str, Any]] = []
|
|
failed: List[Dict[str, Any]] = []
|
|
|
|
with sync_playwright() as p:
|
|
browser = p.chromium.launch(headless=True)
|
|
context = browser.new_context(
|
|
user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
|
|
)
|
|
page = context.new_page()
|
|
|
|
for i, entry_info in enumerate(entries):
|
|
print(f"\n[{i+1}/{len(entries)}] {entry_info['org_name']}")
|
|
|
|
location: Optional[Dict[str, Any]] = None
|
|
source_url: Optional[str] = None
|
|
method: Optional[str] = None
|
|
|
|
# Try KIEN profile first
|
|
if not args.skip_kien and entry_info['kien_url']:
|
|
print(f" Scraping KIEN profile: {entry_info['kien_url']}")
|
|
html = scrape_kien_profile(entry_info['kien_url'], page)
|
|
if html:
|
|
location = extract_address_from_html(html)
|
|
if location:
|
|
source_url = entry_info['kien_url']
|
|
method = 'KIEN_PROFILE_SCRAPE'
|
|
print(f" ✓ Found location: {location['city']}")
|
|
|
|
# Try organization website
|
|
if not location and not args.skip_website and entry_info['website']:
|
|
print(f" Scraping website: {entry_info['website']}")
|
|
html = scrape_org_website(entry_info['website'], page)
|
|
if html:
|
|
location = extract_address_from_html(html)
|
|
if location:
|
|
source_url = entry_info['website']
|
|
method = 'ORG_WEBSITE_SCRAPE'
|
|
print(f" ✓ Found location: {location['city']}")
|
|
|
|
if location and source_url and method:
|
|
extracted.append({
|
|
'entry_index': entry_info['entry_index'],
|
|
'org_name': entry_info['org_name'],
|
|
'location': location,
|
|
'source': source_url,
|
|
'method': method,
|
|
})
|
|
|
|
# Update the entry file
|
|
update_entry_with_location(
|
|
entry_info['path'],
|
|
location,
|
|
method,
|
|
source_url,
|
|
dry_run=args.dry_run
|
|
)
|
|
else:
|
|
failed.append({
|
|
'entry_index': entry_info['entry_index'],
|
|
'org_name': entry_info['org_name'],
|
|
'kien_url': entry_info['kien_url'],
|
|
'website': entry_info['website'],
|
|
})
|
|
print(" ✗ No location found")
|
|
|
|
# Rate limiting
|
|
time.sleep(2)
|
|
|
|
browser.close()
|
|
|
|
# Summary
|
|
print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
|
|
print(f" - Locations extracted: {len(extracted)}")
|
|
print(f" - No location found: {len(failed)}")
|
|
|
|
if extracted:
|
|
print("\nExtracted locations:")
|
|
for e in extracted:
|
|
print(f" {e['org_name']} → {e['location']['city']} ({e['method']})")
|
|
|
|
if failed:
|
|
print("\nFailed to find location:")
|
|
for f in failed:
|
|
print(f" {f['org_name']}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|