#!/usr/bin/env python3
"""
Fetch remaining custodian profile details from KIEN using Playwright.

This script fetches contact information (website, email, phone) for custodians
that haven't been processed yet. Profiles whose previous fetch ended in an
error are treated as not processed and are fetched again.

Author: GLAM Project
Date: 2025-12-04
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only needed for annotations. The runtime import of Playwright happens
    # inside main() so the pure helpers below can be imported without it.
    from playwright.async_api import Page

# Configuration
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
DELAY_MS = 1500  # Delay between requests

# JavaScript evaluated inside the custodian page. It returns a plain object
# with the keys: website, email, phone, description, heritageForms.
_EXTRACT_PROFILE_JS = r"""() => {
    const BASE_URL = 'https://www.immaterieelerfgoed.nl';
    const SOCIAL_DOMAINS = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 'youtube.com'];
    const SKIPPED_PATH_PARTS = ['kennisbank', 'contact', 'partner', 'publicaties'];

    const result = {
        website: null,
        email: null,
        phone: null,
        description: null,
        heritageForms: []
    };

    const text = document.body.innerText;

    // Get description from meta or first paragraph
    const metaDesc = document.querySelector('meta[name="description"]');
    if (metaDesc) {
        result.description = metaDesc.getAttribute('content');
    } else {
        const firstP = document.querySelector('main p, article p, .content p');
        if (firstP) {
            result.description = firstP.textContent.trim().substring(0, 500);
        }
    }

    // Find website link: the first external link (not KIEN itself) that is
    // not a social media profile. All candidates are scanned so that a
    // leading social link does not hide a real website further down.
    const externalLinks = document.querySelectorAll(
        'a[href^="http"]:not([href*="immaterieelerfgoed.nl"])'
    );
    for (const link of externalLinks) {
        const href = link.getAttribute('href');
        if (!SOCIAL_DOMAINS.some(d => href.includes(d))) {
            result.website = href;
            break;
        }
    }

    // Find email - look for mailto: links first
    const emailLink = document.querySelector('a[href^="mailto:"]');
    const mailtoAddress = emailLink
        // Drop the scheme and any "?subject=..." style query.
        ? emailLink.getAttribute('href').replace('mailto:', '').split('?')[0].trim()
        : '';
    if (mailtoAddress) {
        result.email = mailtoAddress;
    } else {
        // Try regex in text
        const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
        if (emailMatch) {
            result.email = emailMatch[0];
        }
    }

    // Find phone - look for tel: links first
    const phoneLink = document.querySelector('a[href^="tel:"]');
    if (phoneLink) {
        result.phone = phoneLink.getAttribute('href').replace('tel:', '');
    } else {
        // Try regex for Dutch phone numbers
        const phoneMatch = text.match(/(\+31|0)[0-9\s\-]{9,}/);
        if (phoneMatch) {
            result.phone = phoneMatch[0].replace(/\s/g, '').replace(/-/g, '');
        }
    }

    // Find linked heritage forms
    const seenForms = new Set();
    document.querySelectorAll('a[href*="/nl/"][href*="/page/"]').forEach(link => {
        const href = link.getAttribute('href');
        const name = link.textContent.trim();

        // Skip navigation-like links and names already collected
        if (name.length <= 5 || seenForms.has(name) ||
                SKIPPED_PATH_PARTS.some(part => href.includes(part))) {
            return;
        }

        // Resolve against the site root: relative hrefs get the KIEN prefix,
        // absolute hrefs are kept as they are instead of being double-prefixed.
        let absoluteUrl;
        try {
            absoluteUrl = new URL(href, BASE_URL).href;
        } catch (err) {
            return;
        }

        seenForms.add(name);
        result.heritageForms.push({ name: name, url: absoluteUrl });
    });

    return result;
}"""


def load_existing_profiles(profiles_file: Path) -> list[dict[str, Any]]:
    """Return the ``custodians`` list stored in *profiles_file*.

    Returns an empty list when the file does not exist or has no
    ``custodians`` key. The file is read as UTF-8, matching how it is written.
    """
    try:
        with open(profiles_file, encoding="utf-8") as f:
            return json.load(f).get("custodians", [])
    except FileNotFoundError:
        return []


def completed_urls(profiles: list[dict[str, Any]]) -> set[str]:
    """Return the KIEN URLs of profiles that were fetched without an error.

    Entries carrying an ``error`` value are left out so that they are picked
    up again on the next run instead of being skipped forever.
    """
    return {
        profile["kien_url"]
        for profile in profiles
        if profile.get("kien_url") and not profile.get("error")
    }


def merge_profiles(
    existing: list[dict[str, Any]],
    fetched: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Merge newly fetched profiles into the existing list.

    An existing entry with the same ``kien_url`` as a fetched one is replaced
    in place (so a retried profile does not appear twice); fetched profiles
    with a new URL are appended in fetch order. Neither input list is mutated.
    """
    replacements = {profile["kien_url"]: profile for profile in fetched}
    merged = [
        replacements.pop(profile.get("kien_url"), profile) for profile in existing
    ]
    merged.extend(replacements.values())
    return merged


async def fetch_custodian_profile(
    page: "Page",
    name: str,
    url: str,
    *,
    timeout_ms: int = 30000,
    settle_seconds: float = 1.0,
) -> dict[str, Any]:
    """Fetch detailed profile for a custodian from their KIEN page.

    Args:
        page: Playwright page used to load *url*.
        name: Custodian name, copied into the result.
        url: KIEN profile URL, stored as ``kien_url``.
        timeout_ms: Navigation timeout passed to ``page.goto``.
        settle_seconds: Pause after navigation before extracting data.

    Returns:
        A dict with ``name``, ``kien_url``, ``website``, ``email``, ``phone``,
        ``address``, ``description``, ``heritage_forms_linked`` and
        ``fetched_at``. ``address`` is never populated by this function. If
        anything raises while loading or extracting, the message is stored
        under ``error`` and the contact fields keep their empty defaults.
    """
    result: dict[str, Any] = {
        'name': name,
        'kien_url': url,
        'website': None,
        'email': None,
        'phone': None,
        'address': None,
        'description': None,
        'heritage_forms_linked': [],
        'fetched_at': datetime.now(timezone.utc).isoformat(),
    }

    try:
        await page.goto(url, wait_until='networkidle', timeout=timeout_ms)
        await asyncio.sleep(settle_seconds)

        # Extract contact information using JavaScript
        data = await page.evaluate(_EXTRACT_PROFILE_JS)

        result['website'] = data.get('website')
        result['email'] = data.get('email')
        result['phone'] = data.get('phone')
        result['description'] = data.get('description')
        result['heritage_forms_linked'] = data.get('heritageForms', [])
    except Exception as e:
        # Best effort: one failing page must not abort the whole run. The
        # error is recorded so the profile can be retried later.
        result['error'] = str(e)

    return result


async def main() -> None:
    """Main function to fetch remaining custodian profiles."""
    # Imported here rather than at module level so that importing this module
    # does not require Playwright to be installed.
    from playwright.async_api import async_playwright

    print("=" * 60)
    print("KIEN Custodian Profile Fetcher")
    print("=" * 60)
    print(f"Started at: {datetime.now().astimezone().isoformat()}")
    print()

    # Load existing profiles
    profiles_file = OUTPUT_DIR / "custodian_profiles.json"
    fetched_urls = completed_urls(load_existing_profiles(profiles_file))
    print(f"Already fetched: {len(fetched_urls)} profiles")

    # Load all custodians list
    cleaned_file = OUTPUT_DIR / "kien_custodians_cleaned.json"
    with open(cleaned_file, encoding="utf-8") as f:
        all_custodians = json.load(f)['custodians']

    # Filter to remaining
    remaining = [c for c in all_custodians if c['kien_url'] not in fetched_urls]
    total = len(remaining)
    print(f"Remaining to fetch: {total} profiles")
    print()

    if not remaining:
        print("No remaining profiles to fetch!")
        return

    # Fetch remaining profiles
    results: list[dict[str, Any]] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
            )
            page = await context.new_page()

            for i, custodian in enumerate(remaining, 1):
                name = custodian['name']
                url = custodian['kien_url']

                print(f"[{i}/{total}] {name}")

                profile = await fetch_custodian_profile(page, name, url)
                results.append(profile)

                # Show what we found
                if profile.get('website'):
                    print(f" → Website: {profile['website']}")
                if profile.get('email'):
                    print(f" → Email: {profile['email']}")
                if profile.get('phone'):
                    print(f" → Phone: {profile['phone']}")
                if profile.get('error'):
                    print(f" ⚠ Error: {profile['error']}")

                # Rate limiting (no need to wait after the final request)
                if i < total:
                    await asyncio.sleep(DELAY_MS / 1000)

                # Progress save every 25 items
                if i % 25 == 0:
                    progress_file = OUTPUT_DIR / f"profiles_progress_{i}.json"
                    with open(progress_file, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    print(f" [PROGRESS] Saved {i} profiles")
        finally:
            await browser.close()

    # Merge with existing profiles; the file is read again so the merge
    # starts from what is on disk now.
    print("\nMerging with existing profiles...")
    all_profiles = merge_profiles(load_existing_profiles(profiles_file), results)

    output = {
        'extracted_at': datetime.now(timezone.utc).isoformat(),
        'source': 'https://www.immaterieelerfgoed.nl',
        'total_custodians': len(all_profiles),
        'custodians': all_profiles,
    }

    with open(profiles_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Summary
    websites = sum(1 for c in all_profiles if c.get('website'))
    emails = sum(1 for c in all_profiles if c.get('email'))
    phones = sum(1 for c in all_profiles if c.get('phone'))

    print("\n" + "=" * 60)
    print("FETCH COMPLETE")
    print("=" * 60)
    print(f"New profiles fetched: {len(results)}")
    print(f"Total profiles: {len(all_profiles)}")
    print(f" With website: {websites}")
    print(f" With email: {emails}")
    print(f" With phone: {phones}")
    print(f"\nSaved to: {profiles_file}")


if __name__ == "__main__":
    asyncio.run(main())