246 lines
8.7 KiB
Python
246 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch remaining custodian profile details from KIEN using Playwright.
|
|
|
|
This script fetches contact information (website, email, phone) for
|
|
custodians that haven't been processed yet.
|
|
|
|
Author: GLAM Project
|
|
Date: 2025-12-04
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from playwright.async_api import async_playwright, Page
|
|
|
|
# Configuration
# Directory that holds the KIEN input list and receives the profile output files.
OUTPUT_DIR = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
# Pause between consecutive profile requests, in milliseconds.
DELAY_MS = 1_500
|
|
|
|
# JavaScript run inside the loaded page; it returns a plain object with the
# keys website, email, phone, description and heritageForms.
_PROFILE_EXTRACTION_JS = '''() => {
    const KIEN_ORIGIN = 'https://www.immaterieelerfgoed.nl';
    const result = {
        website: null,
        email: null,
        phone: null,
        description: null,
        heritageForms: []
    };

    const text = document.body.innerText;

    // Get description from meta or first paragraph
    const metaDesc = document.querySelector('meta[name="description"]');
    if (metaDesc) {
        result.description = metaDesc.getAttribute('content');
    } else {
        const firstP = document.querySelector('main p, article p, .content p');
        if (firstP) {
            result.description = firstP.textContent.trim().substring(0, 500);
        }
    }

    // Find website link: the first external link (not KIEN itself) that is
    // not a social-media link. Every candidate is inspected, so a leading
    // social link no longer hides a real website further down the page.
    const socialDomains = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 'youtube.com'];
    const externalLinks = document.querySelectorAll('a[href^="http"]:not([href*="immaterieelerfgoed.nl"])');
    for (const link of externalLinks) {
        const href = link.getAttribute('href');
        if (!socialDomains.some(d => href.includes(d))) {
            result.website = href;
            break;
        }
    }

    // Find email - look for mailto: links first. Anything after "?" (for
    // example "?subject=...") is not part of the address and is dropped.
    const emailLink = document.querySelector('a[href^="mailto:"]');
    if (emailLink) {
        result.email = emailLink.getAttribute('href').replace('mailto:', '').split('?')[0].trim() || null;
    }
    if (!result.email) {
        // Try regex in text
        const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/);
        if (emailMatch) {
            result.email = emailMatch[0];
        }
    }

    // Find phone - look for tel: links first
    const phoneLink = document.querySelector('a[href^="tel:"]');
    if (phoneLink) {
        result.phone = phoneLink.getAttribute('href').replace('tel:', '').trim() || null;
    }
    if (!result.phone) {
        // Try regex for Dutch phone numbers
        const phoneMatch = text.match(/(\\+31|0)[0-9\\s\\-]{9,}/);
        if (phoneMatch) {
            result.phone = phoneMatch[0].replace(/\\s/g, '').replace(/-/g, '');
        }
    }

    // Find linked heritage forms
    const heritageLinks = document.querySelectorAll('a[href*="/nl/"][href*="/page/"]');
    const seenForms = new Set();
    heritageLinks.forEach(link => {
        const href = link.getAttribute('href');
        const name = link.textContent.trim();
        // Skip navigation-like links
        if (name.length > 5 && !seenForms.has(name) &&
            !href.includes('kennisbank') && !href.includes('contact') &&
            !href.includes('partner') && !href.includes('publicaties')) {
            // Resolve against the KIEN origin so both relative and already
            // absolute hrefs produce a valid URL.
            let absoluteUrl;
            try {
                absoluteUrl = new URL(href, KIEN_ORIGIN).href;
            } catch (e) {
                return;
            }
            seenForms.add(name);
            result.heritageForms.push({
                name: name,
                url: absoluteUrl
            });
        }
    });

    return result;
}'''


async def fetch_custodian_profile(
    page: "Page",
    name: str,
    url: str,
    *,
    timeout_ms: int = 30000,
    settle_seconds: float = 1.0,
) -> dict:
    """Fetch detailed profile for a custodian from their KIEN page.

    Navigates ``page`` to ``url`` and runs an in-page script that collects
    the description, the first non-social external website link, an email
    address, a phone number and the linked heritage forms.

    Args:
        page: Playwright page used for navigation and script evaluation.
        name: Custodian name, copied into the result unchanged.
        url: KIEN profile URL to load; stored as ``kien_url``.
        timeout_ms: Navigation timeout passed to ``page.goto``.
        settle_seconds: Extra pause after the page reports network idle and
            before scraping (presumably to let late client-side rendering
            finish). Values <= 0 skip the pause.

    Returns:
        A dict with the keys ``name``, ``kien_url``, ``website``, ``email``,
        ``phone``, ``address``, ``description``, ``heritage_forms_linked``
        and ``fetched_at`` (UTC, ISO 8601). ``address`` is never populated
        by this function and stays ``None``. If navigation or extraction
        fails, the exception is not raised; its message is stored under an
        additional ``error`` key and the remaining fields keep their
        defaults.
    """
    result = {
        'name': name,
        'kien_url': url,
        'website': None,
        'email': None,
        'phone': None,
        'address': None,
        'description': None,
        'heritage_forms_linked': [],
        'fetched_at': datetime.now(timezone.utc).isoformat()
    }

    try:
        await page.goto(url, wait_until='networkidle', timeout=timeout_ms)
        if settle_seconds > 0:
            await asyncio.sleep(settle_seconds)

        # Extract contact information using JavaScript
        data = await page.evaluate(_PROFILE_EXTRACTION_JS)

        result['website'] = data.get('website')
        result['email'] = data.get('email')
        result['phone'] = data.get('phone')
        result['description'] = data.get('description')
        result['heritage_forms_linked'] = data.get('heritageForms', [])

    except Exception as e:
        # Best effort: one failing page must not abort a whole batch, so the
        # failure is recorded on the profile instead of being propagated.
        result['error'] = str(e)

    return result
|
|
|
|
|
|
def _load_json(path):
    """Read and parse a UTF-8 encoded JSON file."""
    # The files are written as UTF-8 with ensure_ascii=False, so they must be
    # read as UTF-8 too instead of relying on the locale's default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _merge_and_save_profiles(profiles_file, new_profiles):
    """Append new_profiles to the stored profiles, rewrite the file and return the full list."""
    if profiles_file.exists():
        existing = _load_json(profiles_file)
    else:
        existing = {'custodians': []}

    all_profiles = existing.get('custodians', []) + new_profiles

    output = {
        'extracted_at': datetime.now(timezone.utc).isoformat(),
        'source': 'https://www.immaterieelerfgoed.nl',
        'total_custodians': len(all_profiles),
        'custodians': all_profiles
    }

    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot leave a truncated profiles file behind.
    tmp_file = profiles_file.with_name(profiles_file.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    tmp_file.replace(profiles_file)

    return all_profiles


async def main():
    """Main function to fetch remaining custodian profiles.

    Reads ``custodian_profiles.json`` (if present) and
    ``kien_custodians_cleaned.json`` from ``OUTPUT_DIR``, fetches a profile
    for every custodian whose ``kien_url`` is not stored yet, writes a
    progress snapshot every 25 profiles, and finally merges the new profiles
    into ``custodian_profiles.json`` and prints a summary.

    Profiles fetched before an error or interruption are still merged into
    the profiles file, so a later run only has to fetch what is left.
    """
    print("=" * 60)
    print("KIEN Custodian Profile Fetcher")
    print("=" * 60)
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Load existing profiles
    profiles_file = OUTPUT_DIR / "custodian_profiles.json"
    if profiles_file.exists():
        existing = _load_json(profiles_file)
        fetched_urls = {c.get('kien_url') for c in existing.get('custodians', [])}
    else:
        fetched_urls = set()

    print(f"Already fetched: {len(fetched_urls)} profiles")

    # Load all custodians list
    cleaned_file = OUTPUT_DIR / "kien_custodians_cleaned.json"
    all_custodians = _load_json(cleaned_file)['custodians']

    # Filter to remaining
    remaining = [c for c in all_custodians if c['kien_url'] not in fetched_urls]
    print(f"Remaining to fetch: {len(remaining)} profiles")
    print()

    if not remaining:
        print("No remaining profiles to fetch!")
        return

    # Fetch remaining profiles
    results = []
    all_profiles = []
    total = len(remaining)
    completed = False

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
                )
                page = await context.new_page()

                for i, custodian in enumerate(remaining, 1):
                    name = custodian['name']
                    url = custodian['kien_url']

                    print(f"[{i}/{total}] {name}")

                    profile = await fetch_custodian_profile(page, name, url)
                    results.append(profile)

                    # Show what we found
                    if profile.get('website'):
                        print(f"  → Website: {profile['website']}")
                    if profile.get('email'):
                        print(f"  → Email: {profile['email']}")
                    if profile.get('phone'):
                        print(f"  → Phone: {profile['phone']}")
                    if profile.get('error'):
                        print(f"  ⚠ Error: {profile['error']}")

                    # Progress save every 25 items
                    if i % 25 == 0:
                        progress_file = OUTPUT_DIR / f"profiles_progress_{i}.json"
                        with open(progress_file, 'w', encoding='utf-8') as f:
                            json.dump(results, f, ensure_ascii=False, indent=2)
                        print(f"  [PROGRESS] Saved {i} profiles")

                    # Rate limiting (no need to wait after the last request)
                    if i < total:
                        await asyncio.sleep(DELAY_MS / 1000)
            finally:
                # Close the browser even when the loop fails or is cancelled.
                await browser.close()
        completed = True
    finally:
        # Merge with existing profiles. This also runs on failure or
        # cancellation so that already fetched profiles are not lost.
        if results:
            print("\nMerging with existing profiles...")
            all_profiles = _merge_and_save_profiles(profiles_file, results)
            if not completed:
                print(f"Run did not finish; saved {len(results)} fetched profiles to: {profiles_file}")

    # Summary
    websites = sum(1 for c in all_profiles if c.get('website'))
    emails = sum(1 for c in all_profiles if c.get('email'))
    phones = sum(1 for c in all_profiles if c.get('phone'))

    print("\n" + "=" * 60)
    print("FETCH COMPLETE")
    print("=" * 60)
    print(f"New profiles fetched: {len(results)}")
    print(f"Total profiles: {len(all_profiles)}")
    print(f"  With website: {websites}")
    print(f"  With email: {emails}")
    print(f"  With phone: {phones}")
    print(f"\nSaved to: {profiles_file}")
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Stopping a long scrape with Ctrl-C is expected; exit with the
        # conventional status 130 instead of dumping a traceback.
        print("\nInterrupted by user.")
        raise SystemExit(130) from None
|