glam/scripts/fetch_remaining_custodians.py
2025-12-05 15:30:23 +01:00

246 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Fetch remaining custodian profile details from KIEN using Playwright.
This script fetches contact information (website, email, phone) for
custodians that haven't been processed yet.
Author: GLAM Project
Date: 2025-12-04
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright, Page
# Configuration
# Directory that holds the input custodian list and receives the fetched
# profile JSON (both are opened relative to this path in main()).
OUTPUT_DIR: Path = Path("/Users/kempersc/apps/glam/data/intangible_heritage")
# Pause after each profile fetch, in milliseconds (converted to seconds
# where it is passed to asyncio.sleep).
DELAY_MS: int = 1_500
# JavaScript evaluated inside the loaded profile page by
# fetch_custodian_profile(). It is a raw string so the regular-expression
# backslashes reach the browser exactly as written.
_EXTRACT_PROFILE_JS = r'''() => {
    const KIEN_ORIGIN = 'https://www.immaterieelerfgoed.nl';
    const result = {
        website: null,
        email: null,
        phone: null,
        description: null,
        heritageForms: []
    };
    const text = document.body.innerText;

    // Get description from meta or first paragraph
    const metaDesc = document.querySelector('meta[name="description"]');
    if (metaDesc) {
        result.description = metaDesc.getAttribute('content');
    } else {
        const firstP = document.querySelector('main p, article p, .content p');
        if (firstP) {
            result.description = firstP.textContent.trim().substring(0, 500);
        }
    }

    // Find website link (external, not KIEN itself). Every candidate is
    // checked so that a leading social-media link does not hide a real
    // website that appears later on the page.
    const socialDomains = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 'youtube.com'];
    const externalLinks = document.querySelectorAll('a[href^="http"]:not([href*="immaterieelerfgoed.nl"])');
    for (const link of externalLinks) {
        const href = link.getAttribute('href');
        if (!socialDomains.some(d => href.includes(d))) {
            result.website = href;
            break;
        }
    }

    // Find email - look for mailto: links first
    const emailLink = document.querySelector('a[href^="mailto:"]');
    if (emailLink) {
        // Keep only the address: drop the scheme and any "?subject=..." part.
        const address = emailLink.getAttribute('href').replace('mailto:', '').split('?')[0].trim();
        result.email = address || null;
    }
    if (!result.email) {
        // Try regex in text
        const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
        if (emailMatch) {
            result.email = emailMatch[0];
        }
    }

    // Find phone - look for tel: links first
    const phoneLink = document.querySelector('a[href^="tel:"]');
    if (phoneLink) {
        result.phone = phoneLink.getAttribute('href').replace('tel:', '');
    } else {
        // Try regex for Dutch phone numbers
        const phoneMatch = text.match(/(\+31|0)[0-9\s\-]{9,}/);
        if (phoneMatch) {
            result.phone = phoneMatch[0].replace(/\s/g, '').replace(/-/g, '');
        }
    }

    // Find linked heritage forms
    const heritageLinks = document.querySelectorAll('a[href*="/nl/"][href*="/page/"]');
    const seenForms = new Set();
    heritageLinks.forEach(link => {
        const href = link.getAttribute('href');
        const name = link.textContent.trim();
        // Skip navigation-like links
        if (name.length > 5 && !seenForms.has(name) &&
            !href.includes('kennisbank') && !href.includes('contact') &&
            !href.includes('partner') && !href.includes('publicaties')) {
            seenForms.add(name);
            result.heritageForms.push({
                name: name,
                // Only prefix the site origin when the href is not already absolute.
                url: /^https?:\/\//i.test(href) ? href : KIEN_ORIGIN + href
            });
        }
    });

    return result;
}'''


async def fetch_custodian_profile(page: Page, name: str, url: str) -> dict:
    """Fetch detailed profile for a custodian from their KIEN page.

    Navigates ``page`` to ``url``, waits for the network to go idle plus one
    extra second, then runs an in-page script that collects the description,
    the first non-social external website link, an email address, a phone
    number and the linked heritage-form pages.

    Args:
        page: Playwright page used for navigation and script evaluation.
        name: Custodian name, copied into the result unchanged.
        url: KIEN profile URL to load; stored as ``kien_url``.

    Returns:
        A dict with the keys ``name``, ``kien_url``, ``website``, ``email``,
        ``phone``, ``address``, ``description``, ``heritage_forms_linked`` and
        ``fetched_at`` (UTC, ISO 8601). ``address`` is never populated here
        and stays ``None``. If anything fails, the fields collected so far
        keep their defaults and an ``error`` key holds the exception text.
    """
    result = {
        'name': name,
        'kien_url': url,
        'website': None,
        'email': None,
        'phone': None,
        'address': None,
        'description': None,
        'heritage_forms_linked': [],
        'fetched_at': datetime.now(timezone.utc).isoformat()
    }
    try:
        await page.goto(url, wait_until='networkidle', timeout=30000)
        # Short grace period after network idle before reading the DOM.
        await asyncio.sleep(1)
        # Extract contact information using JavaScript
        data = await page.evaluate(_EXTRACT_PROFILE_JS)
        result['website'] = data.get('website')
        result['email'] = data.get('email')
        result['phone'] = data.get('phone')
        result['description'] = data.get('description')
        result['heritage_forms_linked'] = data.get('heritageForms', [])
    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the whole
        # run, so the failure is recorded on the result instead of raised.
        result['error'] = str(e)
    return result
def _load_existing_profiles(profiles_file: Path) -> dict:
    """Return the saved profiles document, or an empty one if the file is absent."""
    if not profiles_file.exists():
        return {'custodians': []}
    # The file is written as UTF-8 with ensure_ascii=False, so it must be
    # read back as UTF-8 rather than with the platform default encoding.
    with open(profiles_file, encoding='utf-8') as f:
        return json.load(f)


def _write_json_atomically(target: Path, payload) -> None:
    """Write ``payload`` as JSON next to ``target`` and then swap it into place."""
    # Writing to a sibling temp file first means an interrupted write cannot
    # leave a truncated ``target`` behind.
    tmp_file = target.with_name(target.name + '.tmp')
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    tmp_file.replace(target)


async def main() -> None:
    """Main function to fetch remaining custodian profiles.

    Reads the already-fetched profiles and the full custodian list from
    ``OUTPUT_DIR``, fetches every custodian whose ``kien_url`` is not yet
    present, saves a progress snapshot every 25 items, and finally merges the
    new results into ``custodian_profiles.json`` and prints a summary.
    """
    print("=" * 60)
    print("KIEN Custodian Profile Fetcher")
    print("=" * 60)
    print(f"Started at: {datetime.now().isoformat()}")
    print()

    # Load existing profiles
    profiles_file = OUTPUT_DIR / "custodian_profiles.json"
    existing = _load_existing_profiles(profiles_file)
    fetched_urls = {c['kien_url'] for c in existing.get('custodians', [])}
    print(f"Already fetched: {len(fetched_urls)} profiles")

    # Load all custodians list
    cleaned_file = OUTPUT_DIR / "kien_custodians_cleaned.json"
    with open(cleaned_file, encoding='utf-8') as f:
        all_custodians = json.load(f)['custodians']

    # Filter to remaining
    remaining = [c for c in all_custodians if c['kien_url'] not in fetched_urls]
    print(f"Remaining to fetch: {len(remaining)} profiles")
    print()
    if not remaining:
        print("No remaining profiles to fetch!")
        return

    # Fetch remaining profiles
    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="GLAM-Heritage-Research/1.0 (Academic Research)"
            )
            page = await context.new_page()
            for i, custodian in enumerate(remaining, 1):
                name = custodian['name']
                url = custodian['kien_url']
                print(f"[{i}/{len(remaining)}] {name}")
                profile = await fetch_custodian_profile(page, name, url)
                results.append(profile)

                # Show what we found
                if profile.get('website'):
                    print(f" → Website: {profile['website']}")
                if profile.get('email'):
                    print(f" → Email: {profile['email']}")
                if profile.get('phone'):
                    print(f" → Phone: {profile['phone']}")
                if profile.get('error'):
                    print(f" ⚠ Error: {profile['error']}")

                # Rate limiting
                await asyncio.sleep(DELAY_MS / 1000)

                # Progress save every 25 items
                if i % 25 == 0:
                    progress_file = OUTPUT_DIR / f"profiles_progress_{i}.json"
                    with open(progress_file, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    print(f" [PROGRESS] Saved {i} profiles")
        finally:
            # Close the browser even when the loop is interrupted by an error.
            await browser.close()

    # Merge with existing profiles (re-read from disk, as the file is the
    # source of truth for what has already been stored).
    print("\nMerging with existing profiles...")
    existing = _load_existing_profiles(profiles_file)
    all_profiles = existing.get('custodians', []) + results
    output = {
        'extracted_at': datetime.now(timezone.utc).isoformat(),
        'source': 'https://www.immaterieelerfgoed.nl',
        'total_custodians': len(all_profiles),
        'custodians': all_profiles
    }
    _write_json_atomically(profiles_file, output)

    # Summary
    websites = sum(1 for c in all_profiles if c.get('website'))
    emails = sum(1 for c in all_profiles if c.get('email'))
    phones = sum(1 for c in all_profiles if c.get('phone'))
    print("\n" + "=" * 60)
    print("FETCH COMPLETE")
    print("=" * 60)
    print(f"New profiles fetched: {len(results)}")
    print(f"Total profiles: {len(all_profiles)}")
    print(f" With website: {websites}")
    print(f" With email: {emails}")
    print(f" With phone: {phones}")
    print(f"\nSaved to: {profiles_file}")
# Script entry point: drive the async pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())