# glam/scripts/enrich_timespan_linkup.py
# Snapshot metadata: 2025-12-16 09:02:52 +01:00 — 661 lines, 22 KiB, Python
#!/usr/bin/env python3
"""
Enrich Dutch heritage custodian files with TimeSpan data using Linkup web search.
This script:
1. Identifies custodian files without temporal data
2. Searches for founding/establishment dates via Linkup API
3. Archives retrieved webpages in data/custodian/web/{entry_index}/linkup/
4. Extracts dates and adds CIDOC-CRM E52_Time-Span compliant data
5. Adds proper provenance tracking
TimeSpan follows CIDOC-CRM E52_Time-Span pattern:
- begin_of_the_begin: Earliest possible start (P82a)
- end_of_the_begin: Latest possible start (P81a)
- begin_of_the_end: Earliest possible end (P81b)
- end_of_the_end: Latest possible end (P82b)
Usage:
python scripts/enrich_timespan_linkup.py [--dry-run] [--verbose] [--limit N]
python scripts/enrich_timespan_linkup.py --resume # Resume from checkpoint
"""
import argparse
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
import yaml
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
# Load environment variables from .env
try:
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")
except ImportError:
pass # dotenv not required if env vars set externally
# Check for required dependencies
try:
import httpx
except ImportError:
print("ERROR: httpx not installed. Run: pip install httpx")
sys.exit(1)
# ============================================================================
# Configuration
# ============================================================================
# Linkup search endpoint (POST with JSON body, bearer-token auth).
LINKUP_API_URL = "https://api.linkup.so/v1/search"
# Resumable progress state, written by save_checkpoint() / read by load_checkpoint().
CHECKPOINT_FILE = PROJECT_ROOT / "data/custodian/.linkup_timespan_checkpoint.json"
# Directory of custodian YAML records to enrich.
CUSTODIAN_DIR = PROJECT_ROOT / "data/custodian"
# Root for archived raw API responses, one subdirectory per entry_index.
WEB_ARCHIVE_DIR = PROJECT_ROOT / "data/custodian/web"
# Rate limiting: spread API calls evenly across each minute.
REQUESTS_PER_MINUTE = 10
REQUEST_DELAY = 60.0 / REQUESTS_PER_MINUTE  # 6 seconds between requests
# Dutch keywords for founding dates
FOUNDING_KEYWORDS_NL = [
    "opgericht",
    "gesticht",
    "sinds",
    "ontstaan",
    "oprichting",
    "begon",
    "gestart",
    "geopend",
]
# English equivalents of the founding keywords above.
FOUNDING_KEYWORDS_EN = [
    "founded",
    "established",
    "since",
    "opened",
    "created",
    "started",
]
# ============================================================================
# YAML Handling (preserve formatting)
# ============================================================================
class PreserveQuotesLoader(yaml.SafeLoader):
    """SafeLoader subclass so any custom constructors stay local to this script."""
    pass


class PreserveQuotesDumper(yaml.SafeDumper):
    """SafeDumper subclass that carries the multi-line string representer below."""
    pass


def str_representer(dumper, data):
    """Emit multi-line strings as YAML literal blocks (|); single-line strings use the default style."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


# Register the representer only on our dumper, not on yaml.SafeDumper globally.
PreserveQuotesDumper.add_representer(str, str_representer)
# ============================================================================
# Checkpoint Management
# ============================================================================
def load_checkpoint() -> Dict[str, Any]:
    """Read the processing checkpoint from disk, or return a fresh empty one."""
    if not CHECKPOINT_FILE.exists():
        return {"processed": [], "last_timestamp": None, "stats": {}}
    with open(CHECKPOINT_FILE, 'r') as f:
        return json.load(f)
def save_checkpoint(checkpoint: Dict[str, Any]):
    """Stamp the checkpoint with the current UTC time and persist it as JSON."""
    checkpoint["last_timestamp"] = datetime.now(timezone.utc).isoformat()
    CHECKPOINT_FILE.parent.mkdir(parents=True, exist_ok=True)
    CHECKPOINT_FILE.write_text(json.dumps(checkpoint, indent=2))
# ============================================================================
# Date Parsing
# ============================================================================
def parse_year_from_text(text: str) -> Optional[Tuple[int, str]]:
    """
    Find a founding year mentioned in free text.

    Scans for Dutch and English founding phrases (e.g. "opgericht in 1985",
    "founded in 1985") and returns the first plausible ``(year, context)``
    pair, where *context* names the keyword that matched. Years outside
    1500..current-year are ignored. Returns ``None`` when nothing matches.
    """
    keyword_patterns = (
        # Dutch phrasings
        ('opgericht', r'opgericht\s+(?:in\s+)?(\d{4})'),
        ('gesticht', r'gesticht\s+(?:in\s+)?(\d{4})'),
        ('sinds', r'sinds\s+(\d{4})'),
        ('oprichting', r'oprichting\s+(?:in\s+)?(\d{4})'),
        ('geopend', r'geopend\s+(?:in\s+)?(\d{4})'),
        ('begon', r'begon\s+(?:in\s+)?(\d{4})'),
        ('ontstaan', r'ontstaan\s+(?:in\s+)?(\d{4})'),
        # English phrasings
        ('founded', r'founded\s+(?:in\s+)?(\d{4})'),
        ('established', r'established\s+(?:in\s+)?(\d{4})'),
        ('since', r'since\s+(\d{4})'),
        ('opened', r'opened\s+(?:in\s+)?(\d{4})'),
        ('created', r'created\s+(?:in\s+)?(\d{4})'),
        # Year preceding the keyword
        ('year_context', r'in\s+(\d{4})\s+(?:opgericht|gesticht|geopend)'),
    )
    haystack = text.lower()
    current_year = datetime.now().year
    for context, pattern in keyword_patterns:
        hit = re.search(pattern, haystack)
        if hit is None:
            continue
        candidate = int(hit.group(1))
        # Reject implausible years (before 1500 or in the future); an invalid
        # match does not stop the scan — later patterns may still succeed.
        if 1500 <= candidate <= current_year:
            return (candidate, context)
    return None
def parse_full_date_from_text(text: str) -> Optional[Tuple[str, str]]:
    """
    Extract a full founding date (day + month + year) from free text.

    Recognised formats, tried in order:
    - Numeric European: "23-11-2005" or "23/11/2005" (DD-MM-YYYY)
    - Dutch month name: "15 maart 1985"
    - English month name: "March 15, 1985"

    Returns:
        ``(iso_datetime, context)`` where *iso_datetime* is a UTC-suffixed ISO
        string ("YYYY-MM-DDT00:00:00Z") and *context* labels which format
        matched, or ``None`` when no valid date is found.

    Only real calendar dates within 1500..current-year are accepted: the
    previous version validated day (<=31) and month (<=12) independently and
    therefore accepted impossible dates such as "31-02-2005" (31 February).
    """
    dutch_months = {
        'januari': 1, 'februari': 2, 'maart': 3, 'april': 4,
        'mei': 5, 'juni': 6, 'juli': 7, 'augustus': 8,
        'september': 9, 'oktober': 10, 'november': 11, 'december': 12
    }
    english_months = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12
    }

    def _validated(year: int, month: int, day: int, context: str) -> Optional[Tuple[str, str]]:
        """Return (ISO string, context) only for a real calendar date in range."""
        if not (1500 <= year <= datetime.now().year):
            return None
        try:
            # Bug fix: constructing the date rejects impossible combinations
            # (Feb 31, month 13, ...) that the old independent checks allowed.
            datetime(year, month, day)
        except ValueError:
            return None
        return (f"{year}-{month:02d}-{day:02d}T00:00:00Z", context)

    text_lower = text.lower()
    # Numeric format: "23-11-2005" or "23/11/2005" (DD-MM-YYYY, European)
    match = re.search(r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})', text)
    if match:
        day, month, year = int(match.group(1)), int(match.group(2)), int(match.group(3))
        result = _validated(year, month, day, "full_date_numeric")
        if result:
            return result
    # Dutch format: "15 maart 1985"
    for month_name, month_num in dutch_months.items():
        match = re.search(rf'(\d{{1,2}})\s+{month_name}\s+(\d{{4}})', text_lower)
        if match:
            result = _validated(int(match.group(2)), month_num, int(match.group(1)), "full_date_nl")
            if result:
                return result
    # English format: "March 15, 1985"
    for month_name, month_num in english_months.items():
        match = re.search(rf'{month_name}\s+(\d{{1,2}}),?\s+(\d{{4}})', text_lower)
        if match:
            result = _validated(int(match.group(2)), month_num, int(match.group(1)), "full_date_en")
            if result:
                return result
    return None
def extract_dates_from_linkup_results(results: List[Dict]) -> Dict[str, Any]:
    """
    Extract founding dates from Linkup search results.

    Args:
        results: Linkup "sources" entries; each may carry ``content`` or
            ``snippet`` text and a ``url``.

    Returns dict with:
    - founding_date: ISO date string of the best candidate, or None
    - date_precision: 'day' or 'year' for the chosen candidate
    - source_url: URL where the chosen date was found
    - source_urls: URLs of all results that had any text content
    - context: keyword/format label that matched
    - all_dates_found: every candidate, kept for auditing

    Fixes over the previous version:
    - A day-precision date now always wins over a year-only hit, even when
      the year appeared in an earlier result (previously the first hit of
      any precision was kept forever).
    - Results that yielded a full date are now recorded in ``source_urls``
      (previously the full-date branch ``continue``d before the URL
      bookkeeping, dropping the best source's URL).
    """
    extracted = {
        "founding_date": None,
        "date_precision": None,
        "source_url": None,
        "source_urls": [],
        "context": None,
        "all_dates_found": [],
    }
    for result in results:
        content = result.get("content", "") or result.get("snippet", "") or ""
        url = result.get("url", "")
        if not content:
            continue
        # Record every consulted source URL, deduplicated in order.
        if url and url not in extracted["source_urls"]:
            extracted["source_urls"].append(url)
        # Prefer a full day-precision date when the text contains one.
        full_date = parse_full_date_from_text(content)
        if full_date:
            date_str, context = full_date
            extracted["all_dates_found"].append({
                "date": date_str,
                "precision": "day",
                "url": url,
                "context": context,
            })
            # Adopt it unless we already hold another day-precision date.
            if extracted["date_precision"] != "day":
                extracted["founding_date"] = date_str
                extracted["date_precision"] = "day"
                extracted["source_url"] = url
                extracted["context"] = context
            continue
        # Fall back to a bare founding year.
        year_result = parse_year_from_text(content)
        if year_result:
            year, context = year_result
            year_date = f"{year}-01-01T00:00:00Z"
            extracted["all_dates_found"].append({
                "date": year_date,
                "year": year,
                "precision": "year",
                "url": url,
                "context": context,
            })
            # A year-only hit fills the slot only when nothing is known yet.
            if not extracted["founding_date"]:
                extracted["founding_date"] = year_date
                extracted["date_precision"] = "year"
                extracted["source_url"] = url
                extracted["context"] = context
    return extracted
# ============================================================================
# Linkup API Integration
# ============================================================================
def search_linkup(query: str, api_key: str) -> Optional[Dict[str, Any]]:
    """
    Run a single Linkup web search.

    Args:
        query: Search query string.
        api_key: Linkup API bearer token.

    Returns:
        Parsed JSON response dict, or None on any failure (errors are
        printed, never raised).
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    request_body = {
        "q": query,
        "depth": "standard",  # or "deep" for more thorough search
        "outputType": "sourcedAnswer",
    }
    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.post(LINKUP_API_URL, json=request_body, headers=request_headers)
            response.raise_for_status()
            return response.json()
    except httpx.HTTPStatusError as e:
        print(f" Linkup API error: {e.response.status_code} - {e.response.text[:200]}")
        return None
    except Exception as e:
        print(f" Linkup request error: {e}")
        return None
def build_search_query(custodian_data: Dict[str, Any]) -> str:
    """
    Build the Linkup query used to find an organisation's founding date.

    Pulls the organisation name (and optionally a city) from the first of
    several possible locations in the custodian record, then appends Dutch
    founding keywords. Returns "" when no organisation name is available.

    Fix: tolerates explicit ``null`` values for the nested sections (common
    in hand-edited YAML); previously ``.get(key, {})`` returned None for
    those and the chained ``.get`` raised AttributeError.
    """
    def _section(key: str) -> Dict[str, Any]:
        # Treat a missing key and an explicit null the same way.
        return custodian_data.get(key) or {}

    org_name = (
        _section("custodian_name").get("claim_value") or
        _section("original_entry").get("organisatie") or
        _section("google_maps_enrichment").get("name") or
        ""
    )
    city = (
        _section("location").get("city") or
        _section("original_entry").get("plaatsnaam_bezoekadres") or
        ""
    )
    if not org_name:
        return ""
    # Quote the name for exact matching, then add founding keywords.
    query = f'"{org_name}"'
    if city:
        query += f" {city}"
    query += " opgericht OR gesticht OR sinds"
    return query
# ============================================================================
# Web Archive Management
# ============================================================================
def get_archive_path(entry_index: int) -> Path:
    """Return the Linkup archive directory for a custodian entry (index zero-padded to 4 digits)."""
    return WEB_ARCHIVE_DIR / f"{entry_index:04d}" / "linkup"
def archive_linkup_results(entry_index: int, query: str, results: Dict[str, Any]) -> Path:
    """
    Persist a raw Linkup API response under the entry's web archive.

    Writes a timestamped JSON file recording the query, a UTC fetch
    timestamp, and the untouched API payload. Returns the file path.
    """
    target_dir = get_archive_path(entry_index)
    target_dir.mkdir(parents=True, exist_ok=True)
    now = datetime.now(timezone.utc)
    out_path = target_dir / f"linkup_founding_{now.strftime('%Y%m%dT%H%M%SZ')}.json"
    record = {
        "query": query,
        "fetch_timestamp": now.isoformat(),
        "api_response": results,
    }
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(record, f, indent=2, ensure_ascii=False)
    return out_path
# ============================================================================
# TimeSpan Creation
# ============================================================================
def create_timespan_from_extracted(extracted: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Build a CIDOC-CRM E52 Time-Span dict from an extracted founding date.

    Precision handling:
    - 'day'  : begin_of_the_begin == end_of_the_begin (exact start date)
    - 'year' : start bounded to Jan 1 .. Dec 31 of that year
    - other  : both start bounds set to the raw extracted date
    The end bounds are always None (organisation assumed still operating).
    Returns None when no founding_date is present.
    """
    founding_date = extracted.get("founding_date")
    if not founding_date:
        return None
    precision = extracted.get("date_precision", "year")
    span: Dict[str, Any] = {}
    if precision == "day":
        # Exact start date known.
        span["begin_of_the_begin"] = founding_date
        span["end_of_the_begin"] = founding_date
    elif precision == "year":
        # Year known, month/day uncertain: bound the start within the year.
        year_match = re.match(r'(\d{4})', founding_date)
        if year_match:
            yyyy = year_match.group(1)
            span["begin_of_the_begin"] = f"{yyyy}-01-01T00:00:00Z"
            span["end_of_the_begin"] = f"{yyyy}-12-31T23:59:59Z"
    else:
        # Approximate date: use it verbatim for both start bounds.
        span["begin_of_the_begin"] = founding_date
        span["end_of_the_begin"] = founding_date
    span["begin_of_the_end"] = None
    span["end_of_the_end"] = None
    return span
# ============================================================================
# File Processing
# ============================================================================
def needs_timespan_enrichment(data: Dict[str, Any]) -> bool:
    """
    Decide whether a custodian record still needs a Linkup TimeSpan lookup.

    Returns False when the record already carries a timespan with a
    begin_of_the_begin bound, or when Wikidata enrichment supplied an
    inception date; True otherwise.
    """
    current_span = data.get("timespan", {})
    if current_span and current_span.get("begin_of_the_begin"):
        return False
    has_inception = bool(data.get("wikidata_enrichment", {}).get("wikidata_inception"))
    return not has_inception
def process_file(
    filepath: Path,
    api_key: str,
    dry_run: bool = False,
    verbose: bool = False
) -> Tuple[bool, str]:
    """
    Enrich a single custodian YAML file with a Linkup-derived TimeSpan.

    Pipeline: load YAML -> skip if temporal data exists -> build query ->
    search Linkup -> archive the raw response -> extract dates -> write a
    CIDOC-CRM timespan plus provenance back to the file.

    Args:
        filepath: Custodian YAML file to process.
        api_key: Linkup API key (unused when dry_run is True).
        dry_run: When True, stop before any API call or file write.
        verbose: When True, print progress details.

    Returns:
        (success, status_message); success is True only when a TimeSpan was
        added (or would have been, in dry-run mode).

    Fix: removed the unused local ``org_name`` previously assigned in the
    verbose branch.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return (False, f"Error reading file: {e}")
    if not data:
        return (False, "Empty file")
    # Skip records that already have temporal data (timespan or Wikidata).
    if not needs_timespan_enrichment(data):
        return (False, "Already has temporal data")
    # The entry index determines where web results are archived.
    entry_index = data.get("entry_index")
    if entry_index is None:
        return (False, "No entry_index")
    query = build_search_query(data)
    if not query:
        return (False, "Could not build search query")
    if verbose:
        print(f" Searching: {query[:80]}...")
    if dry_run:
        return (True, "Would search Linkup (dry run)")
    results = search_linkup(query, api_key)
    if not results:
        return (False, "Linkup search failed")
    # Archive the raw response FIRST so evidence survives even if the
    # extraction below fails.
    archive_path = archive_linkup_results(entry_index, query, results)
    if verbose:
        print(f" Archived to: {archive_path.relative_to(PROJECT_ROOT)}")
    sources = results.get("sources", [])
    if not sources:
        return (False, "No sources in Linkup results")
    extracted = extract_dates_from_linkup_results(sources)
    if not extracted.get("founding_date"):
        return (False, "No founding date found in results")
    timespan = create_timespan_from_extracted(extracted)
    if not timespan:
        return (False, "Could not create TimeSpan")
    # Attach source attribution directly on the timespan.
    timespan["sources"] = [f"Linkup web search: {extracted.get('source_url', 'multiple sources')}"]
    if extracted.get("context"):
        timespan["notes"] = f"Found via pattern: {extracted['context']}"
    data["timespan"] = timespan
    # Record provenance (TIER_4: inferred from web search, not authoritative).
    if "provenance" not in data:
        data["provenance"] = {"sources": {}}
    if "sources" not in data["provenance"]:
        data["provenance"]["sources"] = {}
    data["provenance"]["sources"]["linkup_timespan"] = [{
        "source_type": "linkup_web_search",
        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
        "search_query": query,
        "source_urls": extracted.get("source_urls", [])[:5],  # Limit to 5 URLs
        "claims_extracted": ["timespan_begin"],
        "data_tier": "TIER_4_INFERRED",
        "archive_path": str(archive_path.relative_to(PROJECT_ROOT)),
    }]
    # Write the enriched record back, preserving unicode and key order.
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=PreserveQuotesDumper,
                      default_flow_style=False, allow_unicode=True, sort_keys=False)
    except Exception as e:
        return (False, f"Error writing file: {e}")
    if verbose:
        print(f" Added TimeSpan: begin={timespan.get('begin_of_the_begin')}")
    return (True, f"Added TimeSpan from {extracted.get('date_precision', 'unknown')} precision date")
# ============================================================================
# Main
# ============================================================================
def main():
    """CLI entry point.

    Parses arguments, selects custodian files matching --pattern, runs
    process_file() on each with rate limiting, maintains a resumable
    checkpoint, and prints a summary. (Fix: removed f-string prefixes on
    literals with no placeholders; output is unchanged.)
    """
    parser = argparse.ArgumentParser(description='Enrich custodian files with TimeSpan via Linkup')
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes or make API calls')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process (0=unlimited)')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    parser.add_argument('--pattern', default='NL-*.yaml', help='File pattern to match (default: NL-*.yaml)')
    args = parser.parse_args()
    # The API key is only required when we will actually call Linkup.
    api_key = os.environ.get("LINKUP_API_KEY", "")
    if not api_key and not args.dry_run:
        print("ERROR: LINKUP_API_KEY environment variable not set")
        print("Set it with: export LINKUP_API_KEY=your_key")
        sys.exit(1)
    # Resume support: skip files already listed in the checkpoint.
    checkpoint = load_checkpoint() if args.resume else {"processed": [], "stats": {}}
    processed_files = set(checkpoint.get("processed", []))
    print("TimeSpan Enrichment via Linkup")
    print("=" * 50)
    print(f"Pattern: {args.pattern}")
    print(f"Dry run: {args.dry_run}")
    print(f"Limit: {args.limit if args.limit > 0 else 'unlimited'}")
    print(f"Resume: {args.resume} ({len(processed_files)} already processed)")
    print()
    # Collect files to process (sorted for deterministic order).
    files = list(CUSTODIAN_DIR.glob(args.pattern))
    files.sort()
    print(f"Found {len(files)} files matching pattern")
    if args.resume:
        files = [f for f in files if f.name not in processed_files]
        print(f"After filtering: {len(files)} files to process")
    if args.limit > 0:
        files = files[:args.limit]
        print(f"Limited to: {len(files)} files")
    print()
    stats = {
        "total": len(files),
        "enriched": 0,
        "skipped": 0,
        "errors": 0,
    }
    try:
        for i, filepath in enumerate(files, 1):
            print(f"[{i}/{len(files)}] {filepath.name}")
            success, message = process_file(filepath, api_key, args.dry_run, args.verbose)
            if success:
                stats["enriched"] += 1
                if args.verbose:
                    print(message)
            else:
                # "Already has ..." / "No entry_index" are expected skips;
                # everything else counts as an error.
                if "Already has" in message or "No entry_index" in message:
                    stats["skipped"] += 1
                else:
                    stats["errors"] += 1
                if args.verbose:
                    print(f" - {message}")
            # Update checkpoint in memory; persist every 10 files so an
            # interruption loses little work.
            checkpoint["processed"].append(filepath.name)
            checkpoint["stats"] = stats
            if i % 10 == 0:
                save_checkpoint(checkpoint)
            # Rate limiting between API calls (skipped after the last file).
            if not args.dry_run and i < len(files):
                time.sleep(REQUEST_DELAY)
    except KeyboardInterrupt:
        print("\n\nInterrupted! Saving checkpoint...")
        save_checkpoint(checkpoint)
    # Final save
    save_checkpoint(checkpoint)
    print()
    print("=" * 50)
    print("Results:")
    print(f" Total processed: {stats['total']}")
    print(f" Enriched with TimeSpan: {stats['enriched']}")
    print(f" Skipped (existing data): {stats['skipped']}")
    print(f" Errors: {stats['errors']}")
    if args.dry_run:
        print("\n(Dry run - no files were modified)")
# Script entry point guard: allows importing this module without side effects.
if __name__ == '__main__':
    main()