836 lines
29 KiB
Python
836 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract timeline events from timeline event sources data and update custodian YAML files.
|
|
|
|
This script extracts historical events with FULL PROVENANCE from source archives files
|
|
and stores them in custodian YAML files following the CustodianTimelineEvent schema.
|
|
|
|
Schema: schemas/20251121/linkml/modules/classes/CustodianTimelineEvent.yaml
|
|
|
|
Event types extracted (OrganizationalChangeEventTypeEnum):
|
|
- FOUNDING: opgericht, gesticht, ontstaan
|
|
- MERGER: fusie, fuseerde, samengevoegd, ging op in
|
|
- DISSOLUTION: opgeheven, gesloten
|
|
- RENAMING: hernoemd, nieuwe naam
|
|
- TRANSFER: verhuisd, verplaatst, gevestigd (physical move)
|
|
- EXPANSION: uitgebreid, verbouwd, nieuwbouw, gemoderniseerd
|
|
- SPLIT: opgesplitst
|
|
- SPIN_OFF: afgesplitst
|
|
- REDUCTION: ingekrompen
|
|
- REORGANIZATION: herstructurering
|
|
|
|
EXCLUDED (not in enum):
|
|
- reopening: not a recognized event type
|
|
- predecessor: relationship, not event
|
|
- friends_org: separate organization
|
|
|
|
Output structure in custodian YAML:
|
|
timeline_enrichment:
|
|
timeline_events:
|
|
- event_type: FOUNDING
|
|
event_date: "2005-04-30"
|
|
date_precision: day
|
|
approximate: false
|
|
description: "..."
|
|
source_urls: [...]
|
|
linkup_query: "..."
|
|
linkup_answer: "..."
|
|
fetch_timestamp: "2025-12-15T16:04:38Z"
|
|
archive_path: web/0002/linkup/linkup_founding_20251215T160438Z.json
|
|
extraction_method: linkup_answer_regex
|
|
extraction_timestamp: "2025-12-16T10:00:00Z"
|
|
data_tier: TIER_4_INFERRED
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# =============================================================================
# CONFIGURATION
# =============================================================================

# Minimum year for institutional events (filters out historical references)
# Dates whose leading 4-digit year is earlier than this are rejected by
# is_valid_year() during extraction.
MIN_YEAR = 1800

# Dutch month names to numbers (zero-padded strings, ready for ISO-8601 dates)
DUTCH_MONTHS: dict[str, str] = {
    'januari': '01', 'februari': '02', 'maart': '03', 'april': '04',
    'mei': '05', 'juni': '06', 'juli': '07', 'augustus': '08',
    'september': '09', 'oktober': '10', 'november': '11', 'december': '12'
}

# =============================================================================
# EVENT TYPE MAPPING (script patterns → OrganizationalChangeEventTypeEnum)
# =============================================================================

# Maps internal event type names to enum values
# (internal names are the keys of EVENT_KEYWORDS below)
EVENT_TYPE_MAP: dict[str, str] = {
    'founding': 'FOUNDING',
    'merger': 'MERGER',
    'dissolution': 'DISSOLUTION',
    'name_change': 'RENAMING',
    'relocation': 'TRANSFER',  # Physical move maps to TRANSFER
    'expansion': 'EXPANSION',
    'split': 'SPLIT',
    'spin_off': 'SPIN_OFF',
    'reduction': 'REDUCTION',
    'reorganization': 'REORGANIZATION',
}

# Event type keywords (Dutch) - only types that map to enum
# Each value is a list of regex patterns matched against LOWERCASED text,
# so the patterns themselves are written in lowercase.
EVENT_KEYWORDS: dict[str, list[str]] = {
    'founding': [
        r'opgericht\s+(?:op|in)',
        r'gesticht\s+(?:op|in)',
        r'werd\s+opgericht',
        r'is\s+opgericht',
        r'ontstaan\s+in',
        r'opgericht\s+op\s+\d',
    ],
    'merger': [
        r'fusie\s+(?:van|tussen|met)',
        r'fuseerde\s+met',
        r'samengevoegd\s+met',
        r'ging\s+(?:ook\s+)?(?:hier\s+)?in\s+op',
        r'ging\s+op\s+in',
        r'voortgekomen\s+uit\s+een\s+fusie',
        r'ontstaan\s+uit\s+een\s+fusie',
    ],
    'relocation': [
        r'verhuisd\s+naar',
        r'verplaatst\s+naar',
        r'nieuwe\s+locatie',
        r'betrok\s+(?:een\s+)?nieuw\s+pand',
        r'gevestigd\s+(?:aan|in|op)',
    ],
    'expansion': [
        r'uitgebreid\s+(?:en|in)',
        r'verbouwd\s+in',
        r'nieuwbouw\s+in',
        r'gemoderniseerd',
        r'werd\s+(?:in\s+\d{4}\s+)?uitgebreid',
    ],
    'name_change': [
        r'hernoemd\s+(?:naar|tot)',
        r'nieuwe\s+naam',
        r'naam\s+gewijzigd',
        r'naam\s+veranderd',
    ],
    'dissolution': [
        r'opgeheven\s+in',
        r'gesloten\s+in',
        r'opgegaan\s+in',
        r'beëindigd\s+in',
    ],
    'split': [
        r'opgesplitst\s+in',
        r'verdeeld\s+in',
    ],
    'spin_off': [
        r'afgesplitst\s+(?:van|uit)',
        r'verzelfstandigd',
    ],
    'reduction': [
        r'ingekrompen',
        r'afgebouwd',
    ],
    'reorganization': [
        r'herstructurering',
        r'gereorganiseerd',
        r'reorganisatie',
    ],
}

# Date extraction patterns with precision detection
# Each tuple is (regex, date_type consumed by parse_dutch_date, precision label).
DATE_PATTERNS: list[tuple[str, str, str]] = [
    # Full date: "30 april 2005" or "op 30 april 2005" → day precision
    (r'(?:op\s+)?(\d{1,2})\s+(' + '|'.join(DUTCH_MONTHS.keys()) + r')\s+(\d{4})', 'full', 'day'),
    # Full date: "30-4-2005" or "30/4/2005" → day precision
    (r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})', 'numeric', 'day'),
    # Year with context: "in 1854", "sinds 2005", "vanaf 2006" → year precision
    (r'(?:in|sinds|vanaf|anno|per)\s+(\d{4})', 'year', 'year'),
    # Year in parentheses: "(2000)" → year precision
    (r'\((\d{4})\)', 'year', 'year'),
    # Approximate: "circa 1900", "rond 2000", "ongeveer 1980" → year precision, approximate=True
    (r'(?:circa|rond|ongeveer)\s+(\d{4})', 'circa', 'year'),
    # Year only after "werd" or "is": "werd in 1980" → year precision
    (r'werd\s+in\s+(\d{4})', 'year', 'year'),
    # Decade reference: "begin jaren '90", "eind jaren '80" → decade precision
    (r"(?:begin|eind|midden)\s+jaren\s+'?(\d{2})", 'decade', 'decade'),
]
|
|
|
|
# =============================================================================
|
|
# DATE PARSING
|
|
# =============================================================================
|
|
|
|
|
|
def parse_dutch_date(match: tuple, date_type: str) -> tuple[str, bool]:
    """
    Parse a Dutch date regex match into an ISO-8601 string.

    Args:
        match: The regex capture groups, shape depending on date_type:
            'full'    -> (day, month_name, year)
            'numeric' -> (day, month, year)
            'year', 'circa', 'decade' -> (value,)
        date_type: One of 'full', 'numeric', 'year', 'circa', 'decade'.

    Returns:
        Tuple of (iso_date_string, is_approximate). The date string is
        empty when the type is unknown or the calendar values are out of
        range (e.g. month 99), so callers can discard the match.
    """
    if date_type == 'full':
        day, month_name, year = match
        month = DUTCH_MONTHS.get(month_name.lower(), '01')
        # Reject impossible day-of-month values (regex allows 0-99).
        if not 1 <= int(day) <= 31:
            return "", False
        return f"{year}-{month}-{int(day):02d}", False
    elif date_type == 'numeric':
        day, month, year = match
        # Validate calendar ranges: "99-99-2005" would otherwise produce
        # "2005-99-99", which passes the year-only is_valid_year() check
        # and would be stored as a bogus event date.
        if not (1 <= int(month) <= 12 and 1 <= int(day) <= 31):
            return "", False
        return f"{year}-{int(month):02d}-{int(day):02d}", False
    elif date_type == 'year':
        year = match[0] if isinstance(match, tuple) else match
        return f"{year}", False
    elif date_type == 'circa':
        year = match[0] if isinstance(match, tuple) else match
        return f"{year}", True
    elif date_type == 'decade':
        # "jaren '90" → "1990"; two-digit decades above 20 are assumed to
        # be 20th century, otherwise 21st (so '20 → 2020).
        decade = match[0] if isinstance(match, tuple) else match
        century = '19' if int(decade) > 20 else '20'
        return f"{century}{decade}", True  # Decades are approximate
    return "", False
|
|
|
|
|
|
def is_valid_year(date_str: str) -> bool:
    """Return True when the leading 4-digit year is at or after MIN_YEAR.

    Accepts any ISO-style string whose first four characters are the year;
    strings that do not start with a parseable integer count as invalid.
    """
    try:
        return int(date_str[:4]) >= MIN_YEAR
    except (ValueError, IndexError):
        return False
|
|
|
|
|
|
def detect_date_precision(iso_date: str, date_type: str) -> str:
    """
    Classify how precise an extracted date is.

    Args:
        iso_date: The ISO-formatted date string produced by parse_dutch_date.
        date_type: The extraction type ('full', 'numeric', 'year', 'circa',
            'decade') that produced it.

    Returns:
        One of 'day', 'month', 'year', or 'decade'.
    """
    if date_type == 'decade':
        return 'decade'

    if date_type in ('full', 'numeric'):
        hyphens = iso_date.count('-')
        if len(iso_date) == 10 and hyphens == 2:
            return 'day'     # YYYY-MM-DD
        if len(iso_date) == 7 and hyphens == 1:
            return 'month'   # YYYY-MM

    # Everything else (including 'year' and 'circa') is year precision.
    return 'year'
|
|
|
|
|
|
# =============================================================================
|
|
# EVENT EXTRACTION
|
|
# =============================================================================
|
|
|
|
|
|
def find_closest_date(
    text: str,
    event_match_pos: int,
    search_window: int = 150
) -> tuple[str, str, bool, int] | None:
    """
    Locate the date mention nearest to an event keyword.

    Scans a window of ``search_window`` characters on either side of the
    keyword position and, across all DATE_PATTERNS, keeps the valid date
    whose match starts closest to the keyword.

    Args:
        text: Full text to search.
        event_match_pos: Character offset of the event keyword match.
        search_window: Characters to scan before/after the keyword.

    Returns:
        (iso_date, date_type, is_approximate, distance) for the nearest
        valid date, or None if no parseable date >= MIN_YEAR was found.
    """
    lo = max(0, event_match_pos - search_window)
    hi = min(len(text), event_match_pos + search_window)
    window = text.lower()[lo:hi]
    anchor = event_match_pos - lo  # keyword position relative to the window

    winner: tuple[str, str, bool, int] | None = None
    closest = float('inf')

    for pattern, dtype, _precision in DATE_PATTERNS:
        for hit in re.finditer(pattern, window):
            gap = abs(hit.start() - anchor)
            if gap >= closest:
                continue
            iso, approx = parse_dutch_date(hit.groups(), dtype)
            # Discard unparseable dates and pre-MIN_YEAR historical references;
            # an invalid closer date does not block a valid farther one.
            if iso and is_valid_year(iso):
                winner = (iso, dtype, approx, gap)
                closest = gap

    return winner
|
|
|
|
|
|
def extract_events_from_structured_text(text: str) -> list[dict]:
    """
    Extract events from structured text (bullet points, numbered lists).

    Handles patterns like:
    - Stadsarchief Deventer opgericht in 1838
    - Samengevoegd met Athenaeumbibliotheek in 1999

    Each line is matched independently, so the date found on a line is
    tightly coupled to the event keyword on that same line.

    Returns:
        List of event dicts with date correctly associated per line
    """
    events = []

    # Split on newlines and bullet markers
    # NOTE(review): without re.MULTILINE the '^' alternative only matches the
    # start of the whole string; mid-text bullets are caught by the '\s' branch.
    lines = re.split(r'\n|(?:^|\s)[-•*]\s+', text)

    for line in lines:
        line = line.strip()
        if not line or len(line) < 10:
            # Skip blanks and fragments too short to describe an event.
            continue

        line_lower = line.lower()

        # For each event type, check if this line mentions it
        for event_type, patterns in EVENT_KEYWORDS.items():
            for pattern in patterns:
                match = re.search(pattern, line_lower)
                if match:
                    # Store match position for use in lambda (avoids type checker issue)
                    match_pos = match.start()

                    # Find date within THIS line only (tight coupling)
                    for date_pattern, date_type, _ in DATE_PATTERNS:
                        date_matches = list(re.finditer(date_pattern, line_lower))
                        if date_matches:
                            # Prefer date closest to the event keyword
                            best_match = min(
                                date_matches,
                                key=lambda m: abs(m.start() - match_pos)
                            )
                            iso_date, is_approx = parse_dutch_date(
                                best_match.groups(), date_type
                            )

                            if iso_date and is_valid_year(iso_date):
                                date_precision = detect_date_precision(iso_date, date_type)

                                # Clean description from original line
                                # (original casing, whitespace collapsed, capped at 200 chars)
                                description = line.strip()
                                description = re.sub(r'\s+', ' ', description)
                                if len(description) > 200:
                                    description = description[:200] + '...'

                                events.append({
                                    'internal_type': event_type,
                                    'date': iso_date,
                                    'date_precision': date_precision,
                                    'description': description,
                                    'approximate': is_approx,
                                })
                                break  # Found date for this event
                    # Only the pattern loop is broken here: a single line may
                    # still yield one event per matching event TYPE.
                    break  # Found event type for this line

    return events
|
|
|
|
|
|
def extract_events_from_text(
    text: str,
    source_url: str | None = None
) -> list[dict]:
    """
    Extract historical events with dates from free text.

    Two-phase strategy:
      1. Structured text (bullet points / lists) is handled first, where the
         date-event association is unambiguous per line.
      2. Prose is then scanned, pairing each event keyword with its nearest
         date via proximity matching.

    Args:
        text: Text to search for events.
        source_url: Optional URL recorded on each prose-derived event.

    Returns:
        List of event dictionaries keyed by internal event type names.
    """
    events: list[dict] = []

    # Phase 1: structured text — clear per-line date/event coupling.
    if '\n' in text or re.search(r'[-•*]\s+', text):
        events.extend(extract_events_from_structured_text(text))

    # Phase 2: prose, with proximity-based date matching.
    lowered = text.lower()

    for kind, patterns in EVENT_KEYWORDS.items():
        for pattern in patterns:
            for hit in re.finditer(pattern, lowered):
                matched_text = lowered[hit.start():hit.end()]

                # Skip keyword hits already captured (same type, keyword text
                # contained in an existing event's description).
                duplicate = any(
                    prior['internal_type'] == kind
                    and matched_text in prior.get('description', '').lower()
                    for prior in events
                )
                if duplicate:
                    continue

                found = find_closest_date(text, hit.start(), search_window=150)
                if found is None:
                    continue
                iso_date, dtype, approx, _gap = found

                # Build the description from the ORIGINAL-case text around
                # the keyword: up to 150 chars starting at the keyword.
                ctx_lo = max(0, hit.start() - 50)
                ctx_hi = min(len(text), hit.end() + 150)
                snippet = text[ctx_lo:ctx_hi]
                rel_start = hit.start() - ctx_lo
                rel_end = min(rel_start + 150, len(snippet))
                desc = snippet[rel_start:rel_end].strip()
                desc = re.sub(r'\s+', ' ', desc)
                desc = desc.split('.')[0]  # First sentence
                if len(desc) > 200:
                    desc = desc[:200] + '...'

                record = {
                    'internal_type': kind,
                    'date': iso_date,
                    'date_precision': detect_date_precision(iso_date, dtype),
                    'description': desc,
                    'approximate': approx,
                }
                if source_url:
                    record['source_url'] = source_url

                events.append(record)

    return events
|
|
|
|
|
|
def deduplicate_events(events: list[dict]) -> list[dict]:
    """
    Drop events repeating an already-seen (date, type, approximate) triple.

    Keeps the first occurrence of each triple; input order is otherwise
    preserved.
    """
    kept: list[dict] = []
    seen: set[tuple] = set()

    for candidate in events:
        fingerprint = (
            candidate.get('date'),
            candidate.get('internal_type'),
            candidate.get('approximate', False),
        )
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        kept.append(candidate)

    return kept
|
|
|
|
|
|
# =============================================================================
|
|
# LINKUP JSON PARSING
|
|
# =============================================================================
|
|
|
|
|
|
def extract_institution_name_from_query(query: str) -> str | None:
|
|
"""
|
|
Extract institution name from linkup query string.
|
|
|
|
The query format is typically: "Institution Name" city opgericht OR gesticht...
|
|
"""
|
|
# Try to extract quoted name first
|
|
match = re.search(r'"([^"]+)"', query)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def is_source_relevant(source_name: str, source_url: str, institution_name: str | None) -> bool:
|
|
"""
|
|
Check if a source is relevant to the target institution.
|
|
"""
|
|
if not institution_name:
|
|
return True
|
|
|
|
inst_lower = institution_name.lower()
|
|
key_words = [w for w in inst_lower.split() if len(w) > 3]
|
|
|
|
source_lower = source_name.lower()
|
|
url_lower = source_url.lower()
|
|
|
|
for word in key_words:
|
|
if word in source_lower or word in url_lower:
|
|
return True
|
|
|
|
if 'wikipedia' in url_lower:
|
|
for word in key_words:
|
|
if word in source_lower:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def parse_linkup_json(json_path: Path, include_sources: bool = False) -> dict:
    """
    Parse one timeline source archive file into events plus provenance.

    Args:
        json_path: Path to the linkup JSON file.
        include_sources: When True, also mine the per-source snippets
            (noisier than the main answer text).

    Returns:
        Dict with an 'events' list and a 'provenance' metadata dict; both
        are empty when the file cannot be read or parsed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except (json.JSONDecodeError, FileNotFoundError) as exc:
        print(f" Warning: Could not parse {json_path}: {exc}")
        return {'events': [], 'provenance': {}}

    response = payload.get('api_response', {})
    query = payload.get('query', '')
    fetched_at = payload.get('fetch_timestamp', '')
    institution = extract_institution_name_from_query(query)

    # The answer text is the most reliable extraction target.
    answer = response.get('answer', '')
    events: list[dict] = extract_events_from_text(answer) if answer else []

    # Collect relevant source URLs for provenance.
    sources = response.get('sources', [])
    relevant_urls = []
    for src in sources:
        candidate_url = src.get('url', '')
        if candidate_url and is_source_relevant(src.get('name', ''), candidate_url, institution):
            relevant_urls.append(candidate_url)

    # Optionally mine the snippets as well (higher noise than the answer).
    if include_sources:
        for src in sources:
            snippet = src.get('snippet', '')
            snippet_url = src.get('url', '')
            if snippet and is_source_relevant(src.get('name', ''), snippet_url, institution):
                events.extend(extract_events_from_text(snippet, snippet_url))

    events = deduplicate_events(events)

    # Archive path relative to the 'web' root:
    #   data/custodian/web/NNNN/linkup/x.json → web/NNNN/linkup/x.json
    parts = json_path.parts
    if 'web' in parts:
        archive_path = '/'.join(parts[parts.index('web'):])
    else:
        archive_path = str(json_path)

    return {
        'events': events,
        'provenance': {
            'linkup_query': query,
            'linkup_answer': answer,
            'fetch_timestamp': fetched_at,
            'archive_path': archive_path,
            'source_urls': relevant_urls[:5],  # Limit to top 5 sources
        },
    }
|
|
|
|
|
|
# =============================================================================
|
|
# YAML UPDATE
|
|
# =============================================================================
|
|
|
|
|
|
def load_mapping(mapping_path: Path) -> dict[int, str]:
    """
    Load the entry-number → GHCID mapping file.

    Each non-empty line is expected to be '<entry_number> <ghcid>'.
    Malformed lines (missing field, non-numeric entry) are skipped, and
    only the FIRST occurrence of an entry number is kept.

    Args:
        mapping_path: Path to the whitespace-separated mapping file.

    Returns:
        Dict mapping entry number to GHCID string.
    """
    mapping: dict[int, str] = {}
    with open(mapping_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            fields = raw.split(' ', 1)
            if len(fields) != 2:
                continue
            try:
                number = int(fields[0])
            except ValueError:
                continue  # non-numeric entry column: skip the line
            # setdefault keeps the first occurrence, like the original check.
            mapping.setdefault(number, fields[1])
    return mapping
|
|
|
|
|
|
def convert_to_linkup_timeline_event(
    event: dict,
    provenance: dict,
    extraction_timestamp: str
) -> dict | None:
    """
    Map an internal event dict onto the CustodianTimelineEvent schema.

    Args:
        event: Internal event dict ('internal_type', 'date', etc.).
        provenance: Provenance metadata from parse_linkup_json.
        extraction_timestamp: ISO timestamp of this extraction run.

    Returns:
        A CustodianTimelineEvent-shaped dict, or None when the internal
        type has no OrganizationalChangeEventTypeEnum mapping (such
        events are deliberately dropped).
    """
    enum_value = EVENT_TYPE_MAP.get(event.get('internal_type', ''))
    if not enum_value:
        return None  # Skip events that don't map to enum

    record = {
        'event_type': enum_value,
        'event_date': event.get('date'),
        'date_precision': event.get('date_precision', 'year'),
        'approximate': event.get('approximate', False),
        'description': event.get('description', ''),
    }
    # Provenance and extraction metadata (TIER_4: regex-inferred data).
    record.update({
        'source_urls': provenance.get('source_urls', []),
        'linkup_query': provenance.get('linkup_query', ''),
        'linkup_answer': provenance.get('linkup_answer', ''),
        'fetch_timestamp': provenance.get('fetch_timestamp', ''),
        'archive_path': provenance.get('archive_path', ''),
        'extraction_method': 'linkup_answer_regex',
        'extraction_timestamp': extraction_timestamp,
        'data_tier': 'TIER_4_INFERRED',
    })
    return record
|
|
|
|
|
|
def update_yaml_timeline_enrichment(
    yaml_path: Path,
    events: list[dict],
    provenance: dict,
    extraction_timestamp: str,
    dry_run: bool = False
) -> bool:
    """
    Update a custodian YAML file with CustodianTimelineEvent records.

    Writes to: timeline_enrichment.timeline_events (NOT timespan.events)

    New events are deduplicated against existing ones by their
    (event_date, event_type) pair; the list is kept sorted by date.

    Args:
        yaml_path: Path to custodian YAML file
        events: List of internal event dicts
        provenance: Provenance dict from parse_linkup_json
        extraction_timestamp: ISO timestamp for extraction
        dry_run: If True, don't write changes

    Returns:
        True if file was updated (or would be, under dry_run)
    """
    if not yaml_path.exists():
        print(f" Warning: YAML file not found: {yaml_path}")
        return False

    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f" Warning: Could not parse YAML {yaml_path}: {e}")
        return False

    # An empty YAML file parses to None; treat it as an empty document.
    if data is None:
        data = {}

    # Initialize timeline_enrichment if not exists
    if 'timeline_enrichment' not in data:
        data['timeline_enrichment'] = {}

    timeline_enrichment = data['timeline_enrichment']

    # Initialize timeline_events array if not exists
    if 'timeline_events' not in timeline_enrichment:
        timeline_enrichment['timeline_events'] = []

    timeline_events = timeline_enrichment['timeline_events']

    # Get existing event keys to avoid duplicates
    # (an event already present if its date+type pair matches)
    existing_keys = {
        (e.get('event_date'), e.get('event_type'))
        for e in timeline_events
    }

    # Convert and add new events
    new_count = 0
    for event in events:
        linkup_event = convert_to_linkup_timeline_event(event, provenance, extraction_timestamp)

        if linkup_event is None:
            continue  # Skip events that don't map to enum

        key = (linkup_event.get('event_date'), linkup_event.get('event_type'))
        if key not in existing_keys:
            timeline_events.append(linkup_event)
            existing_keys.add(key)
            new_count += 1

    # Sort events by date (missing/None dates sort first via '' fallback)
    timeline_events.sort(key=lambda e: e.get('event_date') or '')

    # Only rewrite the file when something changed and this is not a dry
    # run; a dry run still returns True so callers can count would-be updates.
    if new_count > 0 and not dry_run:
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return new_count > 0
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
|
|
|
|
def main():
    """Main entry point.

    Loads the entry→GHCID mapping, then for each selected entry parses its
    linkup JSON archives, extracts and deduplicates events, and writes them
    into the matching custodian YAML file.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Extract timeline events from source archives with full provenance'
    )
    parser.add_argument('--dry-run', action='store_true', help='Do not write changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser.add_argument('--entry', type=int, help='Process specific entry number')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--include-sources', action='store_true',
                        help='Also extract from source snippets (higher noise)')
    args = parser.parse_args()

    # Paths
    # NOTE(review): hardcoded machine-specific base path — consider a
    # --base-path argument or environment variable for portability.
    base_path = Path('/Users/kempersc/apps/glam')
    mapping_path = base_path / 'data/custodian/web/_entry_to_ghcid.txt'
    web_path = base_path / 'data/custodian/web'
    custodian_path = base_path / 'data/custodian'

    # Extraction timestamp (when this script runs); shared by every event
    # written during this invocation.
    extraction_timestamp = datetime.now(timezone.utc).isoformat()

    # Load mapping
    print("Loading entry-to-GHCID mapping...")
    mapping = load_mapping(mapping_path)
    print(f" Loaded {len(mapping)} mappings")

    # Process entries
    processed = 0      # entries examined (that had linkup data)
    updated = 0        # YAML files actually changed
    total_events = 0   # events added across all updated files

    # --entry restricts to one entry number; otherwise all, in sorted order.
    entries_to_process = [args.entry] if args.entry else sorted(mapping.keys())
    if args.limit:
        entries_to_process = entries_to_process[:args.limit]

    print(f"\nProcessing {len(entries_to_process)} entries...")
    print(f"Extraction timestamp: {extraction_timestamp}")

    for entry_num in entries_to_process:
        ghcid = mapping.get(entry_num)
        if not ghcid:
            continue

        # Find linkup JSON files
        entry_dir = web_path / f"{entry_num:04d}" / 'linkup'
        if not entry_dir.exists():
            if args.verbose:
                print(f" Skipping entry {entry_num}: no linkup directory")
            continue

        # Find all linkup JSON files (founding, merger, etc.)
        json_files = list(entry_dir.glob('linkup_*.json'))
        if not json_files:
            if args.verbose:
                print(f" Skipping entry {entry_num}: no linkup JSON files")
            continue

        # Process all JSON files for this entry
        all_events = []
        combined_provenance = {
            'linkup_query': '',
            'linkup_answer': '',
            'fetch_timestamp': '',
            'archive_path': '',
            'source_urls': [],
        }

        for json_file in json_files:
            result = parse_linkup_json(json_file, include_sources=args.include_sources)
            events = result['events']
            provenance = result['provenance']

            # Use provenance from first file with data
            if provenance.get('linkup_answer') and not combined_provenance['linkup_answer']:
                combined_provenance = provenance

            # For multiple JSON files, keep the events but note they may have different provenance
            for event in events:
                event['_archive_path'] = provenance.get('archive_path', '')

            all_events.extend(events)

        all_events = deduplicate_events(all_events)

        if not all_events:
            if args.verbose:
                print(f" Entry {entry_num} ({ghcid}): no events extracted")
            processed += 1
            continue

        # Update YAML file
        yaml_file = custodian_path / f"{ghcid}.yaml"

        if args.verbose or args.dry_run:
            print(f"\n Entry {entry_num} ({ghcid}):")
            for event in all_events:
                internal_type = event.get('internal_type', '')
                enum_type = EVENT_TYPE_MAP.get(internal_type, 'UNKNOWN')
                approx = " ~" if event.get('approximate') else ""
                prec = event.get('date_precision', 'year')
                print(f" - {event['date']}{approx} [{prec}] {enum_type}: {event['description'][:50]}...")

        if update_yaml_timeline_enrichment(
            yaml_file, all_events, combined_provenance, extraction_timestamp, dry_run=args.dry_run
        ):
            updated += 1
            total_events += len(all_events)
            if not args.verbose:
                print(f" Updated {ghcid}: +{len(all_events)} events")

        processed += 1

    # Summary
    print(f"\n{'=' * 60}")
    print(f"Summary:")
    print(f" Entries processed: {processed}")
    print(f" YAML files updated: {updated}")
    print(f" Total events added: {total_events}")
    print(f" Output location: timeline_enrichment.timeline_events")
    print(f" Schema: CustodianTimelineEvent (TIER_4_INFERRED)")
    if args.dry_run:
        print(" (DRY RUN - no files modified)")


if __name__ == '__main__':
    main()
|