#!/usr/bin/env python3
|
|
"""
|
|
Extract typed entities from web archives using annotated patterns.
|
|
|
|
Loads patterns from dutch_web_patterns.yaml and processes web archive HTML
|
|
to extract entities with CH-Annotator types and relationship predicates.
|
|
|
|
This script:
|
|
1. Loads entity and discard patterns from dutch_web_patterns.yaml
|
|
2. Finds custodian files with web_enrichment.web_archives references
|
|
3. For each custodian, processes HTML from web archive mirror directories
|
|
4. Extracts text content from HTML
|
|
5. Matches against discard patterns first (filter out navigation, UI, etc.)
|
|
6. Matches against entity patterns (extract with types and relationships)
|
|
7. Applies capture groups to extract sub-entities
|
|
8. Generates relationship triples
|
|
9. Adds pattern_entity_claims section to custodian YAML files
|
|
|
|
Usage:
|
|
python scripts/extract_with_patterns.py [--dry-run] [--limit N] [--custodian GHCID]
|
|
python scripts/extract_with_patterns.py --verbose --limit 3
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import os
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from html.parser import HTMLParser
|
|
from io import StringIO
|
|
|
|
import yaml
|
|
|
|
|
|
# ============================================================================
# DUTCH STOPWORD FILTER
# ============================================================================

# Common Dutch words that should NOT be extracted as place names, organization
# names, etc.  These cause false positives when patterns like "gemeente (\w+)"
# match "gemeente op de straat".
DUTCH_STOPWORDS = {
    # Articles
    'de', 'het', 'een', 'der', 'des', 'den',
    # Prepositions
    'op', 'in', 'van', 'aan', 'te', 'tot', 'bij', 'met', 'voor', 'na', 'naar',
    'om', 'uit', 'over', 'onder', 'door', 'tegen', 'tussen', 'zonder', 'binnen',
    'buiten', 'langs', 'sinds', 'tijdens', 'vanaf', 'volgens', 'wegens',
    # Pronouns
    'ik', 'je', 'jij', 'u', 'hij', 'zij', 'ze', 'wij', 'we', 'jullie', 'hen', 'hun',
    'mij', 'jou', 'hem', 'haar', 'ons', 'die', 'dat', 'dit', 'deze', 'wat', 'wie',
    'welke', 'welk', 'waar', 'wanneer', 'waarom', 'hoe', 'er', 'hier', 'daar',
    # Common verbs (conjugated forms that might appear after "gemeente", etc.)
    'is', 'zijn', 'was', 'waren', 'ben', 'bent', 'geweest', 'wordt', 'worden',
    'werd', 'werden', 'heeft', 'hebben', 'had', 'hadden', 'gehad', 'kan', 'kunnen',
    'kon', 'konden', 'gekund', 'mag', 'mogen', 'mocht', 'mochten', 'moet', 'moeten',
    'moest', 'moesten', 'zal', 'zullen', 'zou', 'zouden', 'wil', 'willen', 'wilde',
    'wilden', 'gewild', 'zien', 'ziet', 'zag', 'zagen', 'gezien', 'gaan', 'gaat',
    'ging', 'gingen', 'gegaan', 'komen', 'komt', 'kwam', 'kwamen', 'gekomen',
    'doen', 'doet', 'deed', 'deden', 'gedaan', 'maken', 'maakt', 'maakte', 'maakten',
    'gemaakt', 'zeggen', 'zegt', 'zei', 'zeiden', 'gezegd', 'staan', 'staat', 'stond',
    'stonden', 'gestaan', 'liggen', 'ligt', 'lag', 'lagen', 'gelegen', 'woonde', 'woont',
    # Common adjectives/adverbs
    'ook', 'nog', 'al', 'wel', 'niet', 'geen', 'meer', 'veel', 'weinig', 'erg',
    'heel', 'zeer', 'zo', 'nu', 'dan', 'toen', 'weer', 'vaak', 'altijd', 'nooit',
    'soms', 'reeds', 'steeds', 'pas', 'net', 'juist', 'precies', 'ongeveer',
    # Conjunctions
    'en', 'of', 'maar', 'want', 'dus', 'omdat', 'als', 'indien', 'hoewel', 'tenzij',
    'totdat', 'terwijl', 'voordat', 'nadat', 'zodat', 'opdat', 'mits', 'ofschoon',
    # Common nouns that aren't places
    'eigendom', 'bezit', 'gebied', 'plaats', 'deel', 'kant', 'zijde', 'wijze',
    'manier', 'vorm', 'soort', 'type', 'naam', 'titel', 'datum', 'tijd', 'jaar',
    'dag', 'week', 'maand', 'uur', 'minuut', 'eeuw', 'periode', 'men', 'iemand',
    # Short words that are likely false positives
    'aa', 'ab', 'ad', 'af', 'ag', 'ah', 'al', 'am', 'as', 'at', 'au', 'be', 'bi',
    'bo', 'bu', 'ca', 'co', 'da', 'do', 'du', 'ed', 'ee', 'eg', 'ei', 'el', 'em',
    'ex', 'fa', 'fe', 'fi', 'fo', 'fu', 'ga', 'ge', 'go', 'gu', 'ha', 'he', 'hi',
    'ho', 'hu', 'id', 'ie', 'ig', 'ij', 'il', 'im', 'io', 'ir', 'ja', 'je', 'jo',
    'ju', 'ka', 'ke', 'ki', 'ko', 'ku', 'la', 'le', 'li', 'lo', 'lu', 'ma', 'me',
    'mi', 'mo', 'mu', 'na', 'ne', 'ni', 'no', 'nu', 'ob', 'od', 'oe', 'og', 'oh',
    'ok', 'ol', 'om', 'on', 'oo', 'op', 'or', 'os', 'ot', 'ou', 'ow', 'oz', 'pa',
    'pe', 'pi', 'po', 'pu', 'ra', 're', 'ri', 'ro', 'ru', 'sa', 'se', 'si', 'so',
    'su', 'ta', 'te', 'ti', 'to', 'tu', 'ub', 'ue', 'ug', 'ui', 'uk', 'ul', 'um',
    'un', 'up', 'ur', 'us', 'ut', 'uu', 'va', 've', 'vi', 'vo', 'vu', 'wa', 'we',
    'wi', 'wo', 'wu', 'za', 'ze', 'zi', 'zo', 'zu',
}

# Generic organization words that by themselves don't make a valid entity,
# e.g. "de Stichting" without a name is too generic.
GENERIC_ORG_WORDS = {
    'stichting', 'vereniging', 'genootschap', 'organisatie', 'instelling',
    'instituut', 'centrum', 'bureau', 'dienst', 'raad', 'commissie',
    'archief', 'museum', 'bibliotheek', 'collectie', 'fonds',
}

# Entity types whose capture groups should be validated against stopwords.
# These are patterns where captured groups are expected to be proper nouns
# (places, names).
STOPWORD_FILTERED_ENTITY_TYPES = {
    'GRP.GOV',      # Government - municipality names should be proper nouns
    'GRP.GOV.MUN',  # Municipality
    'GRP.GOV.PRO',  # Province
    'GRP.HER',      # Heritage institutions - name parts should be proper nouns
    'GRP.HER.MUS',  # Museum
    'GRP.HER.ARC',  # Archive
    'GRP.HER.LIB',  # Library
    'GRP.ORG',      # Organizations - name parts should be proper nouns
    'TOP.SET',      # Settlement names
    'TOP.BLD',      # Building names
    'AGT.PER',      # Person names
}


def is_stopword_match(entity_result: dict) -> bool:
    """
    Check if an entity match is actually a false positive due to stopwords.

    Args:
        entity_result: Match dict as produced by ``PatternMatcher.match_entity``.
            Keys read here: 'entity_type', 'entity_subtype', 'captures',
            'matched_text'.

    Returns:
        True if the match should be REJECTED (is a false positive).
    """
    entity_type = entity_result.get('entity_type') or ''
    entity_subtype = entity_result.get('entity_subtype') or ''

    # Only proper-noun-bearing entity types are subject to stopword filtering.
    should_filter = (
        entity_type in STOPWORD_FILTERED_ENTITY_TYPES or
        entity_subtype in STOPWORD_FILTERED_ENTITY_TYPES
    )

    if not should_filter:
        return False

    # Check capture groups for stopwords.
    captures = entity_result.get('captures', {})
    # Fix: the capture index key was bound but never used; iterate values only.
    for cap in captures.values():
        value = cap.get('value', '').lower().strip()
        cap_type = cap.get('type', '')

        # Check if this capture group type should be validated.
        if cap_type in STOPWORD_FILTERED_ENTITY_TYPES or entity_type in STOPWORD_FILTERED_ENTITY_TYPES:
            if value in DUTCH_STOPWORDS:
                return True  # Reject this match

            # Also reject if captured value is too short (less than 3 chars).
            # NOTE(review): no allow-list of short Dutch place abbreviations
            # exists yet, so every sub-3-character capture is rejected.
            if len(value) < 3:
                return True

    # Check the matched text itself if no captures.
    if not captures:
        # Extract the last word (often the "name" part) from matched text.
        matched = entity_result.get('matched_text', '')
        words = matched.lower().split()
        if words:
            last_word = words[-1]
            if last_word in DUTCH_STOPWORDS:
                return True

    # Check for generic organization matches like "de Stichting" (without a real name).
    matched_text = entity_result.get('matched_text', '').lower().strip()
    words = matched_text.split()

    # Pattern: article + generic org word (e.g., "de stichting", "het archief").
    if len(words) == 2:
        if words[0] in {'de', 'het', 'een'} and words[1] in GENERIC_ORG_WORDS:
            return True  # Too generic, reject

    return False
|
|
|
|
|
|
# ============================================================================
|
|
# YAML HANDLING
|
|
# ============================================================================
|
|
|
|
class CustomDumper(yaml.SafeDumper):
    """SafeDumper subclass serving as a hook point for custom representers."""
|
|
|
|
|
|
def str_representer(dumper, data):
    """Represent multiline strings in YAML literal block style ('|').

    Single-line strings are emitted with the default (plain) style.
    """
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)
|
|
|
|
|
|
# Register the multiline-aware string representer on the custom dumper.
CustomDumper.add_representer(str, str_representer)
|
|
|
|
|
|
def load_yaml(filepath: Path) -> dict:
    """Parse *filepath* as YAML; an empty document yields an empty dict."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        loaded = yaml.safe_load(handle)
    return loaded or {}
|
|
|
|
|
|
def save_yaml(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* using the project's CustomDumper settings."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            Dumper=CustomDumper,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )
|
|
|
|
|
|
# ============================================================================
|
|
# HTML TEXT EXTRACTION
|
|
# ============================================================================
|
|
|
|
class MLStripper(HTMLParser):
    """HTML parser that accumulates only visible text content.

    Character data inside <script>, <style>, and <noscript> elements is
    suppressed; everything else is appended to an internal buffer which
    callers drain via ``get_data()``.
    """

    # Elements whose text content must never appear in the extracted text.
    _SKIPPED_TAGS = ('script', 'style', 'noscript')

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()
        self.in_script_or_style = False

    def handle_starttag(self, tag, attrs):
        # Entering a non-visible element: suppress subsequent character data.
        if tag in self._SKIPPED_TAGS:
            self.in_script_or_style = True

    def handle_endtag(self, tag):
        # Leaving a non-visible element: resume collecting text.
        if tag in self._SKIPPED_TAGS:
            self.in_script_or_style = False

    def handle_data(self, data):
        if not self.in_script_or_style:
            self.text.write(data)

    def get_data(self):
        """Return all visible text collected so far."""
        return self.text.getvalue()
|
|
|
|
|
|
def strip_tags(html: str) -> str:
    """Return the visible text of *html* with all markup removed.

    Parses with MLStripper; if the parser raises on malformed input,
    degrades to a crude regex substitution of anything tag-shaped.
    """
    stripper = MLStripper()
    try:
        stripper.feed(html)
        return stripper.get_data()
    except Exception:
        # Fallback: simple regex-based stripping
        return re.sub(r'<[^>]+>', ' ', html)
|
|
|
|
|
|
def extract_text_from_html(html_content: str) -> list[str]:
    """
    Extract meaningful text segments from HTML content.

    Returns one cleaned string per non-trivial line of the stripped text.
    Lines shorter than 3 characters, or consisting entirely of whitespace
    and punctuation, are dropped.
    """
    stripped = strip_tags(html_content)

    segments: list[str] = []
    for raw_line in stripped.split('\n'):
        # Collapse runs of whitespace into single spaces.
        cleaned = ' '.join(raw_line.split())

        # Too short to hold an entity mention.
        if len(cleaned) < 3:
            continue

        # Nothing but whitespace/punctuation.
        if re.match(r'^[\s\W]+$', cleaned):
            continue

        segments.append(cleaned)

    return segments
|
|
|
|
|
|
# ============================================================================
|
|
# PATTERN LOADING AND COMPILATION
|
|
# ============================================================================
|
|
|
|
class PatternMatcher:
    """
    Loads and compiles patterns from dutch_web_patterns.yaml.
    Provides matching against discard and entity patterns.

    Discard patterns filter out navigation/UI noise; entity patterns carry
    CH-Annotator types, capture-group configs, and relationship templates.
    Pattern order in the YAML file is the matching priority order.
    """

    def __init__(self, pattern_file: Path, strip_anchors: bool = True):
        """Load and compile patterns from YAML file.

        Args:
            pattern_file: Path to YAML pattern file
            strip_anchors: If True, remove ^ and $ anchors from entity patterns
                to enable substring matching (default: True)
        """
        self.pattern_file = pattern_file
        self.raw_data = load_yaml(pattern_file)
        self.strip_anchors = strip_anchors

        # Compiled patterns
        self.discard_patterns: list[tuple[re.Pattern, str]] = []  # (regex, reason)
        self.entity_patterns: list[dict] = []  # Full pattern config with compiled regex

        self._compile_patterns()

    def _strip_regex_anchors(self, pattern: str) -> str:
        r"""Remove ^ and $ anchors from a regex pattern for substring matching.

        Replaces anchors with word boundary markers (\b) to prevent false
        positives from partial word matches. For example:
            ^gemeente\s+(\w+)$  becomes  \bgemeente\s+(\w+)\b

        This allows the pattern to match "... gemeente Assen ..."
        but not "... gemeentebestuur ..." or "... in gemeente op de ...".

        Preserves trailing anchors that are escaped (\$).
        """
        if not self.strip_anchors:
            return pattern

        # Replace leading ^ with word boundary \b (but not \^)
        if pattern.startswith('^'):
            pattern = r'\b' + pattern[1:]
        else:
            # Add word boundary at start if not present
            if not pattern.startswith(r'\b'):
                pattern = r'\b' + pattern

        # Replace trailing $ with word boundary \b (but not \$)
        if pattern.endswith('$') and not pattern.endswith('\\$'):
            pattern = pattern[:-1] + r'\b'
        else:
            # Add word boundary at end if not present
            if not pattern.endswith(r'\b'):
                pattern = pattern + r'\b'

        return pattern

    def _compile_patterns(self) -> None:
        """Compile all regex patterns for efficient matching."""
        # Compile discard patterns: discard_patterns -> {category: {patterns: [...]}}
        discard_section = self.raw_data.get('discard_patterns', {})
        for category, cat_data in discard_section.items():
            if isinstance(cat_data, dict) and 'patterns' in cat_data:
                for pat_item in cat_data['patterns']:
                    pattern_str = pat_item.get('pattern', '')
                    # Fall back to the category name when no explicit reason is given.
                    reason = pat_item.get('discard_reason', category)
                    if pattern_str:
                        try:
                            compiled = re.compile(pattern_str, re.IGNORECASE)
                            self.discard_patterns.append((compiled, reason))
                        except re.error as e:
                            # Invalid patterns are skipped with a warning, not fatal.
                            print(f"Warning: Invalid discard pattern '{pattern_str}': {e}")

        # Compile entity patterns (arbitrarily nested category tree)
        entity_section = self.raw_data.get('entity_patterns', {})
        self._compile_entity_section(entity_section)

    def _compile_entity_section(self, section: dict, parent_path: str = "") -> None:
        """Recursively compile entity patterns from nested structure."""
        for key, value in section.items():
            if isinstance(value, dict):
                if 'patterns' in value:
                    # This is a pattern category with actual patterns
                    for pat_item in value['patterns']:
                        pattern_str = pat_item.get('pattern', '')
                        if pattern_str:
                            try:
                                # Strip anchors for substring matching
                                pattern_for_compile = self._strip_regex_anchors(pattern_str)
                                compiled = re.compile(pattern_for_compile, re.IGNORECASE)
                                entity_config = {
                                    'regex': compiled,
                                    'pattern_str': pattern_str,  # Keep original for logging
                                    'pattern_compiled': pattern_for_compile,  # Actual compiled pattern
                                    'category': f"{parent_path}/{key}" if parent_path else key,
                                    'entity_type': pat_item.get('entity_type'),
                                    'entity_subtype': pat_item.get('entity_subtype'),
                                    'label_template': pat_item.get('label_template'),
                                    'capture_groups': pat_item.get('capture_groups', {}),
                                    'relationships': pat_item.get('relationships', []),
                                    'description': pat_item.get('description', ''),
                                }
                                self.entity_patterns.append(entity_config)
                            except re.error as e:
                                print(f"Warning: Invalid entity pattern '{pattern_str}': {e}")
                else:
                    # Nested category, recurse
                    new_path = f"{parent_path}/{key}" if parent_path else key
                    self._compile_entity_section(value, new_path)

    def should_discard(self, text: str) -> tuple[bool, Optional[str]]:
        """
        Check if text matches any discard pattern.

        Returns:
            Tuple of (should_discard, reason or None)
        """
        # Patterns are compiled IGNORECASE; lowering here additionally normalizes
        # the text that character classes in the patterns see.
        text_lower = text.lower().strip()

        for regex, reason in self.discard_patterns:
            if regex.search(text_lower):
                return True, reason

        return False, None

    def match_entity(self, text: str) -> Optional[dict]:
        """
        Match text against entity patterns.

        Uses search() instead of match() to find patterns anywhere in the text,
        not just at the beginning. This dramatically improves entity yield.
        First matching pattern wins (YAML order is priority order); matches
        rejected by the stopword filter fall through to later patterns.

        Returns:
            Dict with match info including entity_type, captures, relationships
            or None if no match
        """
        text_stripped = text.strip()

        for pattern in self.entity_patterns:
            match = pattern['regex'].search(text_stripped)
            if match:
                # Use the matched substring, not the full text
                matched_substring = match.group(0)

                result = {
                    'matched_text': matched_substring,
                    # Keep surrounding text only when the match is a proper substring.
                    'full_context': text_stripped if text_stripped != matched_substring else None,
                    'entity_type': pattern['entity_type'],
                    'entity_subtype': pattern['entity_subtype'],
                    'pattern_str': pattern['pattern_str'],
                    'category': pattern['category'],
                    'description': pattern['description'],
                    'captures': {},
                    'relationships': [],
                }

                # Extract capture groups (YAML keys are group numbers as strings)
                if pattern['capture_groups']:
                    for group_num, group_config in pattern['capture_groups'].items():
                        try:
                            group_idx = int(group_num)
                            if group_idx <= len(match.groups()):
                                captured_value = match.group(group_idx)
                                if captured_value:
                                    result['captures'][group_idx] = {
                                        'value': captured_value,
                                        'type': group_config.get('type'),
                                        'role': group_config.get('role'),
                                    }
                        except (ValueError, IndexError):
                            # Non-numeric keys / out-of-range groups are ignored.
                            pass

                # Generate relationships - use matched_substring as the entity
                if pattern['relationships']:
                    for rel in pattern['relationships']:
                        relationship = {
                            'predicate': rel.get('predicate'),
                            'subject': self._resolve_reference(rel.get('subject'), matched_substring, result['captures']),
                            'object': self._resolve_reference(rel.get('object'), matched_substring, result['captures']),
                            'confidence': rel.get('confidence', 0.8),
                        }

                        # Add type info if available
                        if rel.get('subject_type'):
                            relationship['subject_type'] = rel['subject_type']
                        if rel.get('object_type'):
                            relationship['object_type'] = rel['object_type']

                        result['relationships'].append(relationship)

                # Apply label template if exists ({N} placeholders -> capture N's value)
                if pattern['label_template'] and result['captures']:
                    try:
                        label = pattern['label_template']
                        for idx, cap in result['captures'].items():
                            label = label.replace(f'{{{idx}}}', cap['value'])
                        result['entity_label'] = label
                    except Exception:
                        result['entity_label'] = matched_substring
                else:
                    result['entity_label'] = matched_substring

                # Filter out false positives caused by stopwords in capture groups
                if is_stopword_match(result):
                    continue  # Try next pattern instead of returning this match

                return result

        return None

    def _resolve_reference(self, ref: Any, matched_text: str, captures: dict) -> Optional[str]:
        """Resolve a reference in a relationship definition.

        '$0' -> the full matched text; '$N' -> value of capture group N;
        'CUSTODIAN' -> placeholder substituted downstream; anything else
        is returned stringified as a literal.
        """
        if ref is None:
            return None
        if ref == '$0':
            return matched_text
        if isinstance(ref, str) and ref.startswith('$'):
            try:
                idx = int(ref[1:])
                if idx in captures:
                    return captures[idx]['value']
            except ValueError:
                # '$foo' with a non-numeric suffix falls through to str(ref).
                pass
        if ref == 'CUSTODIAN':
            return 'CUSTODIAN'  # Placeholder for the custodian being processed
        return str(ref)
|
|
|
|
|
|
# ============================================================================
|
|
# CUSTODIAN FILE PROCESSING
|
|
# ============================================================================
|
|
|
|
def find_html_files(archive_dir: Path) -> list[Path]:
    """Collect every ``*.html`` file under the archive's mirror/ and pages/ trees.

    Files from ``mirror/`` are listed before files from ``pages/``; either
    subdirectory may be absent.
    """
    found: list[Path] = []
    for subdir in ('mirror', 'pages'):
        candidate = archive_dir / subdir
        if candidate.exists():
            found.extend(candidate.rglob('*.html'))
    return found
|
|
|
|
|
|
def process_custodian_file(
    custodian_path: Path,
    base_path: Path,
    matcher: PatternMatcher,
    dry_run: bool = False,
    verbose: bool = False,
    show_entities: bool = False,
    show_unmatched: int = 0,
    min_length: int = 10
) -> dict:
    """
    Process a single custodian file to extract and add pattern-based entities.

    Args:
        custodian_path: Path to custodian YAML file
        base_path: Base path for web archives (data/custodian/)
        matcher: Compiled pattern matcher
        dry_run: If True, don't write changes
        verbose: If True, show detailed output
        show_entities: If True, print each entity as it's found
        show_unmatched: Number of unmatched segments to show (for debugging)
        min_length: Minimum text segment length to analyze

    Returns:
        Dict with processing stats: file, web_archives_found,
        html_files_processed, text_segments_analyzed, segments_discarded,
        entities_extracted, status ('skipped' | 'error' | 'no_web_archives' |
        'no_entities_found' | 'updated' | 'would_update'), error, and
        optionally unmatched_samples.
    """
    stats = {
        'file': str(custodian_path.name),
        'web_archives_found': 0,
        'html_files_processed': 0,
        'text_segments_analyzed': 0,
        'segments_discarded': 0,
        'entities_extracted': 0,
        'status': 'skipped',
        'error': None,
    }

    # Collect unmatched segments for debugging
    unmatched_samples = []

    try:
        custodian_data = load_yaml(custodian_path)
    except Exception as e:
        stats['status'] = 'error'
        stats['error'] = f"Failed to load YAML: {e}"
        return stats

    # Check for web_enrichment section
    web_enrichment = custodian_data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])

    if not web_archives:
        stats['status'] = 'no_web_archives'
        return stats

    stats['web_archives_found'] = len(web_archives)

    all_claims = []
    # Per-reason discard tally; currently only accumulated, not reported.
    discard_counts = {}

    for archive in web_archives:
        archive_dir_str = archive.get('directory', '')
        if not archive_dir_str:
            continue

        # Archive directories are stored relative to the custodian base path.
        archive_dir = base_path / archive_dir_str
        if not archive_dir.exists():
            continue

        html_files = find_html_files(archive_dir)

        for html_file in html_files:
            stats['html_files_processed'] += 1

            try:
                # errors='replace' so undecodable bytes don't abort the file.
                with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
                    html_content = f.read()
            except Exception as e:
                if verbose:
                    print(f" Warning: Could not read {html_file}: {e}")
                continue

            # Extract text segments
            text_segments = extract_text_from_html(html_content)

            for segment in text_segments:
                # Skip segments that are too short
                if len(segment) < min_length:
                    continue

                stats['text_segments_analyzed'] += 1

                # First check discard patterns (navigation, UI chrome, etc.)
                should_discard, discard_reason = matcher.should_discard(segment)
                if should_discard:
                    stats['segments_discarded'] += 1
                    discard_counts[discard_reason] = discard_counts.get(discard_reason, 0) + 1
                    continue

                # Try to match entity patterns
                entity_match = matcher.match_entity(segment)
                if entity_match:
                    stats['entities_extracted'] += 1

                    # Build claim record
                    claim = {
                        'entity': entity_match['entity_label'],
                        'matched_text': entity_match['matched_text'],
                        'entity_type': entity_match['entity_type'],
                    }

                    if entity_match.get('entity_subtype'):
                        claim['entity_subtype'] = entity_match['entity_subtype']

                    claim['matched_pattern'] = entity_match['pattern_str']
                    claim['pattern_category'] = entity_match['category']

                    # Add capture groups if any (keys stringified for YAML)
                    if entity_match['captures']:
                        claim['capture_groups'] = {
                            str(idx): cap for idx, cap in entity_match['captures'].items()
                        }

                    # Add relationships
                    if entity_match['relationships']:
                        claim['relationships'] = entity_match['relationships']

                    # Source file relative to custodian dir
                    try:
                        rel_path = html_file.relative_to(base_path)
                        claim['source_file'] = str(rel_path)
                    except ValueError:
                        # html_file lives outside base_path; keep the full path.
                        claim['source_file'] = str(html_file)

                    claim['confidence'] = 0.85  # Pattern-based extraction confidence

                    all_claims.append(claim)

                    # Show entity if flag is set
                    if show_entities:
                        print(f" → [{entity_match['entity_type']}] {entity_match['entity_label']}")
                else:
                    # Track unmatched segments for debugging
                    if show_unmatched > 0 and len(unmatched_samples) < show_unmatched:
                        # Only collect interesting segments (likely to contain entities):
                        # mid-length, mixed case, and not purely digits/punctuation.
                        if (len(segment) >= 15 and len(segment) <= 100 and
                                not segment.isupper() and
                                any(c.isupper() for c in segment[1:]) and
                                not re.match(r'^[\d\s\W]+$', segment)):
                            unmatched_samples.append(segment)

    # Add unmatched samples to stats for debugging
    if unmatched_samples:
        stats['unmatched_samples'] = unmatched_samples

    if not all_claims:
        stats['status'] = 'no_entities_found'
        return stats

    # Deduplicate claims by entity + type (first occurrence wins)
    seen = set()
    unique_claims = []
    for claim in all_claims:
        key = (claim['entity'], claim.get('entity_type', ''))
        if key not in seen:
            seen.add(key)
            unique_claims.append(claim)

    stats['entities_extracted'] = len(unique_claims)

    # Create pattern_entity_claims section
    pattern_entity_claims = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'pattern_based_extraction_v1',
        'pattern_file': 'dutch_web_patterns.yaml',
        'pattern_file_version': '1.0.0',
        'html_files_processed': stats['html_files_processed'],
        'text_segments_analyzed': stats['text_segments_analyzed'],
        'segments_discarded': stats['segments_discarded'],
        'entities_count': len(unique_claims),
        'claims': unique_claims,
    }

    # Add to custodian data (overwrites any previous extraction run)
    custodian_data['pattern_entity_claims'] = pattern_entity_claims

    if not dry_run:
        save_yaml(custodian_path, custodian_data)
        stats['status'] = 'updated'
    else:
        stats['status'] = 'would_update'

    return stats
|
|
|
|
|
|
def find_custodian_files_with_web_archives(custodian_dir: Path) -> list[Path]:
    """
    Find all custodian files that have web_enrichment.web_archives.

    Args:
        custodian_dir: Directory containing custodian YAML files

    Returns:
        Sorted list of paths to custodian files with web archives
    """
    matching: list[Path] = []

    for path in custodian_dir.glob("NL-*.yaml"):
        # Cheap substring probe instead of a full YAML parse.
        try:
            content = path.read_text(encoding='utf-8')
        except Exception:
            continue
        if 'web_archives:' in content:
            matching.append(path)

    return sorted(matching)
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
def main():
    """Command-line entry point.

    Parses arguments, loads the pattern file, selects custodian files, runs
    extraction on each, and prints a summary. Returns a process exit code
    (0 = success, 1 = setup error).
    """
    parser = argparse.ArgumentParser(
        description='Extract typed entities from web archives using annotated patterns'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of files to process'
    )
    parser.add_argument(
        '--custodian',
        type=str,
        default=None,
        help='Process only a specific custodian GHCID (e.g., NL-DR-ASS-A-DA)'
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files'
    )
    parser.add_argument(
        '--pattern-file',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/entity_annotation/modules/processing/dutch_web_patterns.yaml'),
        help='Path to pattern definition file'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output'
    )
    parser.add_argument(
        '--show-entities',
        action='store_true',
        help='Show each extracted entity as it is found'
    )
    parser.add_argument(
        '--show-unmatched',
        type=int,
        default=0,
        metavar='N',
        help='Show N sample unmatched text segments (for pattern development)'
    )
    parser.add_argument(
        '--min-length',
        type=int,
        default=10,
        help='Minimum text segment length to analyze (default: 10)'
    )

    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    # Archive directories in custodian files are resolved relative to the
    # custodian directory itself.
    base_path = custodian_dir

    # Load patterns
    print(f"Loading patterns from {args.pattern_file}...")
    try:
        matcher = PatternMatcher(args.pattern_file)
        print(f" Loaded {len(matcher.discard_patterns)} discard patterns")
        print(f" Loaded {len(matcher.entity_patterns)} entity patterns")
    except Exception as e:
        print(f"Error loading patterns: {e}")
        return 1

    # Find custodian files
    if args.custodian:
        # Process specific custodian
        specific_file = custodian_dir / f"{args.custodian}.yaml"
        if not specific_file.exists():
            print(f"Error: Custodian file not found: {specific_file}")
            return 1
        files = [specific_file]
        print(f"Processing specific custodian: {args.custodian}")
    else:
        print(f"Scanning for custodian files with web archives in {custodian_dir}...")
        files = find_custodian_files_with_web_archives(custodian_dir)
        print(f"Found {len(files)} custodian files with web_archives")

    if args.limit:
        files = files[:args.limit]
        print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n*** DRY RUN - No changes will be made ***\n")

    # Process statistics (aggregated across all custodian files)
    total_processed = 0
    total_updated = 0
    total_entities = 0
    total_html_files = 0
    total_segments = 0
    total_discarded = 0
    all_unmatched = []

    for filepath in files:
        stats = process_custodian_file(
            filepath, base_path, matcher,
            dry_run=args.dry_run,
            verbose=args.verbose,
            show_entities=args.show_entities,
            show_unmatched=args.show_unmatched,
            min_length=args.min_length
        )
        total_processed += 1

        # Collect unmatched samples
        if 'unmatched_samples' in stats:
            all_unmatched.extend(stats['unmatched_samples'])

        if stats['status'] in ('updated', 'would_update'):
            total_updated += 1
            total_entities += stats['entities_extracted']
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']

            if args.verbose or stats['entities_extracted'] > 0:
                msg = f"✓ {stats['file']}: {stats['entities_extracted']} entities"
                msg += f" ({stats['html_files_processed']} HTML files, {stats['segments_discarded']} discarded)"
                print(msg)

        elif stats['status'] == 'no_entities_found':
            # Still counts toward HTML/segment totals even though nothing matched.
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose:
                print(f"○ {stats['file']}: no entities found ({stats['html_files_processed']} HTML files)")

        elif args.verbose:
            if stats['status'] == 'error':
                print(f"✗ {stats['file']}: {stats['error']}")
            elif stats['status'] == 'no_web_archives':
                print(f"○ {stats['file']}: no web_archives section")

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_processed}")
    print(f"Files with entities: {total_updated}")
    print(f"Total HTML files: {total_html_files}")
    print(f"Text segments analyzed: {total_segments}")
    print(f"Segments discarded: {total_discarded}")
    print(f"Total entities found: {total_entities}")

    # Show unmatched samples if requested
    if args.show_unmatched > 0 and all_unmatched:
        print("\n" + "-" * 60)
        print(f"UNMATCHED SAMPLES (showing up to {args.show_unmatched}):")
        print("-" * 60)
        for i, sample in enumerate(all_unmatched[:args.show_unmatched], 1):
            # Truncate long samples to keep the report readable.
            print(f" {i}. {sample[:80]}{'...' if len(sample) > 80 else ''}")

    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # sys.exit() is the documented way to terminate with a status code; the
    # bare exit() builtin is injected by the site module and is not guaranteed
    # to exist (e.g. when running with `python -S`).
    import sys

    sys.exit(main())
|