glam/scripts/extract_with_patterns.py
2025-12-14 17:09:55 +01:00

919 lines
36 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Extract typed entities from web archives using annotated patterns.
Loads patterns from dutch_web_patterns.yaml and processes web archive HTML
to extract entities with CH-Annotator types and relationship predicates.
This script:
1. Loads entity and discard patterns from dutch_web_patterns.yaml
2. Finds custodian files with web_enrichment.web_archives references
3. For each custodian, processes HTML from web archive mirror directories
4. Extracts text content from HTML
5. Matches against discard patterns first (filter out navigation, UI, etc.)
6. Matches against entity patterns (extract with types and relationships)
7. Applies capture groups to extract sub-entities
8. Generates relationship triples
9. Adds pattern_entity_claims section to custodian YAML files
Usage:
python scripts/extract_with_patterns.py [--dry-run] [--limit N] [--custodian GHCID]
python scripts/extract_with_patterns.py --verbose --limit 3
"""
import argparse
import glob
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
from html.parser import HTMLParser
from io import StringIO
import yaml
# ============================================================================
# DUTCH STOPWORD FILTER
# ============================================================================
# Common Dutch words that should NOT be extracted as place names, organization
# names, etc. These cause false positives when patterns like "gemeente (\w+)"
# match "gemeente op de straat".
DUTCH_STOPWORDS = {
    # Articles
    'de', 'het', 'een', 'der', 'des', 'den',
    # Prepositions
    'op', 'in', 'van', 'aan', 'te', 'tot', 'bij', 'met', 'voor', 'na', 'naar',
    'om', 'uit', 'over', 'onder', 'door', 'tegen', 'tussen', 'zonder', 'binnen',
    'buiten', 'langs', 'sinds', 'tijdens', 'vanaf', 'volgens', 'wegens',
    # Pronouns
    'ik', 'je', 'jij', 'u', 'hij', 'zij', 'ze', 'wij', 'we', 'jullie', 'hen', 'hun',
    'mij', 'jou', 'hem', 'haar', 'ons', 'die', 'dat', 'dit', 'deze', 'wat', 'wie',
    'welke', 'welk', 'waar', 'wanneer', 'waarom', 'hoe', 'er', 'hier', 'daar',
    # Common verbs (conjugated forms that might appear after "gemeente", etc.)
    'is', 'zijn', 'was', 'waren', 'ben', 'bent', 'geweest', 'wordt', 'worden',
    'werd', 'werden', 'heeft', 'hebben', 'had', 'hadden', 'gehad', 'kan', 'kunnen',
    'kon', 'konden', 'gekund', 'mag', 'mogen', 'mocht', 'mochten', 'moet', 'moeten',
    'moest', 'moesten', 'zal', 'zullen', 'zou', 'zouden', 'wil', 'willen', 'wilde',
    'wilden', 'gewild', 'zien', 'ziet', 'zag', 'zagen', 'gezien', 'gaan', 'gaat',
    'ging', 'gingen', 'gegaan', 'komen', 'komt', 'kwam', 'kwamen', 'gekomen',
    'doen', 'doet', 'deed', 'deden', 'gedaan', 'maken', 'maakt', 'maakte', 'maakten',
    'gemaakt', 'zeggen', 'zegt', 'zei', 'zeiden', 'gezegd', 'staan', 'staat', 'stond',
    'stonden', 'gestaan', 'liggen', 'ligt', 'lag', 'lagen', 'gelegen', 'woonde', 'woont',
    # Common adjectives/adverbs
    'ook', 'nog', 'al', 'wel', 'niet', 'geen', 'meer', 'veel', 'weinig', 'erg',
    'heel', 'zeer', 'zo', 'nu', 'dan', 'toen', 'weer', 'vaak', 'altijd', 'nooit',
    'soms', 'reeds', 'steeds', 'pas', 'net', 'juist', 'precies', 'ongeveer',
    # Conjunctions
    'en', 'of', 'maar', 'want', 'dus', 'omdat', 'als', 'indien', 'hoewel', 'tenzij',
    'totdat', 'terwijl', 'voordat', 'nadat', 'zodat', 'opdat', 'mits', 'ofschoon',
    # Common nouns that aren't places
    'eigendom', 'bezit', 'gebied', 'plaats', 'deel', 'kant', 'zijde', 'wijze',
    'manier', 'vorm', 'soort', 'type', 'naam', 'titel', 'datum', 'tijd', 'jaar',
    'dag', 'week', 'maand', 'uur', 'minuut', 'eeuw', 'periode', 'men', 'iemand',
    # Short words that are likely false positives
    'aa', 'ab', 'ad', 'af', 'ag', 'ah', 'al', 'am', 'as', 'at', 'au', 'be', 'bi',
    'bo', 'bu', 'ca', 'co', 'da', 'do', 'du', 'ed', 'ee', 'eg', 'ei', 'el', 'em',
    'ex', 'fa', 'fe', 'fi', 'fo', 'fu', 'ga', 'ge', 'go', 'gu', 'ha', 'he', 'hi',
    'ho', 'hu', 'id', 'ie', 'ig', 'ij', 'il', 'im', 'io', 'ir', 'ja', 'je', 'jo',
    'ju', 'ka', 'ke', 'ki', 'ko', 'ku', 'la', 'le', 'li', 'lo', 'lu', 'ma', 'me',
    'mi', 'mo', 'mu', 'na', 'ne', 'ni', 'no', 'nu', 'ob', 'od', 'oe', 'og', 'oh',
    'ok', 'ol', 'om', 'on', 'oo', 'op', 'or', 'os', 'ot', 'ou', 'ow', 'oz', 'pa',
    'pe', 'pi', 'po', 'pu', 'ra', 're', 'ri', 'ro', 'ru', 'sa', 'se', 'si', 'so',
    'su', 'ta', 'te', 'ti', 'to', 'tu', 'ub', 'ue', 'ug', 'ui', 'uk', 'ul', 'um',
    'un', 'up', 'ur', 'us', 'ut', 'uu', 'va', 've', 'vi', 'vo', 'vu', 'wa', 'we',
    'wi', 'wo', 'wu', 'za', 'ze', 'zi', 'zo', 'zu',
}

# Generic organization words that by themselves don't make a valid entity,
# e.g. "de Stichting" without a name is too generic.
GENERIC_ORG_WORDS = {
    'stichting', 'vereniging', 'genootschap', 'organisatie', 'instelling',
    'instituut', 'centrum', 'bureau', 'dienst', 'raad', 'commissie',
    'archief', 'museum', 'bibliotheek', 'collectie', 'fonds',
}

# Entity types whose capture groups should be validated against stopwords.
# These are patterns where captured groups are expected to be proper nouns
# (places, names).
STOPWORD_FILTERED_ENTITY_TYPES = {
    'GRP.GOV',      # Government - municipality names should be proper nouns
    'GRP.GOV.MUN',  # Municipality
    'GRP.GOV.PRO',  # Province
    'GRP.HER',      # Heritage institutions - name parts should be proper nouns
    'GRP.HER.MUS',  # Museum
    'GRP.HER.ARC',  # Archive
    'GRP.HER.LIB',  # Library
    'GRP.ORG',      # Organizations - name parts should be proper nouns
    'TOP.SET',      # Settlement names
    'TOP.BLD',      # Building names
    'AGT.PER',      # Person names
}


def is_stopword_match(entity_result: dict) -> bool:
    """
    Check if an entity match is actually a false positive due to stopwords.

    Three rejection heuristics are applied in order:
      1. A capture group holds a Dutch stopword, or a value shorter than
         3 characters.
      2. With no capture groups, the last word of the matched text (usually
         the "name" slot of the pattern) is a stopword.
      3. The match is just an article plus a bare generic organization word
         (e.g. "de stichting"), which carries no identifying name.

    Only entity types/subtypes listed in STOPWORD_FILTERED_ENTITY_TYPES are
    subject to filtering; everything else is accepted unconditionally.

    Args:
        entity_result: Match dict as produced by PatternMatcher.match_entity,
            containing 'entity_type', 'entity_subtype', 'captures' and
            'matched_text'.

    Returns:
        True if the match should be REJECTED (is a false positive).
    """
    entity_type = entity_result.get('entity_type') or ''
    entity_subtype = entity_result.get('entity_subtype') or ''
    # Guard clause: only proper-noun-bearing entity types are filtered.
    if (entity_type not in STOPWORD_FILTERED_ENTITY_TYPES
            and entity_subtype not in STOPWORD_FILTERED_ENTITY_TYPES):
        return False
    # Heuristic 1: inspect each capture group value. The group indices are
    # irrelevant here, so iterate over the values only.
    captures = entity_result.get('captures', {})
    for cap in captures.values():
        value = cap.get('value', '').lower().strip()
        cap_type = cap.get('type', '')
        # Validate when either the capture's declared type or the parent
        # entity type is in the filtered set.
        if cap_type in STOPWORD_FILTERED_ENTITY_TYPES or entity_type in STOPWORD_FILTERED_ENTITY_TYPES:
            if value in DUTCH_STOPWORDS:
                return True  # Reject this match
            # Values under 3 characters are almost always noise.
            # TODO(review): an exemption for known Dutch place abbreviations
            # was promised in the original comment but never implemented.
            if len(value) < 3:
                return True
    # Heuristic 2: no captures -- check the last word of the matched text,
    # which is typically the "name" part of the pattern.
    if not captures:
        matched = entity_result.get('matched_text', '')
        words = matched.lower().split()
        if words and words[-1] in DUTCH_STOPWORDS:
            return True
    # Heuristic 3: "article + generic org word" matches like "de stichting"
    # or "het archief" name nothing specific and are rejected.
    words = entity_result.get('matched_text', '').lower().strip().split()
    if len(words) == 2 and words[0] in {'de', 'het', 'een'} and words[1] in GENERIC_ORG_WORDS:
        return True  # Too generic, reject
    return False
# ============================================================================
# YAML HANDLING
# ============================================================================
class CustomDumper(yaml.SafeDumper):
    """Custom YAML dumper to preserve formatting.

    Subclassing SafeDumper lets us register representers (see
    str_representer below) without mutating PyYAML's global dumper.
    """
    pass
def str_representer(dumper, data):
    """Represent strings with proper multiline handling.

    Strings containing a newline are emitted as YAML literal block
    scalars ('|'); everything else uses the default scalar style.
    """
    scalar_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=scalar_style)
CustomDumper.add_representer(str, str_representer)
def load_yaml(filepath: Path) -> dict:
    """Load a YAML file, returning an empty dict for an empty document."""
    content = filepath.read_text(encoding='utf-8')
    return yaml.safe_load(content) or {}
def save_yaml(filepath: Path, data: dict) -> None:
    """Serialize *data* to *filepath* as UTF-8 YAML via CustomDumper."""
    serialized = yaml.dump(data, Dumper=CustomDumper, allow_unicode=True,
                           default_flow_style=False, sort_keys=False, width=120)
    filepath.write_text(serialized, encoding='utf-8')
# ============================================================================
# HTML TEXT EXTRACTION
# ============================================================================
class MLStripper(HTMLParser):
    """HTML parser that accumulates only visible text content.

    Character data inside <script>, <style> and <noscript> elements is
    suppressed; everything else is appended to an internal buffer and can
    be retrieved with get_data().
    """

    # Elements whose text content is never user-visible.
    _INVISIBLE_TAGS = ('script', 'style', 'noscript')

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()
        self.in_script_or_style = False

    def handle_starttag(self, tag, attrs):
        # Entering an invisible element: stop collecting character data.
        if tag in self._INVISIBLE_TAGS:
            self.in_script_or_style = True

    def handle_endtag(self, tag):
        # Leaving an invisible element: resume collection.
        if tag in self._INVISIBLE_TAGS:
            self.in_script_or_style = False

    def handle_data(self, data):
        if not self.in_script_or_style:
            self.text.write(data)

    def get_data(self):
        """Return all visible text collected so far."""
        return self.text.getvalue()
def strip_tags(html: str) -> str:
    """Remove HTML tags and return text content.

    Uses MLStripper for proper parsing; if the parser raises on badly
    broken markup, falls back to a crude regex-based tag removal.
    """
    stripper = MLStripper()
    try:
        stripper.feed(html)
    except Exception:
        # Fallback: simple regex-based stripping
        return re.sub(r'<[^>]+>', ' ', html)
    return stripper.get_data()
def extract_text_from_html(html_content: str) -> list[str]:
    """
    Extract meaningful text segments from HTML content.

    Returns a list of whitespace-normalized text lines that could contain
    entity mentions, dropping very short lines and lines consisting only
    of whitespace/punctuation.
    """
    segments: list[str] = []
    for raw_line in strip_tags(html_content).split('\n'):
        # Collapse all runs of whitespace into single spaces.
        cleaned = ' '.join(raw_line.split())
        # Drop trivially short lines and pure punctuation/whitespace lines.
        if len(cleaned) < 3 or re.match(r'^[\s\W]+$', cleaned):
            continue
        segments.append(cleaned)
    return segments
# ============================================================================
# PATTERN LOADING AND COMPILATION
# ============================================================================
class PatternMatcher:
    """
    Loads and compiles patterns from dutch_web_patterns.yaml.
    Provides matching against discard and entity patterns.

    Two pattern families are kept:
      * discard patterns - text to ignore entirely (navigation, UI chrome)
      * entity patterns  - typed entity matchers with optional capture
        groups, relationship templates and label templates
    """

    def __init__(self, pattern_file: Path, strip_anchors: bool = True):
        """Load and compile patterns from YAML file.

        Args:
            pattern_file: Path to YAML pattern file
            strip_anchors: If True, remove ^ and $ anchors from entity patterns
                to enable substring matching (default: True)
        """
        self.pattern_file = pattern_file
        self.raw_data = load_yaml(pattern_file)
        self.strip_anchors = strip_anchors
        # Compiled patterns
        self.discard_patterns: list[tuple[re.Pattern, str]] = []  # (regex, reason)
        self.entity_patterns: list[dict] = []  # Full pattern config with compiled regex
        self._compile_patterns()

    def _strip_regex_anchors(self, pattern: str) -> str:
        r"""Remove ^ and $ anchors from a regex pattern for substring matching.

        Replaces anchors with word boundary markers (\b) to prevent false
        positives from partial word matches. For example:
          - ^gemeente\s+(\w+)$ becomes \bgemeente\s+(\w+)\b
        This allows the pattern to match "... gemeente Assen ..."
        but not "... gemeentebestuur ..." or "...in gemeente op de..."
        Preserves anchors that are escaped (\\^ or \\$).
        """
        if not self.strip_anchors:
            return pattern
        # Replace leading ^ with word boundary \b (but not \^ - an escaped
        # caret makes the string start with a backslash, so startswith fails)
        if pattern.startswith('^'):
            pattern = r'\b' + pattern[1:]
        else:
            # Add word boundary at start if not present
            if not pattern.startswith(r'\b'):
                pattern = r'\b' + pattern
        # Replace trailing $ with word boundary \b (but not \$)
        if pattern.endswith('$') and not pattern.endswith('\\$'):
            pattern = pattern[:-1] + r'\b'
        else:
            # Add word boundary at end if not present
            if not pattern.endswith(r'\b'):
                pattern = pattern + r'\b'
        return pattern

    def _compile_patterns(self):
        """Compile all regex patterns for efficient matching.

        Invalid regexes are reported on stdout and skipped rather than
        aborting the load, so one bad pattern cannot break the whole file.
        """
        # Compile discard patterns: flat {category: {patterns: [...]}} layout
        discard_section = self.raw_data.get('discard_patterns', {})
        for category, cat_data in discard_section.items():
            if isinstance(cat_data, dict) and 'patterns' in cat_data:
                for pat_item in cat_data['patterns']:
                    pattern_str = pat_item.get('pattern', '')
                    # Fall back to the category name when no explicit reason given
                    reason = pat_item.get('discard_reason', category)
                    if pattern_str:
                        try:
                            compiled = re.compile(pattern_str, re.IGNORECASE)
                            self.discard_patterns.append((compiled, reason))
                        except re.error as e:
                            print(f"Warning: Invalid discard pattern '{pattern_str}': {e}")
        # Compile entity patterns: arbitrarily nested categories, handled recursively
        entity_section = self.raw_data.get('entity_patterns', {})
        self._compile_entity_section(entity_section)

    def _compile_entity_section(self, section: dict, parent_path: str = ""):
        """Recursively compile entity patterns from nested structure.

        A node containing a 'patterns' key is a leaf category; any other
        dict-valued node is a sub-category to recurse into. parent_path
        accumulates the category path ("a/b/c") for provenance reporting.
        """
        for key, value in section.items():
            if isinstance(value, dict):
                if 'patterns' in value:
                    # This is a pattern category with actual patterns
                    for pat_item in value['patterns']:
                        pattern_str = pat_item.get('pattern', '')
                        if pattern_str:
                            try:
                                # Strip anchors for substring matching
                                pattern_for_compile = self._strip_regex_anchors(pattern_str)
                                compiled = re.compile(pattern_for_compile, re.IGNORECASE)
                                entity_config = {
                                    'regex': compiled,
                                    'pattern_str': pattern_str,  # Keep original for logging
                                    'pattern_compiled': pattern_for_compile,  # Actual compiled pattern
                                    'category': f"{parent_path}/{key}" if parent_path else key,
                                    'entity_type': pat_item.get('entity_type'),
                                    'entity_subtype': pat_item.get('entity_subtype'),
                                    'label_template': pat_item.get('label_template'),
                                    'capture_groups': pat_item.get('capture_groups', {}),
                                    'relationships': pat_item.get('relationships', []),
                                    'description': pat_item.get('description', ''),
                                }
                                self.entity_patterns.append(entity_config)
                            except re.error as e:
                                print(f"Warning: Invalid entity pattern '{pattern_str}': {e}")
                else:
                    # Nested category, recurse
                    new_path = f"{parent_path}/{key}" if parent_path else key
                    self._compile_entity_section(value, new_path)

    def should_discard(self, text: str) -> tuple[bool, Optional[str]]:
        """
        Check if text matches any discard pattern.

        Returns:
            Tuple of (should_discard, reason or None)
        """
        # Lowercased defensively; the patterns are compiled IGNORECASE anyway.
        text_lower = text.lower().strip()
        for regex, reason in self.discard_patterns:
            if regex.search(text_lower):
                return True, reason
        return False, None

    def match_entity(self, text: str) -> Optional[dict]:
        """
        Match text against entity patterns (first match in load order wins).

        Uses search() instead of match() to find patterns anywhere in the text,
        not just at the beginning. This dramatically improves entity yield.
        Matches rejected by is_stopword_match fall through to the next pattern.

        Returns:
            Dict with match info including entity_type, captures, relationships
            or None if no match
        """
        text_stripped = text.strip()
        for pattern in self.entity_patterns:
            match = pattern['regex'].search(text_stripped)
            if match:
                # Use the matched substring, not the full text
                matched_substring = match.group(0)
                result = {
                    'matched_text': matched_substring,
                    # Keep surrounding text only when it adds information
                    'full_context': text_stripped if text_stripped != matched_substring else None,
                    'entity_type': pattern['entity_type'],
                    'entity_subtype': pattern['entity_subtype'],
                    'pattern_str': pattern['pattern_str'],
                    'category': pattern['category'],
                    'description': pattern['description'],
                    'captures': {},
                    'relationships': [],
                }
                # Extract capture groups; YAML keys may be strings, hence int()
                if pattern['capture_groups']:
                    for group_num, group_config in pattern['capture_groups'].items():
                        try:
                            group_idx = int(group_num)
                            if group_idx <= len(match.groups()):
                                captured_value = match.group(group_idx)
                                if captured_value:
                                    result['captures'][group_idx] = {
                                        'value': captured_value,
                                        'type': group_config.get('type'),
                                        'role': group_config.get('role'),
                                    }
                        except (ValueError, IndexError):
                            # Non-numeric key or group index out of range: skip
                            pass
                # Generate relationships - use matched_substring as the entity
                if pattern['relationships']:
                    for rel in pattern['relationships']:
                        relationship = {
                            'predicate': rel.get('predicate'),
                            'subject': self._resolve_reference(rel.get('subject'), matched_substring, result['captures']),
                            'object': self._resolve_reference(rel.get('object'), matched_substring, result['captures']),
                            'confidence': rel.get('confidence', 0.8),
                        }
                        # Add type info if available
                        if rel.get('subject_type'):
                            relationship['subject_type'] = rel['subject_type']
                        if rel.get('object_type'):
                            relationship['object_type'] = rel['object_type']
                        result['relationships'].append(relationship)
                # Apply label template if exists: "{1}" placeholders are
                # replaced with the corresponding capture values
                if pattern['label_template'] and result['captures']:
                    try:
                        label = pattern['label_template']
                        for idx, cap in result['captures'].items():
                            label = label.replace(f'{{{idx}}}', cap['value'])
                        result['entity_label'] = label
                    except Exception:
                        result['entity_label'] = matched_substring
                else:
                    result['entity_label'] = matched_substring
                # Filter out false positives caused by stopwords in capture groups
                if is_stopword_match(result):
                    continue  # Try next pattern instead of returning this match
                return result
        return None

    def _resolve_reference(self, ref: Any, matched_text: str, captures: dict) -> Optional[str]:
        """Resolve a reference in relationship definition.

        '$0' -> the full matched text; '$N' -> value of capture group N;
        'CUSTODIAN' -> placeholder for the custodian being processed;
        anything else is returned as its string form.
        """
        if ref is None:
            return None
        if ref == '$0':
            return matched_text
        if isinstance(ref, str) and ref.startswith('$'):
            try:
                idx = int(ref[1:])
                if idx in captures:
                    return captures[idx]['value']
            except ValueError:
                # '$foo' etc.: not a group reference, fall through to str(ref)
                pass
        if ref == 'CUSTODIAN':
            return 'CUSTODIAN'  # Placeholder for the custodian being processed
        return str(ref)
# ============================================================================
# CUSTODIAN FILE PROCESSING
# ============================================================================
def find_html_files(archive_dir: Path) -> list[Path]:
    """Collect every *.html file under the archive's mirror/ and pages/ trees."""
    results: list[Path] = []
    # mirror/ is scanned first, then pages/, matching the original ordering.
    for subdir in ('mirror', 'pages'):
        root = archive_dir / subdir
        if root.exists():
            results.extend(root.rglob('*.html'))
    return results
def process_custodian_file(
    custodian_path: Path,
    base_path: Path,
    matcher: PatternMatcher,
    dry_run: bool = False,
    verbose: bool = False,
    show_entities: bool = False,
    show_unmatched: int = 0,
    min_length: int = 10
) -> dict:
    """
    Process a single custodian file to extract and add pattern-based entities.

    Pipeline: load the custodian YAML -> walk its web-archive HTML files ->
    split each file into text segments -> drop discard-pattern matches ->
    collect entity-pattern matches as "claims" -> deduplicate -> write a
    'pattern_entity_claims' section back into the YAML (unless dry_run).

    Args:
        custodian_path: Path to custodian YAML file
        base_path: Base path for web archives (data/custodian/)
        matcher: Compiled pattern matcher
        dry_run: If True, don't write changes
        verbose: If True, show detailed output
        show_entities: If True, print each entity as it's found
        show_unmatched: Number of unmatched segments to show (for debugging)
        min_length: Minimum text segment length to analyze

    Returns:
        Dict with processing stats; 'status' is one of 'error',
        'no_web_archives', 'no_entities_found', 'updated', 'would_update'.
    """
    stats = {
        'file': str(custodian_path.name),
        'web_archives_found': 0,
        'html_files_processed': 0,
        'text_segments_analyzed': 0,
        'segments_discarded': 0,
        'entities_extracted': 0,
        'status': 'skipped',
        'error': None,
    }
    # Collect unmatched segments for debugging
    unmatched_samples = []
    try:
        custodian_data = load_yaml(custodian_path)
    except Exception as e:
        stats['status'] = 'error'
        stats['error'] = f"Failed to load YAML: {e}"
        return stats
    # Check for web_enrichment section; nothing to do without archives
    web_enrichment = custodian_data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])
    if not web_archives:
        stats['status'] = 'no_web_archives'
        return stats
    stats['web_archives_found'] = len(web_archives)
    all_claims = []
    discard_counts = {}
    for archive in web_archives:
        archive_dir_str = archive.get('directory', '')
        if not archive_dir_str:
            continue
        archive_dir = base_path / archive_dir_str
        if not archive_dir.exists():
            continue
        html_files = find_html_files(archive_dir)
        for html_file in html_files:
            stats['html_files_processed'] += 1
            try:
                # errors='replace' so badly-encoded archives don't abort the run
                with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
                    html_content = f.read()
            except Exception as e:
                if verbose:
                    print(f" Warning: Could not read {html_file}: {e}")
                continue
            # Extract text segments
            text_segments = extract_text_from_html(html_content)
            for segment in text_segments:
                # Skip segments that are too short
                if len(segment) < min_length:
                    continue
                stats['text_segments_analyzed'] += 1
                # First check discard patterns (navigation, UI chrome, ...)
                should_discard, discard_reason = matcher.should_discard(segment)
                if should_discard:
                    stats['segments_discarded'] += 1
                    discard_counts[discard_reason] = discard_counts.get(discard_reason, 0) + 1
                    continue
                # Try to match entity patterns
                entity_match = matcher.match_entity(segment)
                if entity_match:
                    stats['entities_extracted'] += 1
                    # Build claim record
                    claim = {
                        'entity': entity_match['entity_label'],
                        'matched_text': entity_match['matched_text'],
                        'entity_type': entity_match['entity_type'],
                    }
                    if entity_match.get('entity_subtype'):
                        claim['entity_subtype'] = entity_match['entity_subtype']
                    claim['matched_pattern'] = entity_match['pattern_str']
                    claim['pattern_category'] = entity_match['category']
                    # Add capture groups if any (string keys for YAML round-tripping)
                    if entity_match['captures']:
                        claim['capture_groups'] = {
                            str(idx): cap for idx, cap in entity_match['captures'].items()
                        }
                    # Add relationships
                    if entity_match['relationships']:
                        claim['relationships'] = entity_match['relationships']
                    # Source file relative to custodian dir; absolute fallback
                    # when the file lies outside base_path
                    try:
                        rel_path = html_file.relative_to(base_path)
                        claim['source_file'] = str(rel_path)
                    except ValueError:
                        claim['source_file'] = str(html_file)
                    claim['confidence'] = 0.85  # Pattern-based extraction confidence
                    all_claims.append(claim)
                    # Show entity if flag is set
                    if show_entities:
                        print(f" → [{entity_match['entity_type']}] {entity_match['entity_label']}")
                else:
                    # Track unmatched segments for debugging
                    if show_unmatched > 0 and len(unmatched_samples) < show_unmatched:
                        # Only collect interesting segments (likely to contain entities):
                        # mid-length, mixed case with an uppercase after position 0,
                        # and not purely digits/punctuation
                        if (len(segment) >= 15 and len(segment) <= 100 and
                                not segment.isupper() and
                                any(c.isupper() for c in segment[1:]) and
                                not re.match(r'^[\d\s\W]+$', segment)):
                            unmatched_samples.append(segment)
    # Add unmatched samples to stats for debugging
    if unmatched_samples:
        stats['unmatched_samples'] = unmatched_samples
    if not all_claims:
        stats['status'] = 'no_entities_found'
        return stats
    # Deduplicate claims by entity + type (first occurrence wins)
    seen = set()
    unique_claims = []
    for claim in all_claims:
        key = (claim['entity'], claim.get('entity_type', ''))
        if key not in seen:
            seen.add(key)
            unique_claims.append(claim)
    stats['entities_extracted'] = len(unique_claims)
    # Create pattern_entity_claims section with provenance metadata
    pattern_entity_claims = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'pattern_based_extraction_v1',
        'pattern_file': 'dutch_web_patterns.yaml',
        'pattern_file_version': '1.0.0',
        'html_files_processed': stats['html_files_processed'],
        'text_segments_analyzed': stats['text_segments_analyzed'],
        'segments_discarded': stats['segments_discarded'],
        'entities_count': len(unique_claims),
        'claims': unique_claims,
    }
    # Add to custodian data (overwrites any previous extraction run)
    custodian_data['pattern_entity_claims'] = pattern_entity_claims
    if not dry_run:
        save_yaml(custodian_path, custodian_data)
        stats['status'] = 'updated'
    else:
        stats['status'] = 'would_update'
    return stats
def find_custodian_files_with_web_archives(custodian_dir: Path) -> list[Path]:
    """
    Find all custodian files that have web_enrichment.web_archives.

    Performs a cheap textual pre-filter ('web_archives:' substring) rather
    than parsing each YAML file; unreadable files are silently skipped.

    Args:
        custodian_dir: Directory containing custodian YAML files

    Returns:
        Sorted list of paths to custodian files with web archives
    """
    matches: list[Path] = []
    for path in custodian_dir.glob("NL-*.yaml"):
        try:
            if 'web_archives:' in path.read_text(encoding='utf-8'):
                matches.append(path)
        except Exception:
            continue
    return sorted(matches)
# ============================================================================
# MAIN
# ============================================================================
def main():
    """Command-line entry point.

    Parses arguments, loads the pattern file, selects custodian files
    (one specific GHCID or all files with web archives), runs
    process_custodian_file over each, and prints a summary.

    Returns:
        0 on success, 1 on pattern-load or custodian-lookup failure.
    """
    parser = argparse.ArgumentParser(
        description='Extract typed entities from web archives using annotated patterns'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of files to process'
    )
    parser.add_argument(
        '--custodian',
        type=str,
        default=None,
        help='Process only a specific custodian GHCID (e.g., NL-DR-ASS-A-DA)'
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files'
    )
    parser.add_argument(
        '--pattern-file',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/entity_annotation/modules/processing/dutch_web_patterns.yaml'),
        help='Path to pattern definition file'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output'
    )
    parser.add_argument(
        '--show-entities',
        action='store_true',
        help='Show each extracted entity as it is found'
    )
    parser.add_argument(
        '--show-unmatched',
        type=int,
        default=0,
        metavar='N',
        help='Show N sample unmatched text segments (for pattern development)'
    )
    parser.add_argument(
        '--min-length',
        type=int,
        default=10,
        help='Minimum text segment length to analyze (default: 10)'
    )
    args = parser.parse_args()
    custodian_dir = args.custodian_dir
    # Archive directories in custodian YAML are relative to the custodian dir.
    base_path = custodian_dir
    # Load patterns
    print(f"Loading patterns from {args.pattern_file}...")
    try:
        matcher = PatternMatcher(args.pattern_file)
        print(f" Loaded {len(matcher.discard_patterns)} discard patterns")
        print(f" Loaded {len(matcher.entity_patterns)} entity patterns")
    except Exception as e:
        print(f"Error loading patterns: {e}")
        return 1
    # Find custodian files
    if args.custodian:
        # Process specific custodian
        specific_file = custodian_dir / f"{args.custodian}.yaml"
        if not specific_file.exists():
            print(f"Error: Custodian file not found: {specific_file}")
            return 1
        files = [specific_file]
        print(f"Processing specific custodian: {args.custodian}")
    else:
        print(f"Scanning for custodian files with web archives in {custodian_dir}...")
        files = find_custodian_files_with_web_archives(custodian_dir)
        print(f"Found {len(files)} custodian files with web_archives")
    if args.limit:
        files = files[:args.limit]
        print(f"Limited to {args.limit} files")
    if args.dry_run:
        print("\n*** DRY RUN - No changes will be made ***\n")
    # Process statistics
    total_processed = 0
    total_updated = 0
    total_entities = 0
    total_html_files = 0
    total_segments = 0
    total_discarded = 0
    all_unmatched = []
    for filepath in files:
        stats = process_custodian_file(
            filepath, base_path, matcher,
            dry_run=args.dry_run,
            verbose=args.verbose,
            show_entities=args.show_entities,
            show_unmatched=args.show_unmatched,
            min_length=args.min_length
        )
        total_processed += 1
        # Collect unmatched samples
        if 'unmatched_samples' in stats:
            all_unmatched.extend(stats['unmatched_samples'])
        if stats['status'] in ('updated', 'would_update'):
            total_updated += 1
            total_entities += stats['entities_extracted']
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose or stats['entities_extracted'] > 0:
                msg = f"{stats['file']}: {stats['entities_extracted']} entities"
                msg += f" ({stats['html_files_processed']} HTML files, {stats['segments_discarded']} discarded)"
                print(msg)
        elif stats['status'] == 'no_entities_found':
            # Still counts toward the processing totals, just no update
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose:
                print(f"{stats['file']}: no entities found ({stats['html_files_processed']} HTML files)")
        elif args.verbose:
            if stats['status'] == 'error':
                print(f"{stats['file']}: {stats['error']}")
            elif stats['status'] == 'no_web_archives':
                print(f"{stats['file']}: no web_archives section")
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_processed}")
    print(f"Files with entities: {total_updated}")
    print(f"Total HTML files: {total_html_files}")
    print(f"Text segments analyzed: {total_segments}")
    print(f"Segments discarded: {total_discarded}")
    print(f"Total entities found: {total_entities}")
    # Show unmatched samples if requested (truncated to 80 chars each)
    if args.show_unmatched > 0 and all_unmatched:
        print("\n" + "-" * 60)
        print(f"UNMATCHED SAMPLES (showing up to {args.show_unmatched}):")
        print("-" * 60)
        for i, sample in enumerate(all_unmatched[:args.show_unmatched], 1):
            print(f" {i}. {sample[:80]}{'...' if len(sample) > 80 else ''}")
    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")
    return 0
if __name__ == '__main__':
    # raise SystemExit rather than calling the builtin exit(): exit() is a
    # site-module convenience absent under `python -S`, while SystemExit
    # propagates main()'s return value as the process exit code reliably.
    raise SystemExit(main())