#!/usr/bin/env python3
"""
Extract typed entities from web archives using annotated patterns.

Loads patterns from dutch_web_patterns.yaml and processes web archive HTML
to extract entities with CH-Annotator types and relationship predicates.

This script:
1. Loads entity and discard patterns from dutch_web_patterns.yaml
2. Finds custodian files with web_enrichment.web_archives references
3. For each custodian, processes HTML from web archive mirror directories
4. Extracts text content from HTML
5. Matches against discard patterns first (filter out navigation, UI, etc.)
6. Matches against entity patterns (extract with types and relationships)
7. Applies capture groups to extract sub-entities
8. Generates relationship triples
9. Adds pattern_entity_claims section to custodian YAML files

Usage:
    python scripts/extract_with_patterns.py [--dry-run] [--limit N] [--custodian GHCID]
    python scripts/extract_with_patterns.py --verbose --limit 3
"""

import argparse
import glob
import os
import re
import sys
from datetime import datetime, timezone
from html.parser import HTMLParser
from io import StringIO
from pathlib import Path
from typing import Any, Optional

import yaml


# ============================================================================
# DUTCH STOPWORD FILTER
# ============================================================================

# Common Dutch words that should NOT be extracted as place names, organization
# names, etc.  These cause false positives when patterns like "gemeente (\w+)"
# match "gemeente op de straat".
DUTCH_STOPWORDS = {
    # Articles
    'de', 'het', 'een', 'der', 'des', 'den',
    # Prepositions
    'op', 'in', 'van', 'aan', 'te', 'tot', 'bij', 'met', 'voor', 'na', 'naar',
    'om', 'uit', 'over', 'onder', 'door', 'tegen', 'tussen', 'zonder',
    'binnen', 'buiten', 'langs', 'sinds', 'tijdens', 'vanaf', 'volgens', 'wegens',
    # Pronouns
    'ik', 'je', 'jij', 'u', 'hij', 'zij', 'ze', 'wij', 'we', 'jullie',
    'hen', 'hun', 'mij', 'jou', 'hem', 'haar', 'ons', 'die', 'dat', 'dit',
    'deze', 'wat', 'wie', 'welke', 'welk', 'waar', 'wanneer', 'waarom', 'hoe',
    'er', 'hier', 'daar',
    # Common verbs (conjugated forms that might appear after "gemeente", etc.)
    'is', 'zijn', 'was', 'waren', 'ben', 'bent', 'geweest',
    'wordt', 'worden', 'werd', 'werden',
    'heeft', 'hebben', 'had', 'hadden', 'gehad',
    'kan', 'kunnen', 'kon', 'konden', 'gekund',
    'mag', 'mogen', 'mocht', 'mochten',
    'moet', 'moeten', 'moest', 'moesten',
    'zal', 'zullen', 'zou', 'zouden',
    'wil', 'willen', 'wilde', 'wilden', 'gewild',
    'zien', 'ziet', 'zag', 'zagen', 'gezien',
    'gaan', 'gaat', 'ging', 'gingen', 'gegaan',
    'komen', 'komt', 'kwam', 'kwamen', 'gekomen',
    'doen', 'doet', 'deed', 'deden', 'gedaan',
    'maken', 'maakt', 'maakte', 'maakten', 'gemaakt',
    'zeggen', 'zegt', 'zei', 'zeiden', 'gezegd',
    'staan', 'staat', 'stond', 'stonden', 'gestaan',
    'liggen', 'ligt', 'lag', 'lagen', 'gelegen',
    'woonde', 'woont',
    # Common adjectives/adverbs
    'ook', 'nog', 'al', 'wel', 'niet', 'geen', 'meer', 'veel', 'weinig',
    'erg', 'heel', 'zeer', 'zo', 'nu', 'dan', 'toen', 'weer', 'vaak',
    'altijd', 'nooit', 'soms', 'reeds', 'steeds', 'pas', 'net', 'juist',
    'precies', 'ongeveer',
    # Conjunctions
    'en', 'of', 'maar', 'want', 'dus', 'omdat', 'als', 'indien', 'hoewel',
    'tenzij', 'totdat', 'terwijl', 'voordat', 'nadat', 'zodat', 'opdat',
    'mits', 'ofschoon',
    # Common nouns that aren't places
    'eigendom', 'bezit', 'gebied', 'plaats', 'deel', 'kant', 'zijde', 'wijze',
    'manier', 'vorm', 'soort', 'type', 'naam', 'titel', 'datum', 'tijd',
    'jaar', 'dag', 'week', 'maand', 'uur', 'minuut', 'eeuw', 'periode',
    'men', 'iemand',
    # Short words that are likely false positives
    'aa', 'ab', 'ad', 'af', 'ag', 'ah', 'al', 'am', 'as', 'at', 'au',
    'be', 'bi', 'bo', 'bu', 'ca', 'co', 'da', 'do', 'du',
    'ed', 'ee', 'eg', 'ei', 'el', 'em', 'ex',
    'fa', 'fe', 'fi', 'fo', 'fu', 'ga', 'ge', 'go', 'gu',
    'ha', 'he', 'hi', 'ho', 'hu',
    'id', 'ie', 'ig', 'ij', 'il', 'im', 'io', 'ir',
    'ja', 'je', 'jo', 'ju', 'ka', 'ke', 'ki', 'ko', 'ku',
    'la', 'le', 'li', 'lo', 'lu', 'ma', 'me', 'mi', 'mo', 'mu',
    'na', 'ne', 'ni', 'no', 'nu',
    'ob', 'od', 'oe', 'og', 'oh', 'ok', 'ol', 'om', 'on', 'oo', 'op',
    'or', 'os', 'ot', 'ou', 'ow', 'oz',
    'pa', 'pe', 'pi', 'po', 'pu', 'ra', 're', 'ri', 'ro', 'ru',
    'sa', 'se', 'si', 'so', 'su', 'ta', 'te', 'ti', 'to', 'tu',
    'ub', 'ue', 'ug', 'ui', 'uk', 'ul', 'um', 'un', 'up', 'ur', 'us',
    'ut', 'uu', 'va', 've', 'vi', 'vo', 'vu',
    'wa', 'we', 'wi', 'wo', 'wu', 'za', 'ze', 'zi', 'zo', 'zu',
}

# Generic organization words that by themselves don't make a valid entity
# e.g., "de Stichting" without a name is too generic
GENERIC_ORG_WORDS = {
    'stichting', 'vereniging', 'genootschap', 'organisatie', 'instelling',
    'instituut', 'centrum', 'bureau', 'dienst', 'raad', 'commissie',
    'archief', 'museum', 'bibliotheek', 'collectie', 'fonds',
}

# Entity types whose capture groups should be validated against stopwords.
# These are patterns where captured groups are expected to be proper nouns
# (places, names).
STOPWORD_FILTERED_ENTITY_TYPES = {
    'GRP.GOV',      # Government - municipality names should be proper nouns
    'GRP.GOV.MUN',  # Municipality
    'GRP.GOV.PRO',  # Province
    'GRP.HER',      # Heritage institutions - name parts should be proper nouns
    'GRP.HER.MUS',  # Museum
    'GRP.HER.ARC',  # Archive
    'GRP.HER.LIB',  # Library
    'GRP.ORG',      # Organizations - name parts should be proper nouns
    'TOP.SET',      # Settlement names
    'TOP.BLD',      # Building names
    'AGT.PER',      # Person names
}


def is_stopword_match(entity_result: dict) -> bool:
    """
    Check if an entity match is actually a false positive due to stopwords.

    Args:
        entity_result: Match dict as produced by ``PatternMatcher.match_entity``
            (keys used here: ``entity_type``, ``entity_subtype``, ``captures``,
            ``matched_text``).

    Returns:
        True if the match should be REJECTED (is a false positive).
    """
    entity_type = entity_result.get('entity_type') or ''
    entity_subtype = entity_result.get('entity_subtype') or ''

    # Only filter entity types whose captures should be proper nouns.
    should_filter = (
        entity_type in STOPWORD_FILTERED_ENTITY_TYPES
        or entity_subtype in STOPWORD_FILTERED_ENTITY_TYPES
    )
    if not should_filter:
        return False

    # Check capture groups for stopwords.
    captures = entity_result.get('captures', {})
    for idx, cap in captures.items():
        value = cap.get('value', '').lower().strip()
        cap_type = cap.get('type', '')
        # Check if this capture group type should be validated.
        if cap_type in STOPWORD_FILTERED_ENTITY_TYPES or entity_type in STOPWORD_FILTERED_ENTITY_TYPES:
            if value in DUTCH_STOPWORDS:
                return True  # Reject this match
            # Also reject if captured value is too short (less than 3 chars)
            # unless it's a known Dutch place abbreviation.
            # NOTE(review): no abbreviation whitelist is implemented yet, so
            # every capture under 3 chars is rejected — confirm that is intended.
            if len(value) < 3:
                return True

    # Check the matched text itself if no captures: the last word is often
    # the "name" part captured by the pattern.
    if not captures:
        matched = entity_result.get('matched_text', '')
        words = matched.lower().split()
        if words:
            last_word = words[-1]
            if last_word in DUTCH_STOPWORDS:
                return True

    # Check for generic organization matches like "de Stichting"
    # (article + generic org word, without a real name).
    matched_text = entity_result.get('matched_text', '').lower().strip()
    words = matched_text.split()
    if len(words) == 2:
        if words[0] in {'de', 'het', 'een'} and words[1] in GENERIC_ORG_WORDS:
            return True  # Too generic, reject

    return False


# ============================================================================
# YAML HANDLING
# ============================================================================

class CustomDumper(yaml.SafeDumper):
    """Custom YAML dumper to preserve formatting."""
    pass


def str_representer(dumper, data):
    """Represent strings with proper multiline handling.

    Multiline strings are emitted in literal block style (``|``) so that
    extracted text stays readable in the custodian YAML files.
    """
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


CustomDumper.add_representer(str, str_representer)


def load_yaml(filepath: Path) -> dict:
    """Load a YAML file, returning {} for an empty document."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def save_yaml(filepath: Path, data: dict) -> None:
    """Save data to a YAML file, preserving key order and unicode."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, Dumper=CustomDumper, allow_unicode=True,
                  default_flow_style=False, sort_keys=False, width=120)


# ============================================================================
# HTML TEXT EXTRACTION
# ============================================================================

class MLStripper(HTMLParser):
    """Simple HTML stripper to extract text content.

    Text inside <script>, <style> and <noscript> elements is skipped.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()          # accumulates visible text
        self.in_script_or_style = False  # True while inside a skipped element

    def handle_starttag(self, tag, attrs):
        if tag in ('script', 'style', 'noscript'):
            self.in_script_or_style = True

    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'noscript'):
            self.in_script_or_style = False

    def handle_data(self, data):
        if not self.in_script_or_style:
            self.text.write(data)

    def get_data(self):
        return self.text.getvalue()


def strip_tags(html: str) -> str:
    """Remove HTML tags and return text content.

    Falls back to regex-based stripping if the parser chokes on malformed
    markup.
    """
    s = MLStripper()
    try:
        s.feed(html)
        return s.get_data()
    except Exception:
        # Fallback: simple regex-based stripping
        return re.sub(r'<[^>]+>', ' ', html)


def extract_text_from_html(html_content: str) -> list[str]:
    """
    Extract meaningful text segments from HTML content.

    Returns a list of text strings that could be entity mentions.
    Filters out very short strings and common non-entity content.
    """
    text = strip_tags(html_content)
    lines = []
    for line in text.split('\n'):
        # Collapse all runs of whitespace to single spaces.
        line = ' '.join(line.split())
        # Skip very short lines.
        if len(line) < 3:
            continue
        # Skip lines that are just whitespace/punctuation.
        if re.match(r'^[\s\W]+$', line):
            continue
        lines.append(line)
    return lines


# ============================================================================
# PATTERN LOADING AND COMPILATION
# ============================================================================

class PatternMatcher:
    """
    Loads and compiles patterns from dutch_web_patterns.yaml.
    Provides matching against discard and entity patterns.
    """

    def __init__(self, pattern_file: Path, strip_anchors: bool = True):
        """Load and compile patterns from YAML file.

        Args:
            pattern_file: Path to YAML pattern file
            strip_anchors: If True, remove ^ and $ anchors from entity patterns
                to enable substring matching (default: True)
        """
        self.pattern_file = pattern_file
        self.raw_data = load_yaml(pattern_file)
        self.strip_anchors = strip_anchors
        # Compiled patterns
        self.discard_patterns: list[tuple[re.Pattern, str]] = []  # (regex, reason)
        self.entity_patterns: list[dict] = []  # Full pattern config with compiled regex
        self._compile_patterns()

    def _strip_regex_anchors(self, pattern: str) -> str:
        r"""Remove ^ and $ anchors from a regex pattern for substring matching.

        Replaces anchors with word boundary markers (\b) to prevent false
        positives from partial word matches. For example:
        - ^gemeente\s+(\w+)$ becomes \bgemeente\s+(\w+)\b

        This allows the pattern to match "... gemeente Assen ..." but not
        "... gemeentebestuur ..." or "...in gemeente op de..."

        Preserves anchors that are escaped (\\^ or \\$).
        """
        if not self.strip_anchors:
            return pattern
        # Replace leading ^ with word boundary \b (an escaped \^ does not
        # start with a bare '^', so it is left alone).
        if pattern.startswith('^'):
            pattern = r'\b' + pattern[1:]
        else:
            # Add word boundary at start if not present
            if not pattern.startswith(r'\b'):
                pattern = r'\b' + pattern
        # Replace trailing $ with word boundary \b (but not \$)
        if pattern.endswith('$') and not pattern.endswith('\\$'):
            pattern = pattern[:-1] + r'\b'
        else:
            # Add word boundary at end if not present
            if not pattern.endswith(r'\b'):
                pattern = pattern + r'\b'
        return pattern

    def _compile_patterns(self):
        """Compile all regex patterns for efficient matching."""
        # Compile discard patterns
        discard_section = self.raw_data.get('discard_patterns', {})
        for category, cat_data in discard_section.items():
            if isinstance(cat_data, dict) and 'patterns' in cat_data:
                for pat_item in cat_data['patterns']:
                    pattern_str = pat_item.get('pattern', '')
                    reason = pat_item.get('discard_reason', category)
                    if pattern_str:
                        try:
                            compiled = re.compile(pattern_str, re.IGNORECASE)
                            self.discard_patterns.append((compiled, reason))
                        except re.error as e:
                            print(f"Warning: Invalid discard pattern '{pattern_str}': {e}")
        # Compile entity patterns
        entity_section = self.raw_data.get('entity_patterns', {})
        self._compile_entity_section(entity_section)

    def _compile_entity_section(self, section: dict, parent_path: str = ""):
        """Recursively compile entity patterns from nested structure.

        A dict containing a 'patterns' key is a leaf category; any other
        dict is a nested category to recurse into.
        """
        for key, value in section.items():
            if isinstance(value, dict):
                if 'patterns' in value:
                    # This is a pattern category with actual patterns
                    for pat_item in value['patterns']:
                        pattern_str = pat_item.get('pattern', '')
                        if pattern_str:
                            try:
                                # Strip anchors for substring matching
                                pattern_for_compile = self._strip_regex_anchors(pattern_str)
                                compiled = re.compile(pattern_for_compile, re.IGNORECASE)
                                entity_config = {
                                    'regex': compiled,
                                    'pattern_str': pattern_str,  # Keep original for logging
                                    'pattern_compiled': pattern_for_compile,  # Actual compiled pattern
                                    'category': f"{parent_path}/{key}" if parent_path else key,
                                    'entity_type': pat_item.get('entity_type'),
                                    'entity_subtype': pat_item.get('entity_subtype'),
                                    'label_template': pat_item.get('label_template'),
                                    'capture_groups': pat_item.get('capture_groups', {}),
                                    'relationships': pat_item.get('relationships', []),
                                    'description': pat_item.get('description', ''),
                                }
                                self.entity_patterns.append(entity_config)
                            except re.error as e:
                                print(f"Warning: Invalid entity pattern '{pattern_str}': {e}")
                else:
                    # Nested category, recurse
                    new_path = f"{parent_path}/{key}" if parent_path else key
                    self._compile_entity_section(value, new_path)

    def should_discard(self, text: str) -> tuple[bool, Optional[str]]:
        """
        Check if text matches any discard pattern.

        Returns:
            Tuple of (should_discard, reason or None)
        """
        text_lower = text.lower().strip()
        for regex, reason in self.discard_patterns:
            if regex.search(text_lower):
                return True, reason
        return False, None

    def match_entity(self, text: str) -> Optional[dict]:
        """
        Match text against entity patterns.

        Uses search() instead of match() to find patterns anywhere in the
        text, not just at the beginning. This dramatically improves entity
        yield.

        Returns:
            Dict with match info including entity_type, captures, relationships
            or None if no match
        """
        text_stripped = text.strip()
        for pattern in self.entity_patterns:
            match = pattern['regex'].search(text_stripped)
            if not match:
                continue
            # Use the matched substring, not the full text
            matched_substring = match.group(0)
            result = {
                'matched_text': matched_substring,
                'full_context': text_stripped if text_stripped != matched_substring else None,
                'entity_type': pattern['entity_type'],
                'entity_subtype': pattern['entity_subtype'],
                'pattern_str': pattern['pattern_str'],
                'category': pattern['category'],
                'description': pattern['description'],
                'captures': {},
                'relationships': [],
            }
            # Extract capture groups
            if pattern['capture_groups']:
                for group_num, group_config in pattern['capture_groups'].items():
                    try:
                        group_idx = int(group_num)
                        if group_idx <= len(match.groups()):
                            captured_value = match.group(group_idx)
                            if captured_value:
                                result['captures'][group_idx] = {
                                    'value': captured_value,
                                    'type': group_config.get('type'),
                                    'role': group_config.get('role'),
                                }
                    except (ValueError, IndexError):
                        pass  # malformed group key or index; skip this capture
            # Generate relationships - use matched_substring as the entity
            if pattern['relationships']:
                for rel in pattern['relationships']:
                    relationship = {
                        'predicate': rel.get('predicate'),
                        'subject': self._resolve_reference(rel.get('subject'), matched_substring, result['captures']),
                        'object': self._resolve_reference(rel.get('object'), matched_substring, result['captures']),
                        'confidence': rel.get('confidence', 0.8),
                    }
                    # Add type info if available
                    if rel.get('subject_type'):
                        relationship['subject_type'] = rel['subject_type']
                    if rel.get('object_type'):
                        relationship['object_type'] = rel['object_type']
                    result['relationships'].append(relationship)
            # Apply label template if exists (e.g. "Gemeente {1}")
            if pattern['label_template'] and result['captures']:
                try:
                    label = pattern['label_template']
                    for idx, cap in result['captures'].items():
                        label = label.replace(f'{{{idx}}}', cap['value'])
                    result['entity_label'] = label
                except Exception:
                    result['entity_label'] = matched_substring
            else:
                result['entity_label'] = matched_substring
            # Filter out false positives caused by stopwords in capture groups
            if is_stopword_match(result):
                continue  # Try next pattern instead of returning this match
            return result
        return None

    def _resolve_reference(self, ref: Any, matched_text: str, captures: dict) -> Optional[str]:
        """Resolve a reference in a relationship definition.

        '$0' resolves to the full matched text, '$N' to capture group N,
        'CUSTODIAN' stays a placeholder for the custodian being processed;
        anything else is returned as a string literal.
        """
        if ref is None:
            return None
        if ref == '$0':
            return matched_text
        if isinstance(ref, str) and ref.startswith('$'):
            try:
                idx = int(ref[1:])
                if idx in captures:
                    return captures[idx]['value']
            except ValueError:
                pass
            # NOTE(review): a '$N' reference with no such capture falls
            # through and is returned verbatim below — confirm intended.
        if ref == 'CUSTODIAN':
            return 'CUSTODIAN'  # Placeholder for the custodian being processed
        return str(ref)


# ============================================================================
# CUSTODIAN FILE PROCESSING
# ============================================================================

def find_html_files(archive_dir: Path) -> list[Path]:
    """Find all HTML files in a web archive's mirror/ and pages/ subdirs."""
    html_files = []
    mirror_dir = archive_dir / 'mirror'
    if mirror_dir.exists():
        for html_file in mirror_dir.rglob('*.html'):
            html_files.append(html_file)
    pages_dir = archive_dir / 'pages'
    if pages_dir.exists():
        for html_file in pages_dir.rglob('*.html'):
            html_files.append(html_file)
    return html_files


def process_custodian_file(
    custodian_path: Path,
    base_path: Path,
    matcher: PatternMatcher,
    dry_run: bool = False,
    verbose: bool = False,
    show_entities: bool = False,
    show_unmatched: int = 0,
    min_length: int = 10
) -> dict:
    """
    Process a single custodian file to extract and add pattern-based entities.

    Args:
        custodian_path: Path to custodian YAML file
        base_path: Base path for web archives (data/custodian/)
        matcher: Compiled pattern matcher
        dry_run: If True, don't write changes
        verbose: If True, show detailed output
        show_entities: If True, print each entity as it's found
        show_unmatched: Number of unmatched segments to show (for debugging)
        min_length: Minimum text segment length to analyze

    Returns:
        Dict with processing stats
    """
    stats = {
        'file': str(custodian_path.name),
        'web_archives_found': 0,
        'html_files_processed': 0,
        'text_segments_analyzed': 0,
        'segments_discarded': 0,
        'entities_extracted': 0,
        'status': 'skipped',
        'error': None,
    }
    # Collect unmatched segments for debugging
    unmatched_samples = []

    try:
        custodian_data = load_yaml(custodian_path)
    except Exception as e:
        stats['status'] = 'error'
        stats['error'] = f"Failed to load YAML: {e}"
        return stats

    # Check for web_enrichment section
    web_enrichment = custodian_data.get('web_enrichment', {})
    web_archives = web_enrichment.get('web_archives', [])
    if not web_archives:
        stats['status'] = 'no_web_archives'
        return stats
    stats['web_archives_found'] = len(web_archives)

    all_claims = []
    discard_counts = {}

    for archive in web_archives:
        archive_dir_str = archive.get('directory', '')
        if not archive_dir_str:
            continue
        archive_dir = base_path / archive_dir_str
        if not archive_dir.exists():
            continue
        html_files = find_html_files(archive_dir)
        for html_file in html_files:
            stats['html_files_processed'] += 1
            try:
                with open(html_file, 'r', encoding='utf-8', errors='replace') as f:
                    html_content = f.read()
            except Exception as e:
                if verbose:
                    print(f"  Warning: Could not read {html_file}: {e}")
                continue
            # Extract text segments
            text_segments = extract_text_from_html(html_content)
            for segment in text_segments:
                # Skip segments that are too short
                if len(segment) < min_length:
                    continue
                stats['text_segments_analyzed'] += 1
                # First check discard patterns
                discard, discard_reason = matcher.should_discard(segment)
                if discard:
                    stats['segments_discarded'] += 1
                    discard_counts[discard_reason] = discard_counts.get(discard_reason, 0) + 1
                    continue
                # Try to match entity patterns
                entity_match = matcher.match_entity(segment)
                if entity_match:
                    stats['entities_extracted'] += 1
                    # Build claim record
                    claim = {
                        'entity': entity_match['entity_label'],
                        'matched_text': entity_match['matched_text'],
                        'entity_type': entity_match['entity_type'],
                    }
                    if entity_match.get('entity_subtype'):
                        claim['entity_subtype'] = entity_match['entity_subtype']
                    claim['matched_pattern'] = entity_match['pattern_str']
                    claim['pattern_category'] = entity_match['category']
                    # Add capture groups if any
                    if entity_match['captures']:
                        claim['capture_groups'] = {
                            str(idx): cap for idx, cap in entity_match['captures'].items()
                        }
                    # Add relationships
                    if entity_match['relationships']:
                        claim['relationships'] = entity_match['relationships']
                    # Source file relative to custodian dir
                    try:
                        rel_path = html_file.relative_to(base_path)
                        claim['source_file'] = str(rel_path)
                    except ValueError:
                        claim['source_file'] = str(html_file)
                    claim['confidence'] = 0.85  # Pattern-based extraction confidence
                    all_claims.append(claim)
                    # Show entity if flag is set
                    if show_entities:
                        print(f"  → [{entity_match['entity_type']}] {entity_match['entity_label']}")
                else:
                    # Track unmatched segments for debugging
                    if show_unmatched > 0 and len(unmatched_samples) < show_unmatched:
                        # Only collect interesting segments (likely to contain entities)
                        if (15 <= len(segment) <= 100 and
                                not segment.isupper() and
                                any(c.isupper() for c in segment[1:]) and
                                not re.match(r'^[\d\s\W]+$', segment)):
                            unmatched_samples.append(segment)

    # Add unmatched samples to stats for debugging
    if unmatched_samples:
        stats['unmatched_samples'] = unmatched_samples
    # Surface per-reason discard tallies (previously collected but unused)
    if discard_counts:
        stats['discard_reasons'] = discard_counts

    if not all_claims:
        stats['status'] = 'no_entities_found'
        return stats

    # Deduplicate claims by entity + type
    seen = set()
    unique_claims = []
    for claim in all_claims:
        key = (claim['entity'], claim.get('entity_type', ''))
        if key not in seen:
            seen.add(key)
            unique_claims.append(claim)
    stats['entities_extracted'] = len(unique_claims)

    # Create pattern_entity_claims section
    pattern_entity_claims = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'pattern_based_extraction_v1',
        'pattern_file': 'dutch_web_patterns.yaml',
        'pattern_file_version': '1.0.0',
        'html_files_processed': stats['html_files_processed'],
        'text_segments_analyzed': stats['text_segments_analyzed'],
        'segments_discarded': stats['segments_discarded'],
        'entities_count': len(unique_claims),
        'claims': unique_claims,
    }
    # Add to custodian data
    custodian_data['pattern_entity_claims'] = pattern_entity_claims

    if not dry_run:
        save_yaml(custodian_path, custodian_data)
        stats['status'] = 'updated'
    else:
        stats['status'] = 'would_update'
    return stats


def find_custodian_files_with_web_archives(custodian_dir: Path) -> list[Path]:
    """
    Find all custodian files that have web_enrichment.web_archives.

    Args:
        custodian_dir: Directory containing custodian YAML files

    Returns:
        List of paths to custodian files with web archives
    """
    pattern = str(custodian_dir / "NL-*.yaml")
    files = []
    for filepath in glob.glob(pattern):
        path = Path(filepath)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                # Quick substring check avoids a full YAML parse per file.
                content = f.read()
                if 'web_archives:' in content:
                    files.append(path)
        except Exception:
            continue
    return sorted(files)


# ============================================================================
# MAIN
# ============================================================================

def main():
    parser = argparse.ArgumentParser(
        description='Extract typed entities from web archives using annotated patterns'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--limit', type=int, default=None,
        help='Limit number of files to process'
    )
    parser.add_argument(
        '--custodian', type=str, default=None,
        help='Process only a specific custodian GHCID (e.g., NL-DR-ASS-A-DA)'
    )
    # NOTE(review): these defaults are machine-specific absolute paths —
    # consider deriving them from the repository root instead.
    parser.add_argument(
        '--custodian-dir', type=Path,
        default=Path('/Users/kempersc/apps/glam/data/custodian'),
        help='Directory containing custodian YAML files'
    )
    parser.add_argument(
        '--pattern-file', type=Path,
        default=Path('/Users/kempersc/apps/glam/data/entity_annotation/modules/processing/dutch_web_patterns.yaml'),
        help='Path to pattern definition file'
    )
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Show detailed output'
    )
    parser.add_argument(
        '--show-entities', action='store_true',
        help='Show each extracted entity as it is found'
    )
    parser.add_argument(
        '--show-unmatched', type=int, default=0, metavar='N',
        help='Show N sample unmatched text segments (for pattern development)'
    )
    parser.add_argument(
        '--min-length', type=int, default=10,
        help='Minimum text segment length to analyze (default: 10)'
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    base_path = custodian_dir

    # Load patterns
    print(f"Loading patterns from {args.pattern_file}...")
    try:
        matcher = PatternMatcher(args.pattern_file)
        print(f"  Loaded {len(matcher.discard_patterns)} discard patterns")
        print(f"  Loaded {len(matcher.entity_patterns)} entity patterns")
    except Exception as e:
        print(f"Error loading patterns: {e}")
        return 1

    # Find custodian files
    if args.custodian:
        # Process specific custodian
        specific_file = custodian_dir / f"{args.custodian}.yaml"
        if not specific_file.exists():
            print(f"Error: Custodian file not found: {specific_file}")
            return 1
        files = [specific_file]
        print(f"Processing specific custodian: {args.custodian}")
    else:
        print(f"Scanning for custodian files with web archives in {custodian_dir}...")
        files = find_custodian_files_with_web_archives(custodian_dir)
        print(f"Found {len(files)} custodian files with web_archives")
        if args.limit:
            files = files[:args.limit]
            print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n*** DRY RUN - No changes will be made ***\n")

    # Process statistics
    total_processed = 0
    total_updated = 0
    total_entities = 0
    total_html_files = 0
    total_segments = 0
    total_discarded = 0
    all_unmatched = []

    for filepath in files:
        stats = process_custodian_file(
            filepath, base_path, matcher,
            dry_run=args.dry_run,
            verbose=args.verbose,
            show_entities=args.show_entities,
            show_unmatched=args.show_unmatched,
            min_length=args.min_length
        )
        total_processed += 1
        # Collect unmatched samples
        if 'unmatched_samples' in stats:
            all_unmatched.extend(stats['unmatched_samples'])
        if stats['status'] in ('updated', 'would_update'):
            total_updated += 1
            total_entities += stats['entities_extracted']
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose or stats['entities_extracted'] > 0:
                msg = f"✓ {stats['file']}: {stats['entities_extracted']} entities"
                msg += f" ({stats['html_files_processed']} HTML files, {stats['segments_discarded']} discarded)"
                print(msg)
        elif stats['status'] == 'no_entities_found':
            total_html_files += stats['html_files_processed']
            total_segments += stats['text_segments_analyzed']
            total_discarded += stats['segments_discarded']
            if args.verbose:
                print(f"○ {stats['file']}: no entities found ({stats['html_files_processed']} HTML files)")
        elif args.verbose:
            if stats['status'] == 'error':
                print(f"✗ {stats['file']}: {stats['error']}")
            elif stats['status'] == 'no_web_archives':
                print(f"○ {stats['file']}: no web_archives section")

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Files processed: {total_processed}")
    print(f"Files with entities: {total_updated}")
    print(f"Total HTML files: {total_html_files}")
    print(f"Text segments analyzed: {total_segments}")
    print(f"Segments discarded: {total_discarded}")
    print(f"Total entities found: {total_entities}")

    # Show unmatched samples if requested
    if args.show_unmatched > 0 and all_unmatched:
        print("\n" + "-" * 60)
        print(f"UNMATCHED SAMPLES (showing up to {args.show_unmatched}):")
        print("-" * 60)
        for i, sample in enumerate(all_unmatched[:args.show_unmatched], 1):
            print(f"  {i}. {sample[:80]}{'...' if len(sample) > 80 else ''}")

    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")
    return 0


if __name__ == '__main__':
    sys.exit(main())