glam/scripts/extract_person_entities_from_annotations.py
2025-12-14 17:09:55 +01:00

525 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Extract person entities from CH-Annotator web annotations and add them to custodian YAML files.
This script:
1. Finds custodian files with web_enrichment.web_archives references
2. Loads corresponding annotation files (annotations_v1.7.0.yaml)
3. Extracts person entities (AGT.PER, AGT.STF) with full provenance
4. Adds web_person_claims section to custodian files
Usage:
python scripts/extract_person_entities_from_annotations.py [--dry-run] [--limit N]
"""
import argparse
import glob
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import yaml
# ============================================================================
# FALSE POSITIVE FILTERING FOR AGT.PER ENTITIES
# ============================================================================
# The LLM annotator may incorrectly tag groups, organizations, events, and
# topic references as AGT.PER (person). This filter provides defense-in-depth
# to catch these false positives.
# ============================================================================
# Patterns that indicate a false positive (NOT a person).
# Matched case-insensitively anywhere in the name unless anchored with ^/$.
FALSE_POSITIVE_PATTERNS = [
    # Group/collective references
    r'\b(staff|members|curators|colleagues|board|team|committee|participants)\b',
    r'\b(community|network|consortium|association|society|circle|group)\b',
    r'\b(visitors|archivists|researchers|filmmakers|historians|professionals)\b',
    # Conference/event references
    r'\b(conference|Conference|festival|Festival|congress|Congress|symposium)\b',
    r'\b(Award|award|Prize|prize|Ceremony|ceremony)\b',
    r'\b(Il Cinema Ritrovato|IASA|AMIA|FIAF|Le Giornate)\b',
    r'\b(Women and Silent Screen)\b',
    # Organization/institution references.
    # NOTE(review): these match the bare word anywhere in the name, so a
    # surname that happens to be such a word (e.g. "Fund") is also rejected.
    r'\b(collection|Collection|archive|Archive|fund|Fund|foundation|Foundation)\b',
    r'\b(institute|Institute|University|university|Academy|academy)\b',
    r'\b(museum|Museum|library|Library)\b',
    # "Center/Centre" only when preceded by organization indicators or followed by "for/of"
    r'\b(Research|Cultural|Heritage|Community|Art|Science|Information|Documentation|Knowledge)\s+(Center|Centre)\b',
    r'\b(Center|Centre)\s+(for|of|voor|van)\b',
    r'\b(ACE member|member institutions)\b',
    r'\b(Harvard Film Archive|Toonder studio)\b',
    # Network/platform references
    r'\b(VPRO|Tegenlicht|network)\b',
    # Topic/story references (Dutch "Verhalen van X" = "Stories of X")
    r'\b(Verhalen van|verhalen van|Stories of|stories of)\b',
    # Generic plural endings that indicate groups
    r'\b\w+s\s+(members|colleagues|participants|curators|staff)\b',
    # "X of Y" patterns that typically indicate groups/organizations
    r'\b(Commission|Committee|Board|Council)\s+(of|for)\b',
    # Patterns ending with group indicators
    r'\b(board members|staff members|team members|committee members)\b',
    r'\b(technical commission|Commission members)\b',
    # Generic role descriptions (Dutch and English)
    r'^(een|de|het|a|an|the)\s+(medewerker|staff|employee|curator|director|visitor|koning|koningin|king|queen)\b',
    r'^de\s+Koning$',  # "de Koning" = "the King" in Dutch (not a name)
    r'^(echtgenote|spouse|wife|husband)$',
    r'^(schilder|painter|artist|writer)$',
    r'^(gevluchte|fled|escaped|refugee)\s+',
    # Specific non-person references
    r'WEBJONGENS',
]
# Patterns for detecting usernames (case-sensitive, applied separately)
USERNAME_PATTERNS = [
    # All lowercase with dots, underscores, numbers (like "basvt", "admin",
    # "j.s.a.m.van.koningsbrugge")
    r'^[a-z][a-z0-9._]+$',
]
USERNAME_REGEX = [re.compile(p) for p in USERNAME_PATTERNS]  # NOT case-insensitive
# Compile patterns for efficiency
FALSE_POSITIVE_REGEX = [re.compile(p, re.IGNORECASE) for p in FALSE_POSITIVE_PATTERNS]
# Minimum name length (single characters or very short strings are suspicious)
MIN_NAME_LENGTH = 3
# Maximum word count (very long "names" are likely descriptions, not names)
MAX_WORD_COUNT = 8
# Minimum word count for proper names (single first names are often not useful)
MIN_WORD_COUNT = 2  # At least first + last name
# Single-name historical figures that are allowed despite being one word.
# Hoisted to module level so the set is not rebuilt on every call.
KNOWN_SINGLE_NAMES = frozenset({
    'rembrandt', 'vermeer', 'multatuli', 'mucha', 'rietveld', 'dudok', 'mondriaan'
})


def is_likely_person(name: str) -> tuple[bool, str]:
    """
    Filter out false positive person detections.

    Args:
        name: The extracted person name to validate

    Returns:
        Tuple of (is_valid, rejection_reason)
        - is_valid: True if this appears to be a real person name
        - rejection_reason: Empty string if valid, otherwise reason for rejection
    """
    if not name or not name.strip():
        return False, "empty_name"
    name = name.strip()
    # Length check: single characters or very short strings are suspicious.
    if len(name) < MIN_NAME_LENGTH:
        return False, f"too_short_{len(name)}_chars"
    # Word count check: very long "names" are likely descriptions, not names.
    word_count = len(name.split())
    if word_count > MAX_WORD_COUNT:
        return False, f"too_many_words_{word_count}"
    # Single word names are usually not useful (just "John" or "Maria").
    # Exception: historical figures often referenced by single name (Rembrandt).
    if word_count < MIN_WORD_COUNT and name.lower() not in KNOWN_SINGLE_NAMES:
        return False, "single_word_name"  # fixed: was a placeholder-free f-string
    # Check against false positive patterns (groups, orgs, events, topics).
    for pattern in FALSE_POSITIVE_REGEX:
        if pattern.search(name):
            return False, f"pattern_match:{pattern.pattern[:30]}"
    # Username check (case-sensitive - usernames are typically all lowercase).
    # NOTE(review): multi-word names can never match (the pattern forbids
    # spaces) and lowercase single words were already rejected above unless
    # they are in KNOWN_SINGLE_NAMES, so in practice this only rejects
    # lowercase known names such as "rembrandt". Behavior kept as-is.
    for pattern in USERNAME_REGEX:
        if pattern.match(name):
            return False, f"username_pattern:{pattern.pattern[:30]}"
    return True, ""
# Dumper that renders multi-line strings in YAML literal block style ("|")
# so extracted text stays readable in the custodian files.
class CustomDumper(yaml.SafeDumper):
    pass


def str_representer(dumper, data):
    """Represent strings containing newlines as literal blocks; others as plain scalars."""
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)


CustomDumper.add_representer(str, str_representer)
def load_yaml(filepath: Path) -> dict:
    """Load a YAML file, returning an empty dict for empty documents."""
    with filepath.open('r', encoding='utf-8') as fh:
        parsed = yaml.safe_load(fh)
    return parsed or {}
def save_yaml(filepath: Path, data: dict) -> None:
    """Write *data* as YAML via CustomDumper (unicode, block style, insertion order)."""
    dump_options = {
        'Dumper': CustomDumper,
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 120,
    }
    with filepath.open('w', encoding='utf-8') as fh:
        yaml.dump(data, fh, **dump_options)
def find_annotation_file(web_archive_dir: str, base_path: Path) -> Optional[Path]:
    """
    Find the annotation file for a web archive directory.

    Args:
        web_archive_dir: Path like "web/0002/drentsarchief.nl"
        base_path: Base path (data/custodian/)

    Returns:
        Path to the annotations_v1.7.0.yaml file, or None when it does not exist.
    """
    candidate = base_path / web_archive_dir / "annotations_v1.7.0.yaml"
    return candidate if candidate.exists() else None
def _person_relationships(entity_id: Any, relationships: list[dict]) -> list[dict]:
    """Collect relationship claims in which *entity_id* appears as subject or object."""
    found = []
    for rel in relationships:
        subject = rel.get('subject', {})
        obj = rel.get('object', {})
        predicate = rel.get('predicate', {})
        confidence = rel.get('extraction_confidence', 0.0)
        if subject.get('entity_id') == entity_id:
            found.append({
                'role': 'subject',
                'predicate': predicate.get('uri'),
                'predicate_label': predicate.get('label'),
                'object': obj.get('span_text'),
                'object_type': obj.get('entity_type'),
                'confidence': confidence,
            })
        elif obj.get('entity_id') == entity_id:
            found.append({
                'role': 'object',
                'predicate': predicate.get('uri'),
                'predicate_label': predicate.get('label'),
                'subject': subject.get('span_text'),
                'subject_type': subject.get('entity_type'),
                'confidence': confidence,
            })
    return found


def extract_persons_from_annotations(annotation_data: dict, verbose: bool = False) -> tuple[list[dict], list[dict]]:
    """
    Extract person entities from annotation data with false positive filtering.

    Looks for entities with:
    - hypernym: AGT
    - hyponym: AGT.PER (person) or AGT.STF (staff)
    Applies filtering to remove false positives (groups, organizations, events, etc.)

    Args:
        annotation_data: Loaded annotation YAML data
        verbose: If True, track rejected entities

    Returns:
        Tuple of (valid_persons, rejected_entities)
        - valid_persons: List of person entity dicts with provenance
        - rejected_entities: List of rejected entities with rejection reasons
    """
    persons: list[dict] = []
    rejected: list[dict] = []
    claims = annotation_data.get('session', {}).get('claims', {})
    relationships = claims.get('relationship', [])
    # Fixed: the original also built an entity_lookup dict over all entities
    # that was never read; it has been removed.
    for entity in claims.get('entity', []):
        # Only AGT.PER (person) and AGT.STF (staff) entities qualify.
        if entity.get('hypernym', '') != 'AGT':
            continue
        hyponym = entity.get('hyponym', '')
        if hyponym not in ('AGT.PER', 'AGT.STF'):
            continue
        name = entity.get('text_content', '').strip()
        # Defense-in-depth false positive filter (groups, orgs, events, ...).
        is_valid, rejection_reason = is_likely_person(name)
        if not is_valid:
            if verbose:
                rejected.append({
                    'name': name,
                    'entity_type': hyponym,
                    'rejection_reason': rejection_reason,
                })
            continue
        provenance = entity.get('provenance', {})
        claim_id = entity.get('claim_id')
        persons.append({
            'name': name,
            'entity_type': hyponym,
            'entity_id': claim_id,
            'class_uri': entity.get('class_uri'),
            'recognition_confidence': entity.get('recognition_confidence', 0.0),
            'provenance': {
                'xpath': provenance.get('path'),
                'timestamp': provenance.get('timestamp'),
                'agent': provenance.get('agent'),
                'confidence': provenance.get('confidence', 0.0),
                'context_convention': provenance.get('context_convention'),
            },
            'relationships': _person_relationships(claim_id, relationships),
        })
    return persons, rejected
def process_custodian_file(
    custodian_path: Path,
    base_path: Path,
    dry_run: bool = False,
    verbose: bool = False
) -> dict:
    """
    Process a single custodian file to extract and add person entities.

    Args:
        custodian_path: Path to custodian YAML file
        base_path: Base path for web archives
        dry_run: If True, don't write changes
        verbose: If True, track rejected entities

    Returns:
        Dict with processing stats
    """
    stats = {
        'file': str(custodian_path.name),
        'web_archives_found': 0,
        'annotation_files_found': 0,
        'persons_extracted': 0,
        'persons_rejected': 0,
        'rejected_names': [],
        'status': 'skipped',
        'error': None,
    }

    try:
        custodian_data = load_yaml(custodian_path)
    except Exception as exc:
        stats['status'] = 'error'
        stats['error'] = f"Failed to load YAML: {exc}"
        return stats

    # Nothing to do without a web_enrichment.web_archives section.
    archives = custodian_data.get('web_enrichment', {}).get('web_archives', [])
    if not archives:
        stats['status'] = 'no_web_archives'
        return stats
    stats['web_archives_found'] = len(archives)

    collected: list[dict] = []
    filtered_out: list[dict] = []
    annotation_sources: list[str] = []

    for archive in archives:
        directory = archive.get('directory', '')
        annotation_path = find_annotation_file(directory, base_path) if directory else None
        if annotation_path is None:
            continue
        stats['annotation_files_found'] += 1
        try:
            annotation_data = load_yaml(annotation_path)
        except Exception as exc:
            # Best-effort: record the failure but keep processing other archives.
            stats['error'] = f"Failed to load annotation: {exc}"
            continue
        persons, rejected = extract_persons_from_annotations(annotation_data, verbose=verbose)
        filtered_out.extend(rejected)
        if not persons:
            continue
        # Stamp every extracted person with the page it came from.
        source_url = annotation_data.get('source_url', archive.get('url', ''))
        for person in persons:
            person['provenance']['source_url'] = source_url
        collected.extend(persons)
        annotation_sources.append(str(annotation_path.relative_to(base_path.parent)))

    stats['persons_extracted'] = len(collected)
    stats['persons_rejected'] = len(filtered_out)
    stats['rejected_names'] = [entry['name'] for entry in filtered_out]
    if not collected:
        stats['status'] = 'no_persons_found'
        return stats

    # Attach the web_person_claims section with full extraction metadata.
    custodian_data['web_person_claims'] = {
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'ch_annotator_entity_extraction_v2',  # v2 includes filtering
        'filtering_applied': True,
        'source_annotations': annotation_sources,
        'persons_count': len(collected),
        'persons_rejected_count': len(filtered_out),
        'persons': collected,
    }

    if dry_run:
        stats['status'] = 'would_update'
    else:
        save_yaml(custodian_path, custodian_data)
        stats['status'] = 'updated'
    return stats
def find_custodian_files_with_web_archives(custodian_dir: Path) -> list[Path]:
    """
    Find all custodian files that have web_enrichment.web_archives.

    Args:
        custodian_dir: Directory containing custodian YAML files

    Returns:
        Sorted list of paths to custodian files mentioning web archives
    """
    matches: list[Path] = []
    for filepath in glob.glob(str(custodian_dir / "NL-*.yaml")):
        candidate = Path(filepath)
        try:
            with open(candidate, 'r', encoding='utf-8') as fh:
                text = fh.read()
        except Exception:
            continue
        # Cheap substring scan avoids parsing YAML for every file.
        if 'web_archives:' in text:
            matches.append(candidate)
    return sorted(matches)
def main():
    """CLI entry point: scan custodian files and add web_person_claims sections."""
    parser = argparse.ArgumentParser(
        description='Extract person entities from web annotations to custodian files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without making changes'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of files to process'
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        # Fixed: the default was an absolute path to one developer's machine
        # (/Users/kempersc/...). A repo-relative default matches the documented
        # usage ("python scripts/..." run from the repository root).
        default=Path('data/custodian'),
        help='Directory containing custodian YAML files'
    )
    parser.add_argument(
        '--verbose',
        '-v',
        action='store_true',
        help='Show detailed output'
    )
    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    base_path = custodian_dir  # web archives are relative to custodian dir

    print(f"Scanning for custodian files with web archives in {custodian_dir}...")
    files = find_custodian_files_with_web_archives(custodian_dir)
    print(f"Found {len(files)} custodian files with web_archives")
    if args.limit:
        files = files[:args.limit]
        print(f"Limited to {args.limit} files")
    if args.dry_run:
        print("\n*** DRY RUN - No changes will be made ***\n")

    # Process statistics
    total_processed = 0
    total_updated = 0
    total_persons = 0
    total_rejected = 0
    all_rejected_names = []

    for filepath in files:
        stats = process_custodian_file(filepath, base_path, dry_run=args.dry_run, verbose=args.verbose)
        total_processed += 1
        if stats['status'] in ('updated', 'would_update'):
            total_updated += 1
            total_persons += stats['persons_extracted']
            total_rejected += stats['persons_rejected']
            all_rejected_names.extend(stats['rejected_names'])
        # Fixed: the error / no-persons branches were unreachable in the
        # original ("elif args.verbose" after a condition that already
        # included args.verbose). Report those statuses first in verbose mode.
        if args.verbose and stats['status'] == 'error':
            print(f"{stats['file']}: {stats['error']}")
        elif args.verbose and stats['status'] == 'no_persons_found':
            print(f"{stats['file']}: no persons in annotations")
        elif args.verbose or stats['persons_extracted'] > 0:
            msg = f"{stats['file']}: {stats['persons_extracted']} persons"
            if stats['persons_rejected'] > 0:
                msg += f" ({stats['persons_rejected']} filtered out)"
            print(msg)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_processed}")
    print(f"Files with persons: {total_updated}")
    print(f"Total persons found: {total_persons}")
    print(f"Total filtered out: {total_rejected}")
    if all_rejected_names and args.verbose:
        print(f"\nFiltered out names (false positives):")
        for name in sorted(set(all_rejected_names)):
            print(f" - {name}")
    if args.dry_run:
        print("\n*** DRY RUN - No changes were made ***")


if __name__ == '__main__':
    main()