# glam/scripts/clean_and_enrich_archiveslab.py
# kempersc 55e2cd2340 feat: implement LLM-based extraction for Archives Lab content
# - Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0.
# - Replaced regex-based extraction with generative LLM inference.
# - Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics.
# - Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons.
# - Results and statistics are saved in JSON format for further analysis.
# 2025-12-05 23:16:21 +01:00
#
# 582 lines, 20 KiB, Python
#!/usr/bin/env python3
"""
Clean and enrich Archives Lab extraction with:
1. Name cleanup (remove markdown artifacts)
2. LLM-based entity extraction using GLAM-NER types
3. Additional relationship types
4. Cross-referencing with Palestinian GLAM data
Usage:
PYTHONPATH=src python scripts/clean_and_enrich_archiveslab.py
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import Any, Dict, List, Optional, Set, Tuple
# =============================================================================
# Configuration
# =============================================================================
# Directory holding the Archives Lab extraction artifacts (input and output).
ARCHIVESLAB_DIR = Path('/Users/kempersc/apps/glam/data/extracted/archiveslab')
# Palestinian GLAM claims file used for cross-referencing; the pipeline
# tolerates its absence (cross-referencing is then skipped).
PALESTINIAN_CLAIMS = Path('/Users/kempersc/apps/glam/data/extracted/palestinian_glam_claims.json')
# Debug: print path (runs at import time; NOTE(review): leftover debug output — consider removing)
print(f"Looking for Palestinian claims at: {PALESTINIAN_CLAIMS}")
print(f"File exists: {PALESTINIAN_CLAIMS.exists()}")
# Enriched output is written next to the input claims file.
OUTPUT_FILE = ARCHIVESLAB_DIR / 'archiveslab_claims_enriched.json'
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class EnrichedProvenance:
    """Provenance record for a claim, with optional enrichment metadata."""
    source_url: str
    extraction_method: str
    extraction_date: str
    confidence: float = 0.85
    enrichment_date: Optional[str] = None
    enrichment_method: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, omitting fields that are None."""
        serialized: Dict[str, Any] = {}
        for field_name, value in asdict(self).items():
            if value is not None:
                serialized[field_name] = value
        return serialized
@dataclass
class EnrichedEntityClaim:
    """An entity claim carrying a GLAM-NER hypernym plus cleaned-name metadata."""
    entity_id: str
    entity_type: str  # GLAM-NER hypernym (e.g., GRP.HER, AGT.PER)
    name: str
    clean_name: str  # Cleaned version of name
    context: Optional[str] = None
    provenance: Optional[EnrichedProvenance] = None
    metadata: Optional[Dict[str, Any]] = None
    affiliations: Optional[List[str]] = None  # Organizations person is affiliated with
    roles: Optional[List[str]] = None  # Roles/titles

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; optional fields appear only when truthy."""
        payload: Dict[str, Any] = {
            'entity_id': self.entity_id,
            'entity_type': self.entity_type,
            'name': self.name,
            'clean_name': self.clean_name,
        }
        if self.context:
            payload['context'] = self.context
        if self.provenance:
            # Provenance serializes through its own None-dropping to_dict().
            payload['provenance'] = self.provenance.to_dict()
        for optional_key in ('metadata', 'affiliations', 'roles'):
            value = getattr(self, optional_key)
            if value:
                payload[optional_key] = value
        return payload
@dataclass
class EnrichedTriple:
    """A subject-predicate-object triple with confidence and optional provenance."""
    subject: str
    predicate: str
    object: str
    provenance: Optional[EnrichedProvenance] = None
    confidence: float = 0.85

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; provenance is nested only when present."""
        serialized: Dict[str, Any] = {
            'subject': self.subject,
            'predicate': self.predicate,
            'object': self.object,
            'confidence': self.confidence,
        }
        if self.provenance:
            serialized['provenance'] = self.provenance.to_dict()
        return serialized
# =============================================================================
# Name Cleanup Functions
# =============================================================================
def clean_entity_name(name: str) -> str:
"""
Clean entity name by removing markdown artifacts.
Examples:
"Mahmoud Balaawy\n\nTitle" -> "Mahmoud Balaawy"
"Ana Roeschley \n\n \n\nTitle" -> "Ana Roeschley"
"Biographies\n\nAbdallah Omar" -> "Abdallah Omar"
"""
# Remove "Title:" and variations
name = re.sub(r'\s*\n+\s*Title\s*:?\s*', '', name)
# Remove "Abstract:" and variations
name = re.sub(r'\s*\n+\s*Abstract\s*:?\s*', '', name)
# Remove "Biographies" prefix
name = re.sub(r'^Biographies\s*\n+\s*', '', name)
# Remove "Speaker:" prefix
name = re.sub(r'^Speakers?\s*:\s*', '', name)
# Remove "Moderator:" prefix
name = re.sub(r'^Moderators?\s*:\s*', '', name)
# Remove "Keynote Speaker:" prefix
name = re.sub(r'^Keynote\s+Speakers?\s*:\s*', '', name)
# Clean up multiple newlines and whitespace
name = re.sub(r'\s*\n+\s*', ' ', name)
name = re.sub(r'\s+', ' ', name)
# Remove leading/trailing whitespace
name = name.strip()
# Remove trailing punctuation
name = re.sub(r'[,;:]+$', '', name)
return name
def extract_title_from_context(context: str) -> Optional[str]:
    """Return an honorific title (without the trailing dot) found in *context*.

    Looks for Dr./Prof./Mr./Ms./Mrs./Eng. followed by whitespace; returns
    None for empty input or when no title is present.
    """
    if not context:
        return None
    match = re.search(r'\b(Dr\.|Prof\.|Mr\.|Ms\.|Mrs\.|Eng\.)\s', context)
    return match.group(1).rstrip('.') if match else None
def extract_affiliation_from_context(context: str) -> List[str]:
    """Extract organizational affiliations mentioned in *context*.

    Matches "at/from/of <Org>" phrases ending in institution keywords,
    "<X> of <Y>" constructions, and "<X> Municipality" names. Results are
    deduplicated and returned in first-seen order.

    Fix: the original deduped via ``list(set(...))``, which made the output
    order nondeterministic across runs; ``dict.fromkeys`` preserves
    insertion order while deduplicating.
    """
    if not context:
        return []
    # Common affiliation patterns (order determines output order on ties).
    patterns = [
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library|Center|Centre|Lab|Laboratory|Association|Council))',
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:of\s+[A-Z][a-zA-Z\s]+))',
        r'([A-Z][a-zA-Z\s]+Municipality)',
    ]
    found: List[str] = []
    for pattern in patterns:
        for match in re.findall(pattern, context):
            cleaned = match.strip()
            # Keep only plausible names; very short captures are noise.
            if len(cleaned) > 5:
                found.append(cleaned)
    return list(dict.fromkeys(found))
# =============================================================================
# GLAM-NER Type Mapping
# =============================================================================
def map_to_glam_ner_type(entity_type: str, name: str, context: str = "") -> str:
    """
    Map simple entity types (PER/ORG/LOC/URL/EMAIL) to GLAM-NER hypernyms.
    GLAM-NER v1.7.0 Entity Hypernyms:
    - AGT.PER: Person
    - AGT.STF: Staff member
    - GRP.HER: Heritage institution
    - GRP.EDU: Educational institution
    - GRP.ASS: Association/society
    - GRP.GOV: Government agency
    - TOP.SET: Settlement (city)
    - TOP.CTY: Country
    Unrecognized entity types are returned unchanged.
    """
    name_lower = name.lower()
    context_lower = context.lower() if context else ""

    if entity_type == "PER":
        # Role words in the context mark a staff member rather than a plain person.
        staff_markers = ('director', 'professor', 'curator', 'archivist', 'librarian', 'officer')
        return "AGT.STF" if any(marker in context_lower for marker in staff_markers) else "AGT.PER"

    if entity_type == "ORG":
        # First matching keyword group wins; order matters (e.g. "National
        # Archive" classifies as heritage before government).
        org_rules = (
            ("GRP.HER", ('archive', 'museum', 'library', 'heritage', 'memorial')),
            ("GRP.EDU", ('university', 'college', 'school', 'institute', 'academic')),
            ("GRP.ASS", ('association', 'society', 'council', 'committee', 'group', 'network')),
            ("GRP.GOV", ('municipality', 'government', 'ministry', 'national')),
        )
        for hypernym, keywords in org_rules:
            if any(keyword in name_lower for keyword in keywords):
                return hypernym
        return "GRP"

    if entity_type == "LOC":
        countries = ('Palestine', 'Lebanon', 'Gaza', 'Israel', 'Jordan',
                     'Syria', 'Egypt', 'Algeria', 'Malta', 'Ghana')
        if name in countries:
            return "TOP.CTY"
        settlements = ('Jerusalem', 'Ramallah', 'Hebron', 'Nablus', 'Bethlehem',
                       'Tulkarm', 'Beirut', 'New York', 'Amsterdam', 'London')
        if name in settlements or any(term in name_lower for term in ('city', 'town', 'village')):
            return "TOP.SET"
        return "TOP"

    if entity_type == "URL":
        return "APP.URL"
    if entity_type == "EMAIL":
        return "APP.EML"
    return entity_type
# =============================================================================
# Relationship Extraction
# =============================================================================
# Labels used to emit REL.ONT.ISA classification triples for organizations.
_ORG_TYPE_LABELS = {
    'GRP.HER': 'Heritage Institution',
    'GRP.EDU': 'Educational Institution',
    'GRP.ASS': 'Association',
    'GRP.GOV': 'Government Agency',
    'GRP': 'Organization',
}
# Patterns inferring employment relationships from a person's context blurb.
_WORK_PATTERNS = [
    (r'(?:at|with)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library))', "REL.SOC.WORKS_AT"),
    (r'(?:fellow|researcher|professor|director)\s+(?:at|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+)', "REL.SOC.WORKS_AT"),
]
# Places that mark the event's regional focus.
_FOCUS_REGIONS = ('Palestine', 'Lebanon', 'Gaza', 'Jerusalem', 'Ramallah')


def _person_triples(person: Dict, context_map: Dict[str, str], event_name: str,
                    provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Infer event-role, affiliation, and employment triples for one person."""
    name = person['clean_name']
    context = context_map.get(name, "")
    # Hoisted: the original recomputed context.lower() for every check.
    context_lower = context.lower()
    triples: List[EnrichedTriple] = []
    # Speaking at event
    if 'speaker' in context_lower or 'keynote' in context_lower:
        triples.append(EnrichedTriple(subject=name, predicate="REL.EVT.SPEAKS_AT",
                                      object=event_name, provenance=provenance, confidence=0.9))
    # Moderating panel
    if 'moderator' in context_lower:
        triples.append(EnrichedTriple(subject=name, predicate="REL.EVT.MODERATES",
                                      object=event_name, provenance=provenance, confidence=0.9))
    # Affiliations pre-extracted onto the entity dict (guard against None).
    for affiliation in person.get('affiliations') or []:
        triples.append(EnrichedTriple(subject=name, predicate="REL.SOC.AFFILIATED_WITH",
                                      object=affiliation, provenance=provenance, confidence=0.75))
    # Work relationships inferred from the raw (case-sensitive) context.
    for pattern, predicate in _WORK_PATTERNS:
        for match in re.findall(pattern, context):
            if len(match.strip()) > 5:
                triples.append(EnrichedTriple(subject=name, predicate=predicate,
                                              object=match.strip(), provenance=provenance,
                                              confidence=0.7))
    return triples


def _organization_triples(org: Dict, event_name: str,
                          provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Emit participation and ISA-classification triples for one organization."""
    name = org['clean_name']
    triples = [EnrichedTriple(subject=name, predicate="REL.EVT.PARTICIPATES",
                              object=event_name, provenance=provenance, confidence=0.85)]
    label = _ORG_TYPE_LABELS.get(org['entity_type'])
    if label is not None:
        triples.append(EnrichedTriple(subject=name, predicate="REL.ONT.ISA",
                                      object=label, provenance=provenance, confidence=0.9))
    return triples


def _location_triples(location: Dict, event_name: str,
                      provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Emit a focus-region triple when the event centres on this place."""
    name = location['clean_name']
    if name not in _FOCUS_REGIONS:
        return []
    return [EnrichedTriple(subject=event_name, predicate="REL.SPA.FOCUS_REGION",
                           object=name, provenance=provenance, confidence=0.9)]


def extract_relationships(entities: List[Dict], context_map: Dict[str, str]) -> List[EnrichedTriple]:
    """
    Extract additional relationship types from entities and context.
    Relationship types:
    - REL.EVT.SPEAKS_AT: Person speaks at event
    - REL.EVT.MODERATES: Person moderates panel
    - REL.EVT.PARTICIPATES: Organization participates in event
    - REL.SOC.MEMBER_OF: Person is member of organization
    - REL.SOC.AFFILIATED_WITH: Person affiliated with institution
    - REL.SOC.WORKS_AT: Person works at institution
    - REL.SPA.LOCATED_IN: Entity located in place
    - REL.ONT.ISA: Entity is instance of type

    Triples are emitted persons-first, then organizations, then locations,
    matching the original output order.
    """
    event_name = "Resilient Communities Resilient Archives"
    # A single shared provenance record is attached to every emitted triple.
    provenance = EnrichedProvenance(
        source_url="https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives",
        extraction_method="Enriched extraction with relationship inference",
        extraction_date=datetime.now(timezone.utc).isoformat(),
        confidence=0.85
    )
    triples: List[EnrichedTriple] = []
    for person in [e for e in entities if e['entity_type'].startswith('AGT')]:
        triples.extend(_person_triples(person, context_map, event_name, provenance))
    for org in [e for e in entities if e['entity_type'].startswith('GRP')]:
        triples.extend(_organization_triples(org, event_name, provenance))
    for loc in [e for e in entities if e['entity_type'].startswith('TOP')]:
        triples.extend(_location_triples(loc, event_name, provenance))
    return triples
# =============================================================================
# Cross-Reference with Palestinian GLAM Data
# =============================================================================
def load_palestinian_institutions(claims_path: Optional[Path] = None) -> Dict[str, Dict]:
    """Load Palestinian GLAM institutions keyed by lowercased name.

    Args:
        claims_path: Path to the claims JSON file. Defaults to the module
            constant PALESTINIAN_CLAIMS (generalized from the previously
            hard-coded path so the loader is testable and reusable).

    Returns:
        Mapping of lowercased institution name -> original claim dict;
        empty when the file does not exist.
    """
    path = PALESTINIAN_CLAIMS if claims_path is None else claims_path
    if not path.exists():
        print(f" Palestinian claims file not found: {path}")
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions: Dict[str, Dict] = {}
    for claim in data.get('entity_claims', []):
        # Palestinian claims use 'text' not 'name'
        name = claim.get('text', '') or claim.get('name', '')
        if name:
            institutions[name.lower()] = claim
    print(f" Loaded {len(institutions)} Palestinian institutions")
    return institutions
def find_cross_references(entities: List[Dict], palestinian_institutions: Dict[str, Dict]) -> List[Dict]:
    """Match Archives Lab entities against known Palestinian GLAM institutions.

    Exact (lowercased) name matches win; otherwise a substring containment
    check in either direction records at most one 'partial' match per entity.
    """
    matches: List[Dict] = []
    for entity in entities:
        key = entity.get('clean_name', '').lower()
        # Direct match on the lowercased name.
        if key in palestinian_institutions:
            matches.append({
                'archiveslab_entity': entity['clean_name'],
                'palestinian_match': palestinian_institutions[key],
                'match_type': 'exact',
            })
            continue
        # Short names are too ambiguous for substring matching.
        if len(key) <= 5:
            continue
        for inst_name, inst_data in palestinian_institutions.items():
            if key in inst_name or inst_name in key:
                matches.append({
                    'archiveslab_entity': entity['clean_name'],
                    'palestinian_match': inst_data,
                    'match_type': 'partial',
                })
                break  # one partial match per entity is enough
    return matches
# =============================================================================
# Main Processing
# =============================================================================
def process_claims() -> None:
    """Run the full Archives Lab enrichment pipeline.

    Steps:
        1. Load the raw claims from ARCHIVESLAB_DIR/archiveslab_claims.json.
        2. Clean entity names, dedupe, and map to GLAM-NER hypernyms.
        3. Infer enriched relationship triples.
        4. Cross-reference entities against Palestinian GLAM institutions.
        5. Write the enriched dataset plus statistics to OUTPUT_FILE.

    Progress and summaries are printed to stdout throughout.
    """
    # Load original claims (assumes the extraction script has already run).
    claims_file = ARCHIVESLAB_DIR / 'archiveslab_claims.json'
    with open(claims_file, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
    print("=" * 60)
    print("ARCHIVES LAB CLAIMS ENRICHMENT")
    print("=" * 60)
    print(f"\nOriginal entities: {len(original_data['entity_claims'])}")
    print(f"Original triples: {len(original_data['triples'])}")
    # Step 1: Clean entity names
    print("\n--- Step 1: Cleaning entity names ---")
    cleaned_entities = []
    # clean_name -> raw context; doubles as the dedupe set (first occurrence wins).
    context_map = {}
    for claim in original_data['entity_claims']:
        original_name = claim['name']
        clean_name = clean_entity_name(original_name)
        context = claim.get('context', '')
        # Skip if name is too short after cleaning (likely an artifact).
        if len(clean_name) < 3:
            continue
        # Skip duplicate clean names
        if clean_name in context_map:
            continue
        context_map[clean_name] = context
        # Map to GLAM-NER type
        glam_type = map_to_glam_ner_type(claim['entity_type'], clean_name, context)
        # Extract affiliations and roles
        affiliations = extract_affiliation_from_context(context)
        title = extract_title_from_context(context)
        roles = [title] if title else []
        enriched_claim = EnrichedEntityClaim(
            entity_id=claim['entity_id'],
            entity_type=glam_type,
            name=original_name,
            clean_name=clean_name,
            context=context[:200] if context else None,  # cap stored context at 200 chars
            provenance=EnrichedProvenance(
                source_url=original_data['source_url'],
                extraction_method="Regex + Playwright",
                extraction_date=claim['provenance']['extraction_date'],
                confidence=claim['provenance']['confidence'],
                enrichment_date=datetime.now(timezone.utc).isoformat(),
                enrichment_method="GLAM-NER type mapping + name cleanup"
            ),
            affiliations=affiliations if affiliations else None,
            roles=roles if roles else None
        )
        cleaned_entities.append(enriched_claim.to_dict())
    print(f"Cleaned entities: {len(cleaned_entities)} (removed {len(original_data['entity_claims']) - len(cleaned_entities)} duplicates/artifacts)")
    # Show sample cleanups (up to 5 names that actually changed).
    print("\nSample name cleanups:")
    samples = [e for e in cleaned_entities if e['name'] != e['clean_name']][:5]
    for s in samples:
        print(f" '{s['name'][:40]}...' -> '{s['clean_name']}'")
    # Step 2: Map to GLAM-NER types — report the type distribution.
    print("\n--- Step 2: GLAM-NER type distribution ---")
    type_counts = {}
    for e in cleaned_entities:
        t = e['entity_type']
        type_counts[t] = type_counts.get(t, 0) + 1
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {count}")
    # Step 3: Extract relationships
    print("\n--- Step 3: Extracting enriched relationships ---")
    enriched_triples = extract_relationships(cleaned_entities, context_map)
    print(f"Generated {len(enriched_triples)} enriched triples")
    # Relationship type breakdown (sorted by descending count).
    rel_counts = {}
    for t in enriched_triples:
        p = t.predicate
        rel_counts[p] = rel_counts.get(p, 0) + 1
    print("\nRelationship types:")
    for p, count in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {count}")
    # Step 4: Cross-reference with Palestinian GLAM data (best-effort:
    # silently skipped when the claims file is absent).
    print("\n--- Step 4: Cross-referencing with Palestinian GLAM data ---")
    palestinian_institutions = load_palestinian_institutions()
    if palestinian_institutions:
        cross_refs = find_cross_references(cleaned_entities, palestinian_institutions)
        print(f"Found {len(cross_refs)} cross-references with Palestinian GLAM institutions")
        if cross_refs:
            print("\nCross-references found:")
            for ref in cross_refs[:10]:
                match_name = ref['palestinian_match'].get('name', 'Unknown')
                print(f" {ref['archiveslab_entity']} <-> {match_name} ({ref['match_type']})")
    else:
        cross_refs = []
        print("Palestinian GLAM data not found, skipping cross-referencing")
    # Save enriched data (entities + triples + cross-refs + summary stats).
    print("\n--- Saving enriched data ---")
    enriched_data = {
        'source_url': original_data['source_url'],
        'fetch_timestamp': original_data['fetch_timestamp'],
        'extraction_timestamp': original_data['extraction_timestamp'],
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'entity_claims': cleaned_entities,
        'triples': [t.to_dict() for t in enriched_triples],
        'cross_references': cross_refs,
        'statistics': {
            'original_entities': len(original_data['entity_claims']),
            'cleaned_entities': len(cleaned_entities),
            'original_triples': len(original_data['triples']),
            'enriched_triples': len(enriched_triples),
            'cross_references': len(cross_refs),
            'entity_types': type_counts,
            'relationship_types': rel_counts
        }
    }
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, indent=2, ensure_ascii=False)
    print(f"Saved: {OUTPUT_FILE}")
    print("\n" + "=" * 60)
    print("ENRICHMENT COMPLETE")
    print("=" * 60)
if __name__ == '__main__':
    # Script entry point; see the module docstring for the PYTHONPATH invocation.
    process_claims()