- Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0. - Replaced regex-based extraction with generative LLM inference. - Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics. - Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons. - Results and statistics are saved in JSON format for further analysis.
582 lines
20 KiB
Python
582 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Clean and enrich Archives Lab extraction with:
|
|
1. Name cleanup (remove markdown artifacts)
|
|
2. LLM-based entity extraction using GLAM-NER types
|
|
3. Additional relationship types
|
|
4. Cross-referencing with Palestinian GLAM data
|
|
|
|
Usage:
|
|
PYTHONPATH=src python scripts/clean_and_enrich_archiveslab.py
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, asdict, field
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
# =============================================================================
|
|
# Configuration
|
|
# =============================================================================
|
|
|
|
# Input/output locations.
# NOTE(review): absolute, user-specific paths — consider moving to CLI args or
# environment variables so the script runs outside this machine.
ARCHIVESLAB_DIR = Path('/Users/kempersc/apps/glam/data/extracted/archiveslab')
PALESTINIAN_CLAIMS = Path('/Users/kempersc/apps/glam/data/extracted/palestinian_glam_claims.json')

# Debug: print path
# NOTE(review): these prints run at import time (module-level side effect);
# consider moving them into process_claims() or behind a verbosity flag.
print(f"Looking for Palestinian claims at: {PALESTINIAN_CLAIMS}")
print(f"File exists: {PALESTINIAN_CLAIMS.exists()}")

# Enriched results are written next to the source claims file.
OUTPUT_FILE = ARCHIVESLAB_DIR / 'archiveslab_claims_enriched.json'
|
|
|
|
# =============================================================================
|
|
# Data Classes
|
|
# =============================================================================
|
|
|
|
@dataclass
class EnrichedProvenance:
    """Provenance record for an extracted claim, with optional enrichment info.

    The enrichment_* fields stay None for claims that were extracted but
    never post-processed; to_dict() omits them in that case.
    """
    source_url: str
    extraction_method: str
    extraction_date: str
    confidence: float = 0.85
    enrichment_date: Optional[str] = None
    enrichment_method: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, dropping any field whose value is None."""
        serialized = asdict(self)
        return {key: value for key, value in serialized.items() if value is not None}
|
|
|
|
|
|
@dataclass
class EnrichedEntityClaim:
    """Entity claim annotated with a GLAM-NER hypernym type and enrichment data."""
    entity_id: str
    entity_type: str  # GLAM-NER hypernym (e.g. GRP.HER, AGT.PER)
    name: str
    clean_name: str  # name with markdown artifacts stripped
    context: Optional[str] = None
    provenance: Optional[EnrichedProvenance] = None
    metadata: Optional[Dict[str, Any]] = None
    affiliations: Optional[List[str]] = None  # organizations the person is affiliated with
    roles: Optional[List[str]] = None  # roles / titles

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict.

        The four required fields are always present; optional fields are
        included only when truthy, and provenance is nested via its own
        to_dict().
        """
        payload: Dict[str, Any] = {
            'entity_id': self.entity_id,
            'entity_type': self.entity_type,
            'name': self.name,
            'clean_name': self.clean_name,
        }
        if self.context:
            payload['context'] = self.context
        if self.provenance:
            payload['provenance'] = self.provenance.to_dict()
        for optional_field in ('metadata', 'affiliations', 'roles'):
            value = getattr(self, optional_field)
            if value:
                payload[optional_field] = value
        return payload
|
|
|
|
|
|
@dataclass
class EnrichedTriple:
    """Subject-predicate-object triple with provenance and a confidence score."""
    subject: str
    predicate: str
    object: str
    provenance: Optional[EnrichedProvenance] = None
    confidence: float = 0.85

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; provenance is nested only when present."""
        serialized: Dict[str, Any] = {
            'subject': self.subject,
            'predicate': self.predicate,
            'object': self.object,
            'confidence': self.confidence,
        }
        if self.provenance:
            serialized['provenance'] = self.provenance.to_dict()
        return serialized
|
|
|
|
|
|
# =============================================================================
|
|
# Name Cleanup Functions
|
|
# =============================================================================
|
|
|
|
def clean_entity_name(name: str) -> str:
    """
    Strip markdown/section artifacts from an extracted entity name.

    Examples:
        "Mahmoud Balaawy\n\nTitle" -> "Mahmoud Balaawy"
        "Ana Roeschley \n\n \n\nTitle" -> "Ana Roeschley"
        "Biographies\n\nAbdallah Omar" -> "Abdallah Omar"
    """
    # (pattern, replacement) pairs, applied strictly in this order:
    # label removals first, then whitespace normalisation.
    substitutions = (
        (r'\s*\n+\s*Title\s*:?\s*', ''),        # drop "Title:" section labels
        (r'\s*\n+\s*Abstract\s*:?\s*', ''),     # drop "Abstract:" section labels
        (r'^Biographies\s*\n+\s*', ''),         # drop "Biographies" heading prefix
        (r'^Speakers?\s*:\s*', ''),             # drop "Speaker(s):" prefix
        (r'^Moderators?\s*:\s*', ''),           # drop "Moderator(s):" prefix
        (r'^Keynote\s+Speakers?\s*:\s*', ''),   # drop "Keynote Speaker(s):" prefix
        (r'\s*\n+\s*', ' '),                    # collapse newline runs to one space
        (r'\s+', ' '),                          # collapse remaining whitespace
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)

    # Trim, then remove trailing list punctuation left over from markup.
    name = name.strip()
    return re.sub(r'[,;:]+$', '', name)
|
|
|
|
|
|
def extract_title_from_context(context: str) -> Optional[str]:
    """Return the first honorific/professional title in *context*, sans dot.

    Matches Dr./Prof./Mr./Ms./Mrs./Eng. followed by whitespace; returns e.g.
    "Dr", or None when the context is empty or contains no such title.
    """
    if not context:
        return None

    found = re.search(r'\b(Dr\.|Prof\.|Mr\.|Ms\.|Mrs\.|Eng\.)\s', context)
    return found.group(1).rstrip('.') if found else None
|
|
|
|
|
|
def extract_affiliation_from_context(context: str) -> List[str]:
|
|
"""Extract organizational affiliations from context."""
|
|
affiliations = []
|
|
|
|
if not context:
|
|
return affiliations
|
|
|
|
# Common affiliation patterns
|
|
patterns = [
|
|
r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library|Center|Centre|Lab|Laboratory|Association|Council))',
|
|
r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:of\s+[A-Z][a-zA-Z\s]+))',
|
|
r'([A-Z][a-zA-Z\s]+Municipality)',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
matches = re.findall(pattern, context)
|
|
affiliations.extend([m.strip() for m in matches if len(m.strip()) > 5])
|
|
|
|
return list(set(affiliations))
|
|
|
|
|
|
# =============================================================================
|
|
# GLAM-NER Type Mapping
|
|
# =============================================================================
|
|
|
|
def map_to_glam_ner_type(entity_type: str, name: str, context: str = "") -> str:
    """
    Map a coarse entity type (PER/ORG/LOC/URL/EMAIL) to a GLAM-NER v1.7.0 hypernym.

    Produced hypernyms:
      AGT.PER person, AGT.STF staff member, GRP.HER heritage institution,
      GRP.EDU educational institution, GRP.ASS association/society,
      GRP.GOV government agency, TOP.SET settlement (city), TOP.CTY country,
      APP.URL url, APP.EML email.

    Falls back to the bare group (GRP/TOP) when no keyword matches, and
    returns unknown entity_type values unchanged.
    """
    lowered_name = name.lower()
    lowered_context = context.lower() if context else ""

    if entity_type == "PER":
        # Role keywords in the surrounding context signal a staff member.
        staff_markers = ('director', 'professor', 'curator', 'archivist', 'librarian', 'officer')
        if any(marker in lowered_context for marker in staff_markers):
            return "AGT.STF"
        return "AGT.PER"

    if entity_type == "ORG":
        # Keyword groups checked in priority order: heritage beats education,
        # which beats association, which beats government.
        org_rules = (
            ("GRP.HER", ('archive', 'museum', 'library', 'heritage', 'memorial')),
            ("GRP.EDU", ('university', 'college', 'school', 'institute', 'academic')),
            ("GRP.ASS", ('association', 'society', 'council', 'committee', 'group', 'network')),
            ("GRP.GOV", ('municipality', 'government', 'ministry', 'national')),
        )
        for hypernym, keywords in org_rules:
            if any(keyword in lowered_name for keyword in keywords):
                return hypernym
        return "GRP"

    if entity_type == "LOC":
        countries = ('Palestine', 'Lebanon', 'Gaza', 'Israel', 'Jordan',
                     'Syria', 'Egypt', 'Algeria', 'Malta', 'Ghana')
        if name in countries:
            return "TOP.CTY"
        known_cities = (
            'Jerusalem', 'Ramallah', 'Hebron', 'Nablus', 'Bethlehem', 'Tulkarm',
            'Beirut', 'New York', 'Amsterdam', 'London',
        )
        if name in known_cities or any(term in lowered_name for term in ('city', 'town', 'village')):
            return "TOP.SET"
        return "TOP"

    if entity_type == "URL":
        return "APP.URL"

    if entity_type == "EMAIL":
        return "APP.EML"

    # Unknown coarse type: pass through unchanged.
    return entity_type
|
|
|
|
|
|
# =============================================================================
|
|
# Relationship Extraction
|
|
# =============================================================================
|
|
|
|
# Mapping from GLAM-NER organization hypernyms to human-readable ISA labels.
_ORG_TYPE_LABELS = {
    'GRP.HER': 'Heritage Institution',
    'GRP.EDU': 'Educational Institution',
    'GRP.ASS': 'Association',
    'GRP.GOV': 'Government Agency',
    'GRP': 'Organization',
}

# Locations treated as a focus region of the event.
_EVENT_FOCUS_REGIONS = {'Palestine', 'Lebanon', 'Gaza', 'Jerusalem', 'Ramallah'}

# Heuristic "works at" patterns, matched against the raw (case-sensitive) context.
_WORK_PATTERNS = [
    (r'(?:at|with)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library))', "REL.SOC.WORKS_AT"),
    (r'(?:fellow|researcher|professor|director)\s+(?:at|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+)', "REL.SOC.WORKS_AT"),
]


def _person_triples(person: Dict, context: str, event_name: str,
                    provenance: EnrichedProvenance) -> List[EnrichedTriple]:
    """Triples for one person: event roles, affiliations, and workplaces."""
    name = person['clean_name']
    context_lower = context.lower()  # hoisted: previously recomputed per keyword test
    triples: List[EnrichedTriple] = []

    # Speaking at event (keynote counts as speaking).
    if 'speaker' in context_lower or 'keynote' in context_lower:
        triples.append(EnrichedTriple(
            subject=name, predicate="REL.EVT.SPEAKS_AT", object=event_name,
            provenance=provenance, confidence=0.9))

    # Moderating a panel.
    if 'moderator' in context_lower:
        triples.append(EnrichedTriple(
            subject=name, predicate="REL.EVT.MODERATES", object=event_name,
            provenance=provenance, confidence=0.9))

    # Affiliations extracted earlier and stored on the entity dict.
    for affiliation in person.get('affiliations', []):
        triples.append(EnrichedTriple(
            subject=name, predicate="REL.SOC.AFFILIATED_WITH", object=affiliation,
            provenance=provenance, confidence=0.75))

    # Workplace relationships inferred from the context text.
    for pattern, predicate in _WORK_PATTERNS:
        for match in re.findall(pattern, context):
            workplace = match.strip()
            if len(workplace) > 5:  # drop spurious short matches
                triples.append(EnrichedTriple(
                    subject=name, predicate=predicate, object=workplace,
                    provenance=provenance, confidence=0.7))
    return triples


def _org_triples(org: Dict, event_name: str,
                 provenance: EnrichedProvenance) -> List[EnrichedTriple]:
    """Triples for one organization: event participation and ISA classification."""
    name = org['clean_name']
    triples = [EnrichedTriple(
        subject=name, predicate="REL.EVT.PARTICIPATES", object=event_name,
        provenance=provenance, confidence=0.85)]

    label = _ORG_TYPE_LABELS.get(org['entity_type'])
    if label is not None:
        triples.append(EnrichedTriple(
            subject=name, predicate="REL.ONT.ISA", object=label,
            provenance=provenance, confidence=0.9))
    return triples


def extract_relationships(entities: List[Dict], context_map: Dict[str, str]) -> List[EnrichedTriple]:
    """
    Extract additional relationship triples from entity dicts and their contexts.

    Relationship types emitted:
    - REL.EVT.SPEAKS_AT: Person speaks at event
    - REL.EVT.MODERATES: Person moderates panel
    - REL.EVT.PARTICIPATES: Organization participates in event
    - REL.SOC.AFFILIATED_WITH: Person affiliated with institution
    - REL.SOC.WORKS_AT: Person works at institution
    - REL.ONT.ISA: Entity is instance of type
    - REL.SPA.FOCUS_REGION: Event focuses on region

    Output order is persons first, then organizations, then locations,
    matching the original grouped traversal.  All triples share a single
    provenance record stamped with the current UTC time.
    """
    event_name = "Resilient Communities Resilient Archives"

    provenance = EnrichedProvenance(
        source_url="https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives",
        extraction_method="Enriched extraction with relationship inference",
        extraction_date=datetime.now(timezone.utc).isoformat(),
        confidence=0.85
    )

    triples: List[EnrichedTriple] = []

    for person in (e for e in entities if e['entity_type'].startswith('AGT')):
        context = context_map.get(person['clean_name'], "")
        triples.extend(_person_triples(person, context, event_name, provenance))

    for org in (e for e in entities if e['entity_type'].startswith('GRP')):
        triples.extend(_org_triples(org, event_name, provenance))

    for loc in (e for e in entities if e['entity_type'].startswith('TOP')):
        if loc['clean_name'] in _EVENT_FOCUS_REGIONS:
            triples.append(EnrichedTriple(
                subject=event_name, predicate="REL.SPA.FOCUS_REGION",
                object=loc['clean_name'],
                provenance=provenance, confidence=0.9))

    return triples
|
|
|
|
|
|
# =============================================================================
|
|
# Cross-Reference with Palestinian GLAM Data
|
|
# =============================================================================
|
|
|
|
def load_palestinian_institutions() -> Dict[str, Dict]:
    """Load Palestinian GLAM institution claims, keyed by lower-cased name.

    Reads PALESTINIAN_CLAIMS; returns an empty dict (after printing a
    warning) when the file is missing.
    """
    if not PALESTINIAN_CLAIMS.exists():
        print(f" Palestinian claims file not found: {PALESTINIAN_CLAIMS}")
        return {}

    with open(PALESTINIAN_CLAIMS, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    institutions: Dict[str, Dict] = {}
    for claim in payload.get('entity_claims', []):
        # Palestinian claims store the surface form under 'text', not 'name'.
        label = claim.get('text', '') or claim.get('name', '')
        if label:
            institutions[label.lower()] = claim

    print(f" Loaded {len(institutions)} Palestinian institutions")
    return institutions
|
|
|
|
|
|
def find_cross_references(entities: List[Dict], palestinian_institutions: Dict[str, Dict]) -> List[Dict]:
    """Match ArchivesLab entities against Palestinian GLAM institutions by name.

    An exact (case-insensitive) name match wins; otherwise the first
    substring match in either direction is taken, skipping names of five
    characters or fewer to avoid spurious hits.  At most one cross-reference
    is produced per entity.
    """
    matches: List[Dict] = []

    for entity in entities:
        display_name = entity.get('clean_name', '')
        lookup = display_name.lower()

        # Exact match takes precedence.
        exact = palestinian_institutions.get(lookup)
        if exact is not None:
            matches.append({
                'archiveslab_entity': display_name,
                'palestinian_match': exact,
                'match_type': 'exact',
            })
            continue

        # Partial match: entity name contains, or is contained in, an
        # institution name.  Length guard hoisted out of the loop.
        if len(lookup) > 5:
            for inst_name, inst_data in palestinian_institutions.items():
                if lookup in inst_name or inst_name in lookup:
                    matches.append({
                        'archiveslab_entity': display_name,
                        'palestinian_match': inst_data,
                        'match_type': 'partial',
                    })
                    break

    return matches
|
|
|
|
|
|
# =============================================================================
|
|
# Main Processing
|
|
# =============================================================================
|
|
|
|
def process_claims() -> None:
    """Run the full enrichment pipeline over the ArchivesLab claims file.

    Reads ``archiveslab_claims.json`` from ARCHIVESLAB_DIR (expected keys:
    'entity_claims', 'triples', 'source_url', 'fetch_timestamp',
    'extraction_timestamp' — raises KeyError if absent), then:
      1. cleans entity names and drops short/duplicate ones,
      2. maps coarse entity types to GLAM-NER hypernyms,
      3. derives enriched relationship triples,
      4. cross-references entities against Palestinian GLAM institutions,
    and writes the enriched result plus statistics to OUTPUT_FILE.
    Progress summaries are printed to stdout.
    """

    # Load original claims
    claims_file = ARCHIVESLAB_DIR / 'archiveslab_claims.json'
    with open(claims_file, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    print("=" * 60)
    print("ARCHIVES LAB CLAIMS ENRICHMENT")
    print("=" * 60)
    print(f"\nOriginal entities: {len(original_data['entity_claims'])}")
    print(f"Original triples: {len(original_data['triples'])}")

    # Step 1: Clean entity names
    print("\n--- Step 1: Cleaning entity names ---")
    cleaned_entities = []
    # context_map doubles as the duplicate filter: first occurrence of each
    # cleaned name wins; it is also passed to extract_relationships later.
    context_map = {}

    for claim in original_data['entity_claims']:
        original_name = claim['name']
        clean_name = clean_entity_name(original_name)
        context = claim.get('context', '')

        # Skip if name is too short after cleaning
        if len(clean_name) < 3:
            continue

        # Skip duplicate clean names
        if clean_name in context_map:
            continue

        context_map[clean_name] = context

        # Map to GLAM-NER type
        glam_type = map_to_glam_ner_type(claim['entity_type'], clean_name, context)

        # Extract affiliations and roles
        affiliations = extract_affiliation_from_context(context)
        title = extract_title_from_context(context)
        roles = [title] if title else []

        enriched_claim = EnrichedEntityClaim(
            entity_id=claim['entity_id'],
            entity_type=glam_type,
            name=original_name,
            clean_name=clean_name,
            # Context is truncated to 200 chars to keep the output compact.
            context=context[:200] if context else None,
            provenance=EnrichedProvenance(
                source_url=original_data['source_url'],
                extraction_method="Regex + Playwright",
                extraction_date=claim['provenance']['extraction_date'],
                confidence=claim['provenance']['confidence'],
                enrichment_date=datetime.now(timezone.utc).isoformat(),
                enrichment_method="GLAM-NER type mapping + name cleanup"
            ),
            # Empty lists are stored as None so to_dict() omits them.
            affiliations=affiliations if affiliations else None,
            roles=roles if roles else None
        )
        cleaned_entities.append(enriched_claim.to_dict())

    print(f"Cleaned entities: {len(cleaned_entities)} (removed {len(original_data['entity_claims']) - len(cleaned_entities)} duplicates/artifacts)")

    # Show sample cleanups
    print("\nSample name cleanups:")
    samples = [e for e in cleaned_entities if e['name'] != e['clean_name']][:5]
    for s in samples:
        print(f" '{s['name'][:40]}...' -> '{s['clean_name']}'")

    # Step 2: Map to GLAM-NER types
    print("\n--- Step 2: GLAM-NER type distribution ---")
    type_counts = {}
    for e in cleaned_entities:
        t = e['entity_type']
        type_counts[t] = type_counts.get(t, 0) + 1

    # Sorted by descending count.
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {count}")

    # Step 3: Extract relationships
    print("\n--- Step 3: Extracting enriched relationships ---")
    enriched_triples = extract_relationships(cleaned_entities, context_map)
    print(f"Generated {len(enriched_triples)} enriched triples")

    # Relationship type breakdown
    rel_counts = {}
    for t in enriched_triples:
        p = t.predicate
        rel_counts[p] = rel_counts.get(p, 0) + 1

    print("\nRelationship types:")
    for p, count in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {count}")

    # Step 4: Cross-reference with Palestinian GLAM data
    print("\n--- Step 4: Cross-referencing with Palestinian GLAM data ---")
    palestinian_institutions = load_palestinian_institutions()

    if palestinian_institutions:
        cross_refs = find_cross_references(cleaned_entities, palestinian_institutions)
        print(f"Found {len(cross_refs)} cross-references with Palestinian GLAM institutions")

        if cross_refs:
            print("\nCross-references found:")
            # Show at most the first ten matches.
            for ref in cross_refs[:10]:
                match_name = ref['palestinian_match'].get('name', 'Unknown')
                print(f" {ref['archiveslab_entity']} <-> {match_name} ({ref['match_type']})")
    else:
        cross_refs = []
        print("Palestinian GLAM data not found, skipping cross-referencing")

    # Save enriched data
    print("\n--- Saving enriched data ---")

    enriched_data = {
        'source_url': original_data['source_url'],
        'fetch_timestamp': original_data['fetch_timestamp'],
        'extraction_timestamp': original_data['extraction_timestamp'],
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'entity_claims': cleaned_entities,
        'triples': [t.to_dict() for t in enriched_triples],
        'cross_references': cross_refs,
        'statistics': {
            'original_entities': len(original_data['entity_claims']),
            'cleaned_entities': len(cleaned_entities),
            'original_triples': len(original_data['triples']),
            'enriched_triples': len(enriched_triples),
            'cross_references': len(cross_refs),
            'entity_types': type_counts,
            'relationship_types': rel_counts
        }
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, indent=2, ensure_ascii=False)

    print(f"Saved: {OUTPUT_FILE}")
    print("\n" + "=" * 60)
    print("ENRICHMENT COMPLETE")
    print("=" * 60)
|
|
|
|
|
|
# Script entry point: run the enrichment pipeline when executed directly.
if __name__ == '__main__':
    process_claims()
|