# glam/scripts/clean_and_enrich_archiveslab.py
# kempersc 55e2cd2340 feat: implement LLM-based extraction for Archives Lab content
# - Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0.
# - Replaced regex-based extraction with generative LLM inference.
# - Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics.
# - Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons.
# - Results and statistics are saved in JSON format for further analysis.
# 2025-12-05 23:16:21 +01:00
#
# 582 lines, 20 KiB, Python
#!/usr/bin/env python3
"""
Clean and enrich Archives Lab extraction with:
1. Name cleanup (remove markdown artifacts)
2. LLM-based entity extraction using GLAM-NER types
3. Additional relationship types
4. Cross-referencing with Palestinian GLAM data
Usage:
PYTHONPATH=src python scripts/clean_and_enrich_archiveslab.py
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import Any, Dict, List, Optional, Set, Tuple
# =============================================================================
# Configuration
# =============================================================================
# Directory holding the Archives Lab extraction artifacts (input and output).
ARCHIVESLAB_DIR = Path('/Users/kempersc/apps/glam/data/extracted/archiveslab')
# Palestinian GLAM claims file used for cross-referencing; the pipeline
# tolerates its absence (cross-referencing is then skipped).
PALESTINIAN_CLAIMS = Path('/Users/kempersc/apps/glam/data/extracted/palestinian_glam_claims.json')
# Debug: print path (runs at import time; NOTE(review): leftover debug output — consider removing)
print(f"Looking for Palestinian claims at: {PALESTINIAN_CLAIMS}")
print(f"File exists: {PALESTINIAN_CLAIMS.exists()}")
# Enriched output is written next to the input claims file.
OUTPUT_FILE = ARCHIVESLAB_DIR / 'archiveslab_claims_enriched.json'
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class EnrichedProvenance:
    """Provenance record for a claim, with optional enrichment metadata."""
    source_url: str
    extraction_method: str
    extraction_date: str
    confidence: float = 0.85
    enrichment_date: Optional[str] = None
    enrichment_method: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, omitting fields that are None."""
        serialized: Dict[str, Any] = {}
        for field_name, value in asdict(self).items():
            if value is not None:
                serialized[field_name] = value
        return serialized
@dataclass
class EnrichedEntityClaim:
    """An entity claim carrying a GLAM-NER hypernym plus cleaned-name metadata."""
    entity_id: str
    entity_type: str  # GLAM-NER hypernym (e.g., GRP.HER, AGT.PER)
    name: str
    clean_name: str  # Cleaned version of name
    context: Optional[str] = None
    provenance: Optional[EnrichedProvenance] = None
    metadata: Optional[Dict[str, Any]] = None
    affiliations: Optional[List[str]] = None  # Organizations person is affiliated with
    roles: Optional[List[str]] = None  # Roles/titles

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; optional fields appear only when truthy."""
        payload: Dict[str, Any] = {
            'entity_id': self.entity_id,
            'entity_type': self.entity_type,
            'name': self.name,
            'clean_name': self.clean_name,
        }
        if self.context:
            payload['context'] = self.context
        if self.provenance:
            # Provenance serializes through its own None-dropping to_dict().
            payload['provenance'] = self.provenance.to_dict()
        for optional_key in ('metadata', 'affiliations', 'roles'):
            value = getattr(self, optional_key)
            if value:
                payload[optional_key] = value
        return payload
@dataclass
class EnrichedTriple:
    """A subject-predicate-object triple with confidence and optional provenance."""
    subject: str
    predicate: str
    object: str
    provenance: Optional[EnrichedProvenance] = None
    confidence: float = 0.85

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; provenance is nested only when present."""
        serialized: Dict[str, Any] = {
            'subject': self.subject,
            'predicate': self.predicate,
            'object': self.object,
            'confidence': self.confidence,
        }
        if self.provenance:
            serialized['provenance'] = self.provenance.to_dict()
        return serialized
# =============================================================================
# Name Cleanup Functions
# =============================================================================
def clean_entity_name(name: str) -> str:
"""
Clean entity name by removing markdown artifacts.
Examples:
"Mahmoud Balaawy\n\nTitle" -> "Mahmoud Balaawy"
"Ana Roeschley \n\n \n\nTitle" -> "Ana Roeschley"
"Biographies\n\nAbdallah Omar" -> "Abdallah Omar"
"""
# Remove "Title:" and variations
name = re.sub(r'\s*\n+\s*Title\s*:?\s*', '', name)
# Remove "Abstract:" and variations
name = re.sub(r'\s*\n+\s*Abstract\s*:?\s*', '', name)
# Remove "Biographies" prefix
name = re.sub(r'^Biographies\s*\n+\s*', '', name)
# Remove "Speaker:" prefix
name = re.sub(r'^Speakers?\s*:\s*', '', name)
# Remove "Moderator:" prefix
name = re.sub(r'^Moderators?\s*:\s*', '', name)
# Remove "Keynote Speaker:" prefix
name = re.sub(r'^Keynote\s+Speakers?\s*:\s*', '', name)
# Clean up multiple newlines and whitespace
name = re.sub(r'\s*\n+\s*', ' ', name)
name = re.sub(r'\s+', ' ', name)
# Remove leading/trailing whitespace
name = name.strip()
# Remove trailing punctuation
name = re.sub(r'[,;:]+$', '', name)
return name
def extract_title_from_context(context: str) -> Optional[str]:
    """Return an honorific title (without the trailing dot) found in *context*.

    Looks for Dr./Prof./Mr./Ms./Mrs./Eng. followed by whitespace; returns
    None for empty input or when no title is present.
    """
    if not context:
        return None
    match = re.search(r'\b(Dr\.|Prof\.|Mr\.|Ms\.|Mrs\.|Eng\.)\s', context)
    return match.group(1).rstrip('.') if match else None
def extract_affiliation_from_context(context: str) -> List[str]:
    """Extract organizational affiliations mentioned in *context*.

    Matches "at/from/of <Org>" phrases ending in institution keywords,
    "<X> of <Y>" constructions, and "<X> Municipality" names. Results are
    deduplicated and returned in first-seen order.

    Fix: the original deduped via ``list(set(...))``, which made the output
    order nondeterministic across runs; ``dict.fromkeys`` preserves
    insertion order while deduplicating.
    """
    if not context:
        return []
    # Common affiliation patterns (order determines output order on ties).
    patterns = [
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library|Center|Centre|Lab|Laboratory|Association|Council))',
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:of\s+[A-Z][a-zA-Z\s]+))',
        r'([A-Z][a-zA-Z\s]+Municipality)',
    ]
    found: List[str] = []
    for pattern in patterns:
        for match in re.findall(pattern, context):
            cleaned = match.strip()
            # Keep only plausible names; very short captures are noise.
            if len(cleaned) > 5:
                found.append(cleaned)
    return list(dict.fromkeys(found))
# =============================================================================
# GLAM-NER Type Mapping
# =============================================================================
def map_to_glam_ner_type(entity_type: str, name: str, context: str = "") -> str:
    """
    Map simple entity types (PER/ORG/LOC/URL/EMAIL) to GLAM-NER hypernyms.
    GLAM-NER v1.7.0 Entity Hypernyms:
    - AGT.PER: Person
    - AGT.STF: Staff member
    - GRP.HER: Heritage institution
    - GRP.EDU: Educational institution
    - GRP.ASS: Association/society
    - GRP.GOV: Government agency
    - TOP.SET: Settlement (city)
    - TOP.CTY: Country
    Unrecognized entity types are returned unchanged.
    """
    name_lower = name.lower()
    context_lower = context.lower() if context else ""

    if entity_type == "PER":
        # Role words in the context mark a staff member rather than a plain person.
        staff_markers = ('director', 'professor', 'curator', 'archivist', 'librarian', 'officer')
        return "AGT.STF" if any(marker in context_lower for marker in staff_markers) else "AGT.PER"

    if entity_type == "ORG":
        # First matching keyword group wins; order matters (e.g. "National
        # Archive" classifies as heritage before government).
        org_rules = (
            ("GRP.HER", ('archive', 'museum', 'library', 'heritage', 'memorial')),
            ("GRP.EDU", ('university', 'college', 'school', 'institute', 'academic')),
            ("GRP.ASS", ('association', 'society', 'council', 'committee', 'group', 'network')),
            ("GRP.GOV", ('municipality', 'government', 'ministry', 'national')),
        )
        for hypernym, keywords in org_rules:
            if any(keyword in name_lower for keyword in keywords):
                return hypernym
        return "GRP"

    if entity_type == "LOC":
        countries = ('Palestine', 'Lebanon', 'Gaza', 'Israel', 'Jordan',
                     'Syria', 'Egypt', 'Algeria', 'Malta', 'Ghana')
        if name in countries:
            return "TOP.CTY"
        settlements = ('Jerusalem', 'Ramallah', 'Hebron', 'Nablus', 'Bethlehem',
                       'Tulkarm', 'Beirut', 'New York', 'Amsterdam', 'London')
        if name in settlements or any(term in name_lower for term in ('city', 'town', 'village')):
            return "TOP.SET"
        return "TOP"

    if entity_type == "URL":
        return "APP.URL"
    if entity_type == "EMAIL":
        return "APP.EML"
    return entity_type
# =============================================================================
# Relationship Extraction
# =============================================================================
# Labels used to emit REL.ONT.ISA classification triples for organizations.
_ORG_TYPE_LABELS = {
    'GRP.HER': 'Heritage Institution',
    'GRP.EDU': 'Educational Institution',
    'GRP.ASS': 'Association',
    'GRP.GOV': 'Government Agency',
    'GRP': 'Organization',
}
# Patterns inferring employment relationships from a person's context blurb.
_WORK_PATTERNS = [
    (r'(?:at|with)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library))', "REL.SOC.WORKS_AT"),
    (r'(?:fellow|researcher|professor|director)\s+(?:at|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+)', "REL.SOC.WORKS_AT"),
]
# Places that mark the event's regional focus.
_FOCUS_REGIONS = ('Palestine', 'Lebanon', 'Gaza', 'Jerusalem', 'Ramallah')


def _person_triples(person: Dict, context_map: Dict[str, str], event_name: str,
                    provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Infer event-role, affiliation, and employment triples for one person."""
    name = person['clean_name']
    context = context_map.get(name, "")
    # Hoisted: the original recomputed context.lower() for every check.
    context_lower = context.lower()
    triples: List[EnrichedTriple] = []
    # Speaking at event
    if 'speaker' in context_lower or 'keynote' in context_lower:
        triples.append(EnrichedTriple(subject=name, predicate="REL.EVT.SPEAKS_AT",
                                      object=event_name, provenance=provenance, confidence=0.9))
    # Moderating panel
    if 'moderator' in context_lower:
        triples.append(EnrichedTriple(subject=name, predicate="REL.EVT.MODERATES",
                                      object=event_name, provenance=provenance, confidence=0.9))
    # Affiliations pre-extracted onto the entity dict (guard against None).
    for affiliation in person.get('affiliations') or []:
        triples.append(EnrichedTriple(subject=name, predicate="REL.SOC.AFFILIATED_WITH",
                                      object=affiliation, provenance=provenance, confidence=0.75))
    # Work relationships inferred from the raw (case-sensitive) context.
    for pattern, predicate in _WORK_PATTERNS:
        for match in re.findall(pattern, context):
            if len(match.strip()) > 5:
                triples.append(EnrichedTriple(subject=name, predicate=predicate,
                                              object=match.strip(), provenance=provenance,
                                              confidence=0.7))
    return triples


def _organization_triples(org: Dict, event_name: str,
                          provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Emit participation and ISA-classification triples for one organization."""
    name = org['clean_name']
    triples = [EnrichedTriple(subject=name, predicate="REL.EVT.PARTICIPATES",
                              object=event_name, provenance=provenance, confidence=0.85)]
    label = _ORG_TYPE_LABELS.get(org['entity_type'])
    if label is not None:
        triples.append(EnrichedTriple(subject=name, predicate="REL.ONT.ISA",
                                      object=label, provenance=provenance, confidence=0.9))
    return triples


def _location_triples(location: Dict, event_name: str,
                      provenance: 'EnrichedProvenance') -> List['EnrichedTriple']:
    """Emit a focus-region triple when the event centres on this place."""
    name = location['clean_name']
    if name not in _FOCUS_REGIONS:
        return []
    return [EnrichedTriple(subject=event_name, predicate="REL.SPA.FOCUS_REGION",
                           object=name, provenance=provenance, confidence=0.9)]


def extract_relationships(entities: List[Dict], context_map: Dict[str, str]) -> List[EnrichedTriple]:
    """
    Extract additional relationship types from entities and context.
    Relationship types:
    - REL.EVT.SPEAKS_AT: Person speaks at event
    - REL.EVT.MODERATES: Person moderates panel
    - REL.EVT.PARTICIPATES: Organization participates in event
    - REL.SOC.MEMBER_OF: Person is member of organization
    - REL.SOC.AFFILIATED_WITH: Person affiliated with institution
    - REL.SOC.WORKS_AT: Person works at institution
    - REL.SPA.LOCATED_IN: Entity located in place
    - REL.ONT.ISA: Entity is instance of type

    Triples are emitted persons-first, then organizations, then locations,
    matching the original output order.
    """
    event_name = "Resilient Communities Resilient Archives"
    # A single shared provenance record is attached to every emitted triple.
    provenance = EnrichedProvenance(
        source_url="https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives",
        extraction_method="Enriched extraction with relationship inference",
        extraction_date=datetime.now(timezone.utc).isoformat(),
        confidence=0.85
    )
    triples: List[EnrichedTriple] = []
    for person in [e for e in entities if e['entity_type'].startswith('AGT')]:
        triples.extend(_person_triples(person, context_map, event_name, provenance))
    for org in [e for e in entities if e['entity_type'].startswith('GRP')]:
        triples.extend(_organization_triples(org, event_name, provenance))
    for loc in [e for e in entities if e['entity_type'].startswith('TOP')]:
        triples.extend(_location_triples(loc, event_name, provenance))
    return triples
# =============================================================================
# Cross-Reference with Palestinian GLAM Data
# =============================================================================
def load_palestinian_institutions(claims_path: Optional[Path] = None) -> Dict[str, Dict]:
    """Load Palestinian GLAM institutions keyed by lowercased name.

    Args:
        claims_path: Path to the claims JSON file. Defaults to the module
            constant PALESTINIAN_CLAIMS (generalized from the previously
            hard-coded path so the loader is testable and reusable).

    Returns:
        Mapping of lowercased institution name -> original claim dict;
        empty when the file does not exist.
    """
    path = PALESTINIAN_CLAIMS if claims_path is None else claims_path
    if not path.exists():
        print(f" Palestinian claims file not found: {path}")
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions: Dict[str, Dict] = {}
    for claim in data.get('entity_claims', []):
        # Palestinian claims use 'text' not 'name'
        name = claim.get('text', '') or claim.get('name', '')
        if name:
            institutions[name.lower()] = claim
    print(f" Loaded {len(institutions)} Palestinian institutions")
    return institutions
def find_cross_references(entities: List[Dict], palestinian_institutions: Dict[str, Dict]) -> List[Dict]:
    """Match Archives Lab entities against known Palestinian GLAM institutions.

    Exact (lowercased) name matches win; otherwise a substring containment
    check in either direction records at most one 'partial' match per entity.
    """
    matches: List[Dict] = []
    for entity in entities:
        key = entity.get('clean_name', '').lower()
        # Direct match on the lowercased name.
        if key in palestinian_institutions:
            matches.append({
                'archiveslab_entity': entity['clean_name'],
                'palestinian_match': palestinian_institutions[key],
                'match_type': 'exact',
            })
            continue
        # Short names are too ambiguous for substring matching.
        if len(key) <= 5:
            continue
        for inst_name, inst_data in palestinian_institutions.items():
            if key in inst_name or inst_name in key:
                matches.append({
                    'archiveslab_entity': entity['clean_name'],
                    'palestinian_match': inst_data,
                    'match_type': 'partial',
                })
                break  # one partial match per entity is enough
    return matches
# =============================================================================
# Main Processing
# =============================================================================
def process_claims() -> None:
    """Run the full Archives Lab enrichment pipeline.

    Steps:
        1. Load the raw claims from ARCHIVESLAB_DIR/archiveslab_claims.json.
        2. Clean entity names, dedupe, and map to GLAM-NER hypernyms.
        3. Infer enriched relationship triples.
        4. Cross-reference entities against Palestinian GLAM institutions.
        5. Write the enriched dataset plus statistics to OUTPUT_FILE.

    Progress and summaries are printed to stdout throughout.
    """
    # Load original claims (assumes the extraction script has already run).
    claims_file = ARCHIVESLAB_DIR / 'archiveslab_claims.json'
    with open(claims_file, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
    print("=" * 60)
    print("ARCHIVES LAB CLAIMS ENRICHMENT")
    print("=" * 60)
    print(f"\nOriginal entities: {len(original_data['entity_claims'])}")
    print(f"Original triples: {len(original_data['triples'])}")
    # Step 1: Clean entity names
    print("\n--- Step 1: Cleaning entity names ---")
    cleaned_entities = []
    # clean_name -> raw context; doubles as the dedupe set (first occurrence wins).
    context_map = {}
    for claim in original_data['entity_claims']:
        original_name = claim['name']
        clean_name = clean_entity_name(original_name)
        context = claim.get('context', '')
        # Skip if name is too short after cleaning (likely an artifact).
        if len(clean_name) < 3:
            continue
        # Skip duplicate clean names
        if clean_name in context_map:
            continue
        context_map[clean_name] = context
        # Map to GLAM-NER type
        glam_type = map_to_glam_ner_type(claim['entity_type'], clean_name, context)
        # Extract affiliations and roles
        affiliations = extract_affiliation_from_context(context)
        title = extract_title_from_context(context)
        roles = [title] if title else []
        enriched_claim = EnrichedEntityClaim(
            entity_id=claim['entity_id'],
            entity_type=glam_type,
            name=original_name,
            clean_name=clean_name,
            context=context[:200] if context else None,  # cap stored context at 200 chars
            provenance=EnrichedProvenance(
                source_url=original_data['source_url'],
                extraction_method="Regex + Playwright",
                extraction_date=claim['provenance']['extraction_date'],
                confidence=claim['provenance']['confidence'],
                enrichment_date=datetime.now(timezone.utc).isoformat(),
                enrichment_method="GLAM-NER type mapping + name cleanup"
            ),
            affiliations=affiliations if affiliations else None,
            roles=roles if roles else None
        )
        cleaned_entities.append(enriched_claim.to_dict())
    print(f"Cleaned entities: {len(cleaned_entities)} (removed {len(original_data['entity_claims']) - len(cleaned_entities)} duplicates/artifacts)")
    # Show sample cleanups (up to 5 names that actually changed).
    print("\nSample name cleanups:")
    samples = [e for e in cleaned_entities if e['name'] != e['clean_name']][:5]
    for s in samples:
        print(f" '{s['name'][:40]}...' -> '{s['clean_name']}'")
    # Step 2: Map to GLAM-NER types — report the type distribution.
    print("\n--- Step 2: GLAM-NER type distribution ---")
    type_counts = {}
    for e in cleaned_entities:
        t = e['entity_type']
        type_counts[t] = type_counts.get(t, 0) + 1
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {count}")
    # Step 3: Extract relationships
    print("\n--- Step 3: Extracting enriched relationships ---")
    enriched_triples = extract_relationships(cleaned_entities, context_map)
    print(f"Generated {len(enriched_triples)} enriched triples")
    # Relationship type breakdown (sorted by descending count).
    rel_counts = {}
    for t in enriched_triples:
        p = t.predicate
        rel_counts[p] = rel_counts.get(p, 0) + 1
    print("\nRelationship types:")
    for p, count in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {count}")
    # Step 4: Cross-reference with Palestinian GLAM data (best-effort:
    # silently skipped when the claims file is absent).
    print("\n--- Step 4: Cross-referencing with Palestinian GLAM data ---")
    palestinian_institutions = load_palestinian_institutions()
    if palestinian_institutions:
        cross_refs = find_cross_references(cleaned_entities, palestinian_institutions)
        print(f"Found {len(cross_refs)} cross-references with Palestinian GLAM institutions")
        if cross_refs:
            print("\nCross-references found:")
            for ref in cross_refs[:10]:
                match_name = ref['palestinian_match'].get('name', 'Unknown')
                print(f" {ref['archiveslab_entity']} <-> {match_name} ({ref['match_type']})")
    else:
        cross_refs = []
        print("Palestinian GLAM data not found, skipping cross-referencing")
    # Save enriched data (entities + triples + cross-refs + summary stats).
    print("\n--- Saving enriched data ---")
    enriched_data = {
        'source_url': original_data['source_url'],
        'fetch_timestamp': original_data['fetch_timestamp'],
        'extraction_timestamp': original_data['extraction_timestamp'],
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'entity_claims': cleaned_entities,
        'triples': [t.to_dict() for t in enriched_triples],
        'cross_references': cross_refs,
        'statistics': {
            'original_entities': len(original_data['entity_claims']),
            'cleaned_entities': len(cleaned_entities),
            'original_triples': len(original_data['triples']),
            'enriched_triples': len(enriched_triples),
            'cross_references': len(cross_refs),
            'entity_types': type_counts,
            'relationship_types': rel_counts
        }
    }
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, indent=2, ensure_ascii=False)
    print(f"Saved: {OUTPUT_FILE}")
    print("\n" + "=" * 60)
    print("ENRICHMENT COMPLETE")
    print("=" * 60)
if __name__ == '__main__':
    # Script entry point; see the module docstring for the PYTHONPATH invocation.
    process_claims()