glam/scripts/lap_gaza_report_extractor.py
2025-12-09 07:56:35 +01:00

311 lines
11 KiB
Python

#!/usr/bin/env python3
"""
LAP Gaza Report 2024 Claim Extractor
Extracts structured claims about Palestinian heritage institutions from the
LAP (Librarians and Archivists with Palestine) Gaza Report 2024.
Each claim includes XPath provenance for verifiability per CH-Annotator convention.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from lxml import html
from lxml.etree import tostring
import hashlib
# Configuration
# Source page; extraction reads the locally archived copy under ARCHIVE_DIR,
# not the live URL.
REPORT_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
# NOTE(review): machine-specific absolute paths — consider making these
# configurable (env var / CLI flag) for portability.
ARCHIVE_DIR = Path("/Users/kempersc/apps/glam/data/web/lap_gaza_report_2024")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/extracted/lap_gaza_claims.json")
# Institution mappings: LAP report name -> YAML file basename
# Keys must match the report's h3 heading text after any trailing
# " (location)" suffix has been removed (see parse_institution_section).
INSTITUTION_MAPPINGS = {
    # Archives
    "Central Archives of Gaza City": "PS-GZ-GAZ-A-CAGC",
    "Omari Mosque and Library": "PS-GZ-GAZ-L-GOMAL",
    # Libraries
    "Ataa Library, IBBY Children in Crisis Library": "PS-GZ-BHA-L-ALICCL",
    "Diana Tamari Sabbagh Library": "PS-GZ-GAZ-L-DTSL",
    "Edward Said Library": "PS-GZ-GAZ-L-ESL",
    "Enaim Library": "PS-GZ-GAZ-L-EL",
    "Gaza Municipal Library": "PS-GZ-GAZ-L-GML",
    "Al-Kalima Library and Publishing House": "PS-GZ-GAZ-L-KL",
    "Kana'an Educational Development Institute": "PS-GZ-GAZ-L-KEDI",
    "Lubbud Library": "PS-GZ-GAZ-L-LL",
    "Al-Nahda Library": "PS-GZ-GAZ-L-NL",
    "Samir Mansour Bookshop and Library": "PS-GZ-GAZ-L-SMBAP",
    "Al-Shorouq Al-Daem Library": "PS-GZ-GAZ-L-SDL",
    # University Libraries
    "Al-Aqsa University Library": "PS-GZ-GAZ-L-AUL",
    # NOTE(review): "PS-GZA" deviates from the "PS-GZ-" prefix used by every
    # other entry — confirm against the actual YAML basename (possible typo).
    "Islamic University of Gaza Library": "PS-GZA-GAZ-L-IUGL",
    "Al-Israa University Library and National Museum": "PS-GZ-GAZ-M-IULANM",
    "Al-Quds Open University Library": "PS-GZ-GAZ-L-QOULG",
    "Jawaharlal Nehru Library, Al-Azhar University": "PS-GZ-GAZ-L-JNLAUG",
    # Museums
    "Akkad Museum": "PS-GZ-GAZ-M-AM",
    "Cultural Kararah Museum": "PS-GZ-KYN-M-CKM",
    "Deir Al-Balah Museum": "PS-GZ-DEB-M-DBM",
    "Ibrahim Abu Sha'ar Heritage Diwan": "PS-GZ-GAZ-M-IASHD",
    "Khudari Museum": "PS-GZ-GAZ-M-KM",
    "Palestinian Costume Museum": "PS-GZ-GAZ-M-PCM",
    "Al-Qarara Cultural Museum": "PS-GZ-KYN-M-QCM",
    "Rafah Museum": "PS-GZ-RAF-M-RM",
    "Shahwan Museum": "PS-GZ-GAZ-M-SM",
}
def get_element_xpath(element) -> str:
    """Build an absolute XPath that locates *element* in its document.

    A positional predicate ([n], 1-based) is appended only when the element
    has same-tag siblings; otherwise the bare tag name is used.
    """
    segments = []
    node = element
    while node is not None:
        parent = node.getparent()
        if parent is None:
            # Reached the document root: nothing to disambiguate against.
            segments.insert(0, node.tag)
            break
        same_tag = [child for child in parent if child.tag == node.tag]
        if len(same_tag) > 1:
            segments.insert(0, f"{node.tag}[{same_tag.index(node) + 1}]")
        else:
            segments.insert(0, node.tag)
        node = parent
    return "/" + "/".join(segments)
def extract_damage_status(text: str) -> str:
    """Classify a free-text damage description into a status code.

    Checks run from most to least specific so that phrases like
    "partially damaged" or "severely damaged" are not swallowed by the
    generic "damaged" test (the original ordering misclassified
    "partially damaged" as DAMAGED). Returns one of DESTROYED,
    PARTIAL_DAMAGE, SEVERE_DAMAGE, DAMAGED, FIRE_DAMAGE, or UNKNOWN.
    """
    text_lower = text.lower()
    # "complete destruction"/"completely destroyed" both contain "destroyed"
    # or describe total loss; a single DESTROYED branch covers them all.
    if ("complete destruction" in text_lower
            or "completely destroyed" in text_lower
            or "destroyed" in text_lower):
        return "DESTROYED"
    if "partial damage" in text_lower or "partially damaged" in text_lower:
        return "PARTIAL_DAMAGE"
    if ("heavily damaged" in text_lower
            or "severe damage" in text_lower
            or "severely damaged" in text_lower):
        return "SEVERE_DAMAGE"
    if "damaged" in text_lower:
        return "DAMAGED"
    if "fire" in text_lower or "burned" in text_lower:
        return "FIRE_DAMAGE"
    return "UNKNOWN"
def extract_date(text: str) -> str | None:
"""Extract date from text in various formats."""
# Try specific date patterns
patterns = [
r"(\w+ \d{1,2}, \d{4})", # November 29, 2023
r"(\d{4}-\d{2}-\d{2})", # 2023-11-29
r"(October|November|December|January|February|March|April|May) \d{4}", # October 2023
r"Early (\w+ \d{4})", # Early October 2023
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(0)
return None
def parse_institution_section(h3_element, tree) -> dict | None:
    """Extract one institution's damage claim from its <h3> heading.

    Scans forward through the heading's following siblings until the next
    <h2>/<h3>, collecting the damage description, damage date, and source
    citations (<ul> items). Returns None for headings with no text.
    """
    name = h3_element.text_content().strip()
    if not name:
        return None
    desc_parts: list[str] = []
    damage_date = None
    status = None
    source_list: list[dict] = []
    node = h3_element.getnext()
    while node is not None:
        # Stop at the start of the next section.
        if node.tag in ("h2", "h3"):
            break
        content = node.text_content().strip()
        if content.startswith("Description of damage:"):
            body = content.replace("Description of damage:", "").strip()
            desc_parts.append(body)
            status = extract_damage_status(body)
        elif content.startswith("Date of damage:"):
            raw = content.replace("Date of damage:", "").strip()
            # Fall back to the raw text when no recognizable date is found.
            damage_date = extract_date(raw) or raw
        elif node.tag == "ul":
            # Each <li> is one source citation, possibly carrying links.
            for item in node.findall(".//li"):
                entry = {"text": item.text_content().strip()}
                anchors = item.findall(".//a")
                if anchors:
                    entry["urls"] = [a.get("href") for a in anchors if a.get("href")]
                source_list.append(entry)
        node = node.getnext()
    return {
        "institution_name": name,
        "damage_status": status,
        "date_of_damage": damage_date,
        "description": " ".join(desc_parts),
        "sources": source_list,
        "xpath": get_element_xpath(h3_element),
        # Strip any " (location)" suffix before the GHCID lookup.
        "ghcid": INSTITUTION_MAPPINGS.get(name.split(" (")[0])
    }
def parse_martyrs_section(tree) -> list[dict]:
    """Extract martyred information workers listed under <h2 id="people">.

    Each <h3> after that heading names one person. The first free-form <p>
    is treated as the role, later ones as details; "Date of death:" lines
    set the death date; <ul> items become source citations.
    """
    people: list[dict] = []
    anchor = tree.xpath("//h2[@id='people']")
    if not anchor:
        return people
    node = anchor[0].getnext()
    while node is not None:
        if node.tag == "h2":
            break  # reached the next major section
        if node.tag == "h3":
            person_name = node.text_content().strip()
            role = None
            death_date = None
            notes: list[str] = []
            refs: list[dict] = []
            sib = node.getnext()
            while sib is not None and sib.tag not in ("h2", "h3"):
                body = sib.text_content().strip()
                if sib.tag == "p" and not body.startswith(("Date of death:", "Sources:")):
                    # First plain paragraph is the role; the rest are details.
                    if not role:
                        role = body
                    else:
                        notes.append(body)
                if body.startswith("Date of death:"):
                    raw = body.replace("Date of death:", "").strip()
                    death_date = extract_date(raw) or raw
                if sib.tag == "ul":
                    for item in sib.findall(".//li"):
                        entry = {"text": item.text_content().strip()}
                        anchors = item.findall(".//a")
                        if anchors:
                            entry["urls"] = [a.get("href") for a in anchors if a.get("href")]
                        refs.append(entry)
                sib = sib.getnext()
            people.append({
                "name": person_name,
                "role": role,
                "date_of_death": death_date,
                "details": " ".join(notes) if notes else None,
                "sources": refs,
                "xpath": get_element_xpath(node)
            })
        node = node.getnext()
    return people
def main():
    """Run the extraction pipeline.

    Reads the archived report HTML, extracts institution damage claims and
    martyred-worker records with XPath provenance, and writes the combined
    JSON document (with content hash and extraction metadata) to OUTPUT_FILE.
    """
    source_html = ARCHIVE_DIR / "rendered.html"
    if not source_html.exists():
        print(f"ERROR: HTML file not found at {source_html}")
        print("Please archive the LAP report HTML first.")
        return
    raw = source_html.read_text(encoding="utf-8")
    tree = html.fromstring(raw)

    # Provenance metadata: when we extracted, and a hash of what we read.
    extracted_at = datetime.now(timezone.utc).isoformat()
    digest = hashlib.sha256(raw.encode()).hexdigest()

    # Every <h3> inside the entry content is an institution heading, except
    # those under the "people" section (handled by parse_martyrs_section).
    claims = []
    for heading in tree.xpath("//div[@class='entry-content']//h3"):
        nearest_h2 = heading.xpath("preceding::h2[1]")
        if nearest_h2 and nearest_h2[0].get("id") == "people":
            continue
        record = parse_institution_section(heading, tree)
        if record:
            claims.append(record)

    people = parse_martyrs_section(tree)

    report = {
        "source": {
            "url": REPORT_URL,
            "title": "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024",
            "publisher": "Librarians and Archivists with Palestine",
            "report_date": "2024-02-01",
            "archived_file": str(source_html),
            "content_hash_sha256": digest,
        },
        "extraction_metadata": {
            "extraction_timestamp": extracted_at,
            "extractor": "lap_gaza_report_extractor.py",
            "ch_annotator_version": "ch_annotator-v1_7_0",
        },
        "institutions": claims,
        "martyred_information_workers": people,
        "statistics": {
            "total_institutions": len(claims),
            "total_martyrs": len(people),
            "institutions_with_ghcid_match": sum(1 for c in claims if c.get("ghcid")),
        }
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(claims)} institutions")
    print(f"Extracted {len(people)} martyred information workers")
    print(f"Output written to {OUTPUT_FILE}")
    # Echo GHCID matches so the mapping coverage can be eyeballed.
    print("\nInstitutions with GHCID matches:")
    for record in claims:
        if record.get("ghcid"):
            print(f" {record['institution_name'][:50]:50} -> {record['ghcid']}")
if __name__ == "__main__":
    main()