#!/usr/bin/env python3
"""
LAP Gaza Report 2024 Claim Extractor

Extracts structured claims about Palestinian heritage institutions from the
LAP (Librarians and Archivists with Palestine) Gaza Report 2024. Each claim
includes XPath provenance for verifiability per CH-Annotator convention.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from lxml import html
from lxml.etree import tostring
import hashlib

# Configuration
REPORT_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
ARCHIVE_DIR = Path("/Users/kempersc/apps/glam/data/web/lap_gaza_report_2024")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/extracted/lap_gaza_claims.json")

# Institution mappings: LAP report name -> YAML file basename (GHCID).
# Keys must match the report's <h3> headings exactly (minus any trailing
# " (location)" suffix, which parse_institution_section strips before lookup).
INSTITUTION_MAPPINGS = {
    # Archives
    "Central Archives of Gaza City": "PS-GZ-GAZ-A-CAGC",
    "Omari Mosque and Library": "PS-GZ-GAZ-L-GOMAL",
    # Libraries
    "Ataa Library, IBBY Children in Crisis Library": "PS-GZ-BHA-L-ALICCL",
    "Diana Tamari Sabbagh Library": "PS-GZ-GAZ-L-DTSL",
    "Edward Said Library": "PS-GZ-GAZ-L-ESL",
    "Enaim Library": "PS-GZ-GAZ-L-EL",
    "Gaza Municipal Library": "PS-GZ-GAZ-L-GML",
    "Al-Kalima Library and Publishing House": "PS-GZ-GAZ-L-KL",
    "Kana'an Educational Development Institute": "PS-GZ-GAZ-L-KEDI",
    "Lubbud Library": "PS-GZ-GAZ-L-LL",
    "Al-Nahda Library": "PS-GZ-GAZ-L-NL",
    "Samir Mansour Bookshop and Library": "PS-GZ-GAZ-L-SMBAP",
    "Al-Shorouq Al-Daem Library": "PS-GZ-GAZ-L-SDL",
    # University Libraries
    "Al-Aqsa University Library": "PS-GZ-GAZ-L-AUL",
    # NOTE(review): "PS-GZA-" breaks the "PS-GZ-" prefix pattern used by every
    # other entry — confirm against the YAML basename; likely a typo.
    "Islamic University of Gaza Library": "PS-GZA-GAZ-L-IUGL",
    "Al-Israa University Library and National Museum": "PS-GZ-GAZ-M-IULANM",
    "Al-Quds Open University Library": "PS-GZ-GAZ-L-QOULG",
    "Jawaharlal Nehru Library, Al-Azhar University": "PS-GZ-GAZ-L-JNLAUG",
    # Museums
    "Akkad Museum": "PS-GZ-GAZ-M-AM",
    "Cultural Kararah Museum": "PS-GZ-KYN-M-CKM",
    "Deir Al-Balah Museum": "PS-GZ-DEB-M-DBM",
    "Ibrahim Abu Sha'ar Heritage Diwan": "PS-GZ-GAZ-M-IASHD",
    "Khudari Museum": "PS-GZ-GAZ-M-KM",
    "Palestinian Costume Museum": "PS-GZ-GAZ-M-PCM",
    "Al-Qarara Cultural Museum": "PS-GZ-KYN-M-QCM",
    "Rafah Museum": "PS-GZ-RAF-M-RM",
    "Shahwan Museum": "PS-GZ-GAZ-M-SM",
}


def get_element_xpath(element) -> str:
    """Generate a positional XPath (e.g. ``/html/body/div/h3[2]``) for *element*.

    Walks up the ancestor chain via ``getparent()``; a ``[n]`` index is added
    only when the element has same-tag siblings, mirroring lxml's own
    ``getpath`` convention.
    """
    parts = []
    while element is not None:
        parent = element.getparent()
        if parent is None:
            # Reached the document root; it never needs an index.
            parts.insert(0, element.tag)
            break
        siblings = [s for s in parent if s.tag == element.tag]
        if len(siblings) == 1:
            parts.insert(0, element.tag)
        else:
            index = siblings.index(element) + 1  # XPath indices are 1-based
            parts.insert(0, f"{element.tag}[{index}]")
        element = parent
    return "/" + "/".join(parts)


def extract_damage_status(text: str) -> str:
    """Classify a free-text damage description into a coarse status code.

    Checks are ordered from most to least specific. Returns one of
    DESTROYED, SEVERE_DAMAGE, PARTIAL_DAMAGE, DAMAGED, FIRE_DAMAGE, UNKNOWN.
    """
    text_lower = text.lower()
    if "complete destruction" in text_lower or "completely destroyed" in text_lower:
        return "DESTROYED"
    if "destroyed" in text_lower:
        return "DESTROYED"
    if "heavily damaged" in text_lower or "severe damage" in text_lower:
        return "SEVERE_DAMAGE"
    # Must precede the generic "damaged" test; otherwise phrases such as
    # "partially damaged" match the generic branch and PARTIAL_DAMAGE is
    # unreachable for them.
    if "partial damage" in text_lower or "partially damaged" in text_lower:
        return "PARTIAL_DAMAGE"
    if "damaged" in text_lower:
        return "DAMAGED"
    if "fire" in text_lower or "burned" in text_lower:
        return "FIRE_DAMAGE"
    return "UNKNOWN"


def extract_date(text: str) -> str | None:
    """Extract the first recognizable date from *text*, or None.

    Recognized forms: "November 29, 2023", ISO "2023-11-29",
    "<Month> 2023", and "Early <Month> 2023". The returned string is the
    full regex match (so "Early October 2023" keeps its "Early" prefix).
    """
    patterns = [
        r"(\w+ \d{1,2}, \d{4})",  # November 29, 2023
        r"(\d{4}-\d{2}-\d{2})",  # 2023-11-29
        # All twelve months: the original list omitted June-September,
        # so e.g. "September 2024" was never matched.
        r"(January|February|March|April|May|June|July|August|September|October|November|December) \d{4}",
        r"Early (\w+ \d{4})",  # Early October 2023
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(0)
    return None


def parse_institution_section(h3_element, tree) -> dict | None:
    """Parse one institution section starting from its <h3> heading.

    Consumes following siblings until the next <h2>/<h3>, collecting the
    "Description of damage:" / "Date of damage:" paragraphs and any <ul>
    source lists. Returns a claim dict (with XPath provenance and an
    optional GHCID match) or None when the heading is empty.
    """
    name = h3_element.text_content().strip()
    if not name:
        return None

    description_parts = []
    date_of_damage = None
    damage_status = None
    sources = []

    current = h3_element.getnext()
    while current is not None:
        if current.tag in ("h2", "h3"):
            break  # start of the next section
        text = current.text_content().strip()
        if text.startswith("Description of damage:"):
            desc = text.replace("Description of damage:", "").strip()
            description_parts.append(desc)
            damage_status = extract_damage_status(desc)
        elif text.startswith("Date of damage:"):
            date_text = text.replace("Date of damage:", "").strip()
            # Fall back to the raw text when no structured date is found.
            date_of_damage = extract_date(date_text) or date_text
        elif current.tag == "ul":
            # Source lists: keep each <li>'s text plus any hrefs it contains.
            for li in current.findall(".//li"):
                source_text = li.text_content().strip()
                links = li.findall(".//a")
                source_entry = {"text": source_text}
                if links:
                    source_entry["urls"] = [a.get("href") for a in links if a.get("href")]
                sources.append(source_entry)
        current = current.getnext()

    xpath = get_element_xpath(h3_element)
    return {
        "institution_name": name,
        "damage_status": damage_status,
        "date_of_damage": date_of_damage,
        "description": " ".join(description_parts),
        "sources": sources,
        "xpath": xpath,
        # Strip a trailing " (location)" suffix before the GHCID lookup.
        "ghcid": INSTITUTION_MAPPINGS.get(name.split(" (")[0]),
    }


def parse_martyrs_section(tree) -> list[dict]:
    """Parse the martyred information workers section (<h2 id='people'>).

    Each <h3> under that heading is one person; the first plain <p> is taken
    as their role, "Date of death:" paragraphs are parsed for a date, and
    <ul> elements are collected as sources.
    """
    martyrs = []
    martyrs_h2 = tree.xpath("//h2[@id='people']")
    if not martyrs_h2:
        return martyrs

    current = martyrs_h2[0].getnext()
    while current is not None:
        if current.tag == "h2":
            break  # end of the people section
        if current.tag == "h3":
            name = current.text_content().strip()
            role = None
            date_of_death = None
            details = []
            sources = []

            sibling = current.getnext()
            while sibling is not None and sibling.tag not in ("h2", "h3"):
                text = sibling.text_content().strip()
                if (
                    sibling.tag == "p"
                    and not text.startswith("Date of death:")
                    and not text.startswith("Sources:")
                ):
                    # First plain paragraph is the role; later ones are details.
                    if not role:
                        role = text
                    else:
                        details.append(text)
                if text.startswith("Date of death:"):
                    date_text = text.replace("Date of death:", "").strip()
                    date_of_death = extract_date(date_text) or date_text
                if sibling.tag == "ul":
                    for li in sibling.findall(".//li"):
                        source_text = li.text_content().strip()
                        links = li.findall(".//a")
                        source_entry = {"text": source_text}
                        if links:
                            source_entry["urls"] = [a.get("href") for a in links if a.get("href")]
                        sources.append(source_entry)
                sibling = sibling.getnext()

            martyrs.append({
                "name": name,
                "role": role,
                "date_of_death": date_of_death,
                "details": " ".join(details) if details else None,
                "sources": sources,
                "xpath": get_element_xpath(current),
            })
        current = current.getnext()
    return martyrs


def main():
    """Read the archived report HTML, extract claims, and write JSON output."""
    html_file = ARCHIVE_DIR / "rendered.html"
    if not html_file.exists():
        print(f"ERROR: HTML file not found at {html_file}")
        print("Please archive the LAP report HTML first.")
        return

    with open(html_file, "r", encoding="utf-8") as f:
        html_content = f.read()
    tree = html.fromstring(html_content)

    # Provenance metadata: extraction time and a hash of the archived bytes.
    timestamp = datetime.now(timezone.utc).isoformat()
    content_hash = hashlib.sha256(html_content.encode()).hexdigest()

    # Every <h3> in the entry content is an institution heading, except those
    # inside the people section, which parse_martyrs_section handles.
    institutions = []
    h3_elements = tree.xpath("//div[@class='entry-content']//h3")
    for h3 in h3_elements:
        parent_h2 = h3.xpath("preceding::h2[1]")
        if parent_h2 and parent_h2[0].get("id") == "people":
            continue
        institution = parse_institution_section(h3, tree)
        if institution:
            institutions.append(institution)

    martyrs = parse_martyrs_section(tree)

    output = {
        "source": {
            "url": REPORT_URL,
            "title": "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024",
            "publisher": "Librarians and Archivists with Palestine",
            "report_date": "2024-02-01",
            "archived_file": str(html_file),
            "content_hash_sha256": content_hash,
        },
        "extraction_metadata": {
            "extraction_timestamp": timestamp,
            "extractor": "lap_gaza_report_extractor.py",
            "ch_annotator_version": "ch_annotator-v1_7_0",
        },
        "institutions": institutions,
        "martyred_information_workers": martyrs,
        "statistics": {
            "total_institutions": len(institutions),
            "total_martyrs": len(martyrs),
            "institutions_with_ghcid_match": len([i for i in institutions if i.get("ghcid")]),
        },
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(institutions)} institutions")
    print(f"Extracted {len(martyrs)} martyred information workers")
    print(f"Output written to {OUTPUT_FILE}")

    # Echo GHCID matches so a reviewer can spot-check the name mapping.
    print("\nInstitutions with GHCID matches:")
    for inst in institutions:
        if inst.get("ghcid"):
            print(f"  {inst['institution_name'][:50]:50} -> {inst['ghcid']}")


if __name__ == "__main__":
    main()