glam/scripts/lap_gaza_report_extractor.py
2025-12-09 07:56:35 +01:00

311 lines
11 KiB
Python

#!/usr/bin/env python3
"""
LAP Gaza Report 2024 Claim Extractor
Extracts structured claims about Palestinian heritage institutions from the
LAP (Librarians and Archivists with Palestine) Gaza Report 2024.
Each claim includes XPath provenance for verifiability per CH-Annotator convention.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from lxml import html
from lxml.etree import tostring
import hashlib
# Configuration
# Source page; extraction reads the locally archived copy under ARCHIVE_DIR,
# not the live URL.
REPORT_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
# NOTE(review): machine-specific absolute paths — consider making these
# configurable (env var / CLI flag) for portability.
ARCHIVE_DIR = Path("/Users/kempersc/apps/glam/data/web/lap_gaza_report_2024")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/extracted/lap_gaza_claims.json")
# Institution mappings: LAP report name -> YAML file basename
# Keys must match the report's h3 heading text after any trailing
# " (location)" suffix has been removed (see parse_institution_section).
INSTITUTION_MAPPINGS = {
    # Archives
    "Central Archives of Gaza City": "PS-GZ-GAZ-A-CAGC",
    "Omari Mosque and Library": "PS-GZ-GAZ-L-GOMAL",
    # Libraries
    "Ataa Library, IBBY Children in Crisis Library": "PS-GZ-BHA-L-ALICCL",
    "Diana Tamari Sabbagh Library": "PS-GZ-GAZ-L-DTSL",
    "Edward Said Library": "PS-GZ-GAZ-L-ESL",
    "Enaim Library": "PS-GZ-GAZ-L-EL",
    "Gaza Municipal Library": "PS-GZ-GAZ-L-GML",
    "Al-Kalima Library and Publishing House": "PS-GZ-GAZ-L-KL",
    "Kana'an Educational Development Institute": "PS-GZ-GAZ-L-KEDI",
    "Lubbud Library": "PS-GZ-GAZ-L-LL",
    "Al-Nahda Library": "PS-GZ-GAZ-L-NL",
    "Samir Mansour Bookshop and Library": "PS-GZ-GAZ-L-SMBAP",
    "Al-Shorouq Al-Daem Library": "PS-GZ-GAZ-L-SDL",
    # University Libraries
    "Al-Aqsa University Library": "PS-GZ-GAZ-L-AUL",
    # NOTE(review): "PS-GZA" deviates from the "PS-GZ-" prefix used by every
    # other entry — confirm against the actual YAML basename (possible typo).
    "Islamic University of Gaza Library": "PS-GZA-GAZ-L-IUGL",
    "Al-Israa University Library and National Museum": "PS-GZ-GAZ-M-IULANM",
    "Al-Quds Open University Library": "PS-GZ-GAZ-L-QOULG",
    "Jawaharlal Nehru Library, Al-Azhar University": "PS-GZ-GAZ-L-JNLAUG",
    # Museums
    "Akkad Museum": "PS-GZ-GAZ-M-AM",
    "Cultural Kararah Museum": "PS-GZ-KYN-M-CKM",
    "Deir Al-Balah Museum": "PS-GZ-DEB-M-DBM",
    "Ibrahim Abu Sha'ar Heritage Diwan": "PS-GZ-GAZ-M-IASHD",
    "Khudari Museum": "PS-GZ-GAZ-M-KM",
    "Palestinian Costume Museum": "PS-GZ-GAZ-M-PCM",
    "Al-Qarara Cultural Museum": "PS-GZ-KYN-M-QCM",
    "Rafah Museum": "PS-GZ-RAF-M-RM",
    "Shahwan Museum": "PS-GZ-GAZ-M-SM",
}
def get_element_xpath(element) -> str:
    """Build an absolute XPath that locates *element* in its document.

    A positional predicate ([n], 1-based) is appended only when the element
    has same-tag siblings; otherwise the bare tag name is used.
    """
    segments = []
    node = element
    while node is not None:
        parent = node.getparent()
        if parent is None:
            # Reached the document root: nothing to disambiguate against.
            segments.insert(0, node.tag)
            break
        same_tag = [child for child in parent if child.tag == node.tag]
        if len(same_tag) > 1:
            segments.insert(0, f"{node.tag}[{same_tag.index(node) + 1}]")
        else:
            segments.insert(0, node.tag)
        node = parent
    return "/" + "/".join(segments)
def extract_damage_status(text: str) -> str:
    """Classify a free-text damage description into a status code.

    Checks run from most to least specific so that phrases like
    "partially damaged" or "severely damaged" are not swallowed by the
    generic "damaged" test (the original ordering misclassified
    "partially damaged" as DAMAGED). Returns one of DESTROYED,
    PARTIAL_DAMAGE, SEVERE_DAMAGE, DAMAGED, FIRE_DAMAGE, or UNKNOWN.
    """
    text_lower = text.lower()
    # "complete destruction"/"completely destroyed" both contain "destroyed"
    # or describe total loss; a single DESTROYED branch covers them all.
    if ("complete destruction" in text_lower
            or "completely destroyed" in text_lower
            or "destroyed" in text_lower):
        return "DESTROYED"
    if "partial damage" in text_lower or "partially damaged" in text_lower:
        return "PARTIAL_DAMAGE"
    if ("heavily damaged" in text_lower
            or "severe damage" in text_lower
            or "severely damaged" in text_lower):
        return "SEVERE_DAMAGE"
    if "damaged" in text_lower:
        return "DAMAGED"
    if "fire" in text_lower or "burned" in text_lower:
        return "FIRE_DAMAGE"
    return "UNKNOWN"
def extract_date(text: str) -> str | None:
"""Extract date from text in various formats."""
# Try specific date patterns
patterns = [
r"(\w+ \d{1,2}, \d{4})", # November 29, 2023
r"(\d{4}-\d{2}-\d{2})", # 2023-11-29
r"(October|November|December|January|February|March|April|May) \d{4}", # October 2023
r"Early (\w+ \d{4})", # Early October 2023
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(0)
return None
def parse_institution_section(h3_element, tree) -> dict | None:
    """Extract one institution's damage claim from its <h3> heading.

    Scans forward through the heading's following siblings until the next
    <h2>/<h3>, collecting the damage description, damage date, and source
    citations (<ul> items). Returns None for headings with no text.
    """
    name = h3_element.text_content().strip()
    if not name:
        return None
    desc_parts: list[str] = []
    damage_date = None
    status = None
    source_list: list[dict] = []
    node = h3_element.getnext()
    while node is not None:
        # Stop at the start of the next section.
        if node.tag in ("h2", "h3"):
            break
        content = node.text_content().strip()
        if content.startswith("Description of damage:"):
            body = content.replace("Description of damage:", "").strip()
            desc_parts.append(body)
            status = extract_damage_status(body)
        elif content.startswith("Date of damage:"):
            raw = content.replace("Date of damage:", "").strip()
            # Fall back to the raw text when no recognizable date is found.
            damage_date = extract_date(raw) or raw
        elif node.tag == "ul":
            # Each <li> is one source citation, possibly carrying links.
            for item in node.findall(".//li"):
                entry = {"text": item.text_content().strip()}
                anchors = item.findall(".//a")
                if anchors:
                    entry["urls"] = [a.get("href") for a in anchors if a.get("href")]
                source_list.append(entry)
        node = node.getnext()
    return {
        "institution_name": name,
        "damage_status": status,
        "date_of_damage": damage_date,
        "description": " ".join(desc_parts),
        "sources": source_list,
        "xpath": get_element_xpath(h3_element),
        # Strip any " (location)" suffix before the GHCID lookup.
        "ghcid": INSTITUTION_MAPPINGS.get(name.split(" (")[0])
    }
def parse_martyrs_section(tree) -> list[dict]:
    """Extract martyred information workers listed under <h2 id="people">.

    Each <h3> after that heading names one person. The first free-form <p>
    is treated as the role, later ones as details; "Date of death:" lines
    set the death date; <ul> items become source citations.
    """
    people: list[dict] = []
    anchor = tree.xpath("//h2[@id='people']")
    if not anchor:
        return people
    node = anchor[0].getnext()
    while node is not None:
        if node.tag == "h2":
            break  # reached the next major section
        if node.tag == "h3":
            person_name = node.text_content().strip()
            role = None
            death_date = None
            notes: list[str] = []
            refs: list[dict] = []
            sib = node.getnext()
            while sib is not None and sib.tag not in ("h2", "h3"):
                body = sib.text_content().strip()
                if sib.tag == "p" and not body.startswith(("Date of death:", "Sources:")):
                    # First plain paragraph is the role; the rest are details.
                    if not role:
                        role = body
                    else:
                        notes.append(body)
                if body.startswith("Date of death:"):
                    raw = body.replace("Date of death:", "").strip()
                    death_date = extract_date(raw) or raw
                if sib.tag == "ul":
                    for item in sib.findall(".//li"):
                        entry = {"text": item.text_content().strip()}
                        anchors = item.findall(".//a")
                        if anchors:
                            entry["urls"] = [a.get("href") for a in anchors if a.get("href")]
                        refs.append(entry)
                sib = sib.getnext()
            people.append({
                "name": person_name,
                "role": role,
                "date_of_death": death_date,
                "details": " ".join(notes) if notes else None,
                "sources": refs,
                "xpath": get_element_xpath(node)
            })
        node = node.getnext()
    return people
def main():
    """Run the extraction pipeline.

    Reads the archived report HTML, extracts institution damage claims and
    martyred-worker records with XPath provenance, and writes the combined
    JSON document (with content hash and extraction metadata) to OUTPUT_FILE.
    """
    source_html = ARCHIVE_DIR / "rendered.html"
    if not source_html.exists():
        print(f"ERROR: HTML file not found at {source_html}")
        print("Please archive the LAP report HTML first.")
        return
    raw = source_html.read_text(encoding="utf-8")
    tree = html.fromstring(raw)

    # Provenance metadata: when we extracted, and a hash of what we read.
    extracted_at = datetime.now(timezone.utc).isoformat()
    digest = hashlib.sha256(raw.encode()).hexdigest()

    # Every <h3> inside the entry content is an institution heading, except
    # those under the "people" section (handled by parse_martyrs_section).
    claims = []
    for heading in tree.xpath("//div[@class='entry-content']//h3"):
        nearest_h2 = heading.xpath("preceding::h2[1]")
        if nearest_h2 and nearest_h2[0].get("id") == "people":
            continue
        record = parse_institution_section(heading, tree)
        if record:
            claims.append(record)

    people = parse_martyrs_section(tree)

    report = {
        "source": {
            "url": REPORT_URL,
            "title": "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024",
            "publisher": "Librarians and Archivists with Palestine",
            "report_date": "2024-02-01",
            "archived_file": str(source_html),
            "content_hash_sha256": digest,
        },
        "extraction_metadata": {
            "extraction_timestamp": extracted_at,
            "extractor": "lap_gaza_report_extractor.py",
            "ch_annotator_version": "ch_annotator-v1_7_0",
        },
        "institutions": claims,
        "martyred_information_workers": people,
        "statistics": {
            "total_institutions": len(claims),
            "total_martyrs": len(people),
            "institutions_with_ghcid_match": sum(1 for c in claims if c.get("ghcid")),
        }
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(claims)} institutions")
    print(f"Extracted {len(people)} martyred information workers")
    print(f"Output written to {OUTPUT_FILE}")
    # Echo GHCID matches so the mapping coverage can be eyeballed.
    print("\nInstitutions with GHCID matches:")
    for record in claims:
        if record.get("ghcid"):
            print(f" {record['institution_name'][:50]:50} -> {record['ghcid']}")
if __name__ == "__main__":
    main()