#!/usr/bin/env python3
"""
LAP Gaza Report 2024 Claim Extractor

Extracts structured claims about Palestinian heritage institutions from the
LAP (Librarians and Archivists with Palestine) Gaza Report 2024.

Each claim includes XPath provenance for verifiability per CH-Annotator convention.
"""

# Standard library
import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path

# Third-party (HTML parsing)
from lxml import html
from lxml.etree import tostring

# Configuration: source URL, local archive location, and extraction output path.
REPORT_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
ARCHIVE_DIR = Path("/Users/kempersc/apps/glam/data/web/lap_gaza_report_2024")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/extracted/lap_gaza_claims.json")
# Institution mappings: LAP report name -> YAML file basename (GHCID).
# Keys must match the report's <h3> heading text exactly (any trailing
# " (location)" suffix is stripped by the caller before lookup).
INSTITUTION_MAPPINGS = {
    # -- Archives --
    "Central Archives of Gaza City": "PS-GZ-GAZ-A-CAGC",
    "Omari Mosque and Library": "PS-GZ-GAZ-L-GOMAL",

    # -- Libraries --
    "Ataa Library, IBBY Children in Crisis Library": "PS-GZ-BHA-L-ALICCL",
    "Diana Tamari Sabbagh Library": "PS-GZ-GAZ-L-DTSL",
    "Edward Said Library": "PS-GZ-GAZ-L-ESL",
    "Enaim Library": "PS-GZ-GAZ-L-EL",
    "Gaza Municipal Library": "PS-GZ-GAZ-L-GML",
    "Al-Kalima Library and Publishing House": "PS-GZ-GAZ-L-KL",
    "Kana'an Educational Development Institute": "PS-GZ-GAZ-L-KEDI",
    "Lubbud Library": "PS-GZ-GAZ-L-LL",
    "Al-Nahda Library": "PS-GZ-GAZ-L-NL",
    "Samir Mansour Bookshop and Library": "PS-GZ-GAZ-L-SMBAP",
    "Al-Shorouq Al-Daem Library": "PS-GZ-GAZ-L-SDL",

    # -- University Libraries --
    "Al-Aqsa University Library": "PS-GZ-GAZ-L-AUL",
    # NOTE(review): "PS-GZA" prefix differs from the "PS-GZ" used everywhere
    # else — possibly a typo in the YAML basename; confirm against the YAML
    # files before changing.
    "Islamic University of Gaza Library": "PS-GZA-GAZ-L-IUGL",
    "Al-Israa University Library and National Museum": "PS-GZ-GAZ-M-IULANM",
    "Al-Quds Open University Library": "PS-GZ-GAZ-L-QOULG",
    "Jawaharlal Nehru Library, Al-Azhar University": "PS-GZ-GAZ-L-JNLAUG",

    # -- Museums --
    "Akkad Museum": "PS-GZ-GAZ-M-AM",
    "Cultural Kararah Museum": "PS-GZ-KYN-M-CKM",
    "Deir Al-Balah Museum": "PS-GZ-DEB-M-DBM",
    "Ibrahim Abu Sha'ar Heritage Diwan": "PS-GZ-GAZ-M-IASHD",
    "Khudari Museum": "PS-GZ-GAZ-M-KM",
    "Palestinian Costume Museum": "PS-GZ-GAZ-M-PCM",
    "Al-Qarara Cultural Museum": "PS-GZ-KYN-M-QCM",
    "Rafah Museum": "PS-GZ-RAF-M-RM",
    "Shahwan Museum": "PS-GZ-GAZ-M-SM",
}
def get_element_xpath(element) -> str:
    """Build an absolute XPath that locates *element* within its document.

    Walks from the element up to the root via ``getparent()``. A positional
    predicate ``[n]`` (1-based) is added only when the element has same-tag
    siblings, matching how the path was originally recorded.
    """
    segments = []
    node = element
    while node is not None:
        parent = node.getparent()
        if parent is None:
            # Reached the document root; it never needs a position predicate.
            segments.append(node.tag)
            break
        same_tag = [child for child in parent if child.tag == node.tag]
        if len(same_tag) > 1:
            segments.append(f"{node.tag}[{same_tag.index(node) + 1}]")
        else:
            segments.append(node.tag)
        node = parent
    return "/" + "/".join(reversed(segments))
def extract_damage_status(text: str) -> str:
    """Classify a damage description into a coarse status code.

    Checks are ordered most-specific first; the first match wins.
    Returns one of: DESTROYED, SEVERE_DAMAGE, PARTIAL_DAMAGE, DAMAGED,
    FIRE_DAMAGE, or UNKNOWN when no keyword is present.
    """
    text_lower = text.lower()
    # "complete destruction" does not contain the substring "destroyed",
    # so both phrasings must be tested explicitly (merged: both returned
    # DESTROYED in separate branches before).
    if "complete destruction" in text_lower or "destroyed" in text_lower:
        return "DESTROYED"
    if "heavily damaged" in text_lower or "severe damage" in text_lower:
        return "SEVERE_DAMAGE"
    # Fix: check the specific "partial damage" BEFORE the generic "damaged"
    # so a description mentioning both (e.g. "partial damage; roof damaged")
    # is not swallowed by the generic branch.
    if "partial damage" in text_lower:
        return "PARTIAL_DAMAGE"
    if "damaged" in text_lower:
        return "DAMAGED"
    if "fire" in text_lower or "burned" in text_lower:
        return "FIRE_DAMAGE"
    return "UNKNOWN"
def extract_date(text: str) -> str | None:
|
|
"""Extract date from text in various formats."""
|
|
# Try specific date patterns
|
|
patterns = [
|
|
r"(\w+ \d{1,2}, \d{4})", # November 29, 2023
|
|
r"(\d{4}-\d{2}-\d{2})", # 2023-11-29
|
|
r"(October|November|December|January|February|March|April|May) \d{4}", # October 2023
|
|
r"Early (\w+ \d{4})", # Early October 2023
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
return match.group(0)
|
|
return None
|
|
|
|
|
|
def parse_institution_section(h3_element, tree) -> dict | None:
    """Parse one institution entry starting at its ``<h3>`` heading.

    Collects the "Description of damage:" / "Date of damage:" paragraphs and
    any ``<ul>`` source lists found between this heading and the next
    ``<h2>``/``<h3>``.

    Args:
        h3_element: lxml element whose text is the institution name.
        tree: unused; kept so the call signature stays compatible with main().

    Returns:
        A claim dict with XPath provenance, or None if the heading is empty.
    """
    name = h3_element.text_content().strip()
    if not name:
        return None

    description_parts = []
    date_of_damage = None
    damage_status = None
    sources = []

    # Walk following siblings until the next section heading.
    current = h3_element.getnext()
    while current is not None:
        if current.tag in ("h2", "h3"):
            break

        text = current.text_content().strip()

        if text.startswith("Description of damage:"):
            # Fix: removeprefix strips only the leading marker; str.replace
            # would also delete any later occurrence of the marker text
            # inside the description itself.
            desc = text.removeprefix("Description of damage:").strip()
            description_parts.append(desc)
            damage_status = extract_damage_status(desc)
        elif text.startswith("Date of damage:"):
            date_text = text.removeprefix("Date of damage:").strip()
            # Fall back to the raw text when no structured date is found.
            date_of_damage = extract_date(date_text) or date_text
        elif current.tag == "ul":
            # Source citations: one <li> per source, with optional links.
            for li in current.findall(".//li"):
                source_entry = {"text": li.text_content().strip()}
                urls = [a.get("href") for a in li.findall(".//a") if a.get("href")]
                if urls:  # only emit "urls" when at least one href exists
                    source_entry["urls"] = urls
                sources.append(source_entry)

        current = current.getnext()

    return {
        "institution_name": name,
        "damage_status": damage_status,
        "date_of_damage": date_of_damage,
        "description": " ".join(description_parts),
        "sources": sources,
        "xpath": get_element_xpath(h3_element),
        # Strip any " (location)" suffix before GHCID lookup.
        "ghcid": INSTITUTION_MAPPINGS.get(name.split(" (")[0]),
    }
def parse_martyrs_section(tree) -> list[dict]:
    """Parse the martyred information workers section (``<h2 id="people">``).

    Each person is an ``<h3>``; the first plain paragraph after it is taken as
    their role, later paragraphs as details, "Date of death:" lines as the
    death date, and ``<ul>`` lists as source citations.

    Returns:
        A list of person dicts (possibly empty if the section is absent).
    """
    martyrs = []

    # Locate the section heading; nothing to do if the report lacks it.
    heading = tree.xpath("//h2[@id='people']")
    if not heading:
        return martyrs

    current = heading[0].getnext()
    while current is not None:
        if current.tag == "h2":
            # Next top-level section: stop.
            break

        if current.tag == "h3":
            name = current.text_content().strip()
            role = None
            date_of_death = None
            details = []
            sources = []

            sibling = current.getnext()
            while sibling is not None and sibling.tag not in ("h2", "h3"):
                text = sibling.text_content().strip()

                # First plain paragraph = role; subsequent ones = details.
                # "Date of death:" and "Sources:" paragraphs are excluded.
                if sibling.tag == "p" and not text.startswith(
                    ("Date of death:", "Sources:")
                ):
                    if not role:
                        role = text
                    else:
                        details.append(text)

                if text.startswith("Date of death:"):
                    # Fix: removeprefix strips only the leading marker
                    # (str.replace would delete later occurrences too).
                    date_text = text.removeprefix("Date of death:").strip()
                    date_of_death = extract_date(date_text) or date_text

                if sibling.tag == "ul":
                    # Same source-list shape as parse_institution_section.
                    for li in sibling.findall(".//li"):
                        entry = {"text": li.text_content().strip()}
                        urls = [
                            a.get("href")
                            for a in li.findall(".//a")
                            if a.get("href")
                        ]
                        if urls:  # only emit "urls" when an href exists
                            entry["urls"] = urls
                        sources.append(entry)

                sibling = sibling.getnext()

            martyrs.append({
                "name": name,
                "role": role,
                "date_of_death": date_of_death,
                "details": " ".join(details) if details else None,
                "sources": sources,
                "xpath": get_element_xpath(current),
            })

        current = current.getnext()

    return martyrs
def main():
    """Main extraction function.

    Reads the archived report HTML, extracts institution and martyr claims,
    and writes a single JSON document (with source metadata, a SHA-256
    content hash for provenance, and summary statistics) to OUTPUT_FILE.
    """
    # Read archived HTML; the report must have been archived beforehand.
    html_file = ARCHIVE_DIR / "rendered.html"

    if not html_file.exists():
        print(f"ERROR: HTML file not found at {html_file}")
        print("Please archive the LAP report HTML first.")
        return

    with open(html_file, "r", encoding="utf-8") as f:
        html_content = f.read()

    tree = html.fromstring(html_content)

    # Extraction metadata: timezone-aware UTC timestamp plus a SHA-256 hash
    # of the exact HTML bytes, so claims can be tied to one archived snapshot.
    timestamp = datetime.now(timezone.utc).isoformat()
    content_hash = hashlib.sha256(html_content.encode()).hexdigest()

    # Extract institutions from each section
    institutions = []

    # Find all h3 headings (institution names) inside the post body.
    h3_elements = tree.xpath("//div[@class='entry-content']//h3")

    for h3 in h3_elements:
        # Skip h3s under the martyrs section (h2#people); parse_martyrs_section
        # handles those separately.
        parent_h2 = h3.xpath("preceding::h2[1]")
        if parent_h2 and parent_h2[0].get("id") == "people":
            continue

        institution = parse_institution_section(h3, tree)
        if institution:
            institutions.append(institution)

    # Extract martyred information workers
    martyrs = parse_martyrs_section(tree)

    # Build output document: source provenance, extraction metadata,
    # the two claim lists, and summary statistics.
    output = {
        "source": {
            "url": REPORT_URL,
            "title": "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024",
            "publisher": "Librarians and Archivists with Palestine",
            "report_date": "2024-02-01",
            "archived_file": str(html_file),
            "content_hash_sha256": content_hash,
        },
        "extraction_metadata": {
            "extraction_timestamp": timestamp,
            "extractor": "lap_gaza_report_extractor.py",
            "ch_annotator_version": "ch_annotator-v1_7_0",
        },
        "institutions": institutions,
        "martyred_information_workers": martyrs,
        "statistics": {
            "total_institutions": len(institutions),
            "total_martyrs": len(martyrs),
            "institutions_with_ghcid_match": len([i for i in institutions if i.get("ghcid")]),
        }
    }

    # Write output (ensure_ascii=False keeps Arabic names readable in the JSON).
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(institutions)} institutions")
    print(f"Extracted {len(martyrs)} martyred information workers")
    print(f"Output written to {OUTPUT_FILE}")

    # Print institutions with GHCID matches for verification
    print("\nInstitutions with GHCID matches:")
    for inst in institutions:
        if inst.get("ghcid"):
            print(f"  {inst['institution_name'][:50]:50} -> {inst['ghcid']}")


if __name__ == "__main__":
    main()