glam/scripts/extract_palestinian_claims.py
#!/usr/bin/env python3
"""
Extract heritage institution claims from the Palestinian GLAM conversation.
Uses pattern-based parsing to pull structured claims and triples out of the
conversation JSON file containing Palestinian GLAM institution data.
Features:
- Entity claims with GLAM-NER entity types
- Metadata extraction (URLs, contacts, collections, dates)
- Triple statements (subject-predicate-object) for relationships
- Full provenance tracking
Usage:
    python scripts/extract_palestinian_claims.py
"""
import json
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass, field
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Input file
CONVERSATION_PATH = Path(
"docs/reflection/2025-09-23T14-51-08-a150d437-419f-4e91-bfe9-ee2954c213a7-"
"Palestinian_GLAM_resources_and_archives.json"
)
# Output file
OUTPUT_PATH = Path("data/extracted/palestinian_glam_claims.json")
# Subsection headers that the markdown parser also yields as Institution
# records; consumers filter them out via this shared list.
SECTION_HEADER_NAMES = {
    "national libraries", "major public libraries", "academic libraries",
    "major national museums", "regional museums", "major digital platforms",
    "international collections", "jerusalem cultural centers",
}
@dataclass
class Provenance:
"""Provenance information for claims."""
source_type: str
conversation_id: str
extraction_date: str
source_line: Optional[int] = None
confidence: float = 0.9
@dataclass
class EntityClaim:
"""An entity claim with metadata."""
hypernym: str
hyponym: str
text: str
xpath: str
confidence: float
class_uri: Optional[str] = None
notes: Optional[str] = None
provenance: Optional[Provenance] = None
@dataclass
class Triple:
"""A subject-predicate-object triple representing a relationship."""
subject: Dict[str, Any] # {entity_type, text, uri}
predicate: Dict[str, Any] # {type, uri, label}
object: Dict[str, Any] # {entity_type, text, uri, literal_value}
confidence: float = 0.9
provenance: Optional[Provenance] = None
@dataclass
class Institution:
"""A heritage institution with full metadata."""
name: str
name_arabic: Optional[str] = None
institution_type: str = "GRP.HER"
section: Optional[str] = None
# Location
location: Optional[str] = None
city: Optional[str] = None
country: str = "PS" # Palestine
# Identifiers
website: Optional[str] = None
email: Optional[str] = None
phone: Optional[str] = None
# Metadata
established: Optional[str] = None
collections: Optional[str] = None
collections_count: Optional[str] = None
languages: Optional[str] = None
special_collections: Optional[str] = None
digital_access: Optional[str] = None
services: Optional[str] = None
hours: Optional[str] = None
status: Optional[str] = None
# International identifiers
international_status: Optional[str] = None
unesco_status: Optional[str] = None
# Provenance
source_line: Optional[int] = None
# Raw claims
raw_claims: List[Dict[str, Any]] = field(default_factory=list)
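# Illustrative construction (values hypothetical; only `name` is required,
# everything else defaults):
#   Institution(name="Khalidi Library", name_arabic="المكتبة الخالدية",
#               institution_type="GRP.HER.LIB", city="Jerusalem")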
def extract_artifact_content(conversation_data: dict) -> str:
"""
Extract the markdown artifact content from the conversation.
"""
for message in conversation_data.get("chat_messages", []):
for content_block in message.get("content", []):
# Format 1: tool_use with artifacts in input.content
if content_block.get("type") == "tool_use":
if content_block.get("name") == "artifacts":
input_data = content_block.get("input", {})
if input_data.get("type") == "text/markdown":
content = input_data.get("content", "")
if content:
return content
# Format 2: tool_result with artifacts output
if content_block.get("type") == "tool_result":
if content_block.get("name") == "artifacts":
output = content_block.get("output", {})
if isinstance(output, list):
for artifact in output:
if artifact.get("type") == "text/markdown":
return artifact.get("content", "")
return ""
def parse_arabic_name(name: str) -> Tuple[str, Optional[str]]:
"""Extract Arabic name from parentheses if present."""
match = re.search(r'\(([^\)]+)\)', name)
if match:
arabic = match.group(1)
# Check if it contains Arabic characters
if any('\u0600' <= c <= '\u06FF' for c in arabic):
english = name.replace(f"({arabic})", "").strip()
return english, arabic
return name, None
def parse_metadata_line(line: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a metadata line like '- **Field:** Value' or '- **Field**: Value'."""
    # Accept the colon either inside ('**Field:**') or outside ('**Field**:')
    # the bold markers, then strip it from the field name.
    match = re.match(r'-\s*\*\*([^*]+?)\*\*:?\s*(.+)', line)
    if match:
        return match.group(1).rstrip(':').strip(), match.group(2).strip()
    return None, None
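# Both common bullet styles parse to the same pair (illustrative values):
#   parse_metadata_line("- **Location:** Ramallah, West Bank") -> ("Location", "Ramallah, West Bank")
#   parse_metadata_line("- **Location**: Ramallah, West Bank") -> ("Location", "Ramallah, West Bank")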
def determine_institution_type(section: Optional[str]) -> str:
"""Determine institution type from section header."""
if not section:
return "GRP.HER"
section_lower = section.lower()
if "libraries" in section_lower or "library" in section_lower:
return "GRP.HER.LIB"
elif "museums" in section_lower or "museum" in section_lower:
return "GRP.HER.MUS"
elif "archives" in section_lower or "archive" in section_lower:
return "GRP.HER.ARC"
elif "galleries" in section_lower or "cultural" in section_lower:
return "GRP.HER.GAL"
elif "digital" in section_lower:
return "GRP.HER.DIG"
return "GRP.HER"
def extract_url(text: str) -> Optional[str]:
"""Extract URL from text."""
match = re.search(r'https?://[^\s<>\"\)]+', text)
if match:
return match.group(0).rstrip('.,;:')
return None
def extract_email(text: str) -> Optional[str]:
"""Extract email from text."""
match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
if match:
return match.group(0)
return None
def extract_phone(text: str) -> Optional[str]:
"""Extract phone number from text."""
match = re.search(r'\+?[\d\s\-\(\)]{10,}', text)
if match:
return match.group(0).strip()
return None
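# Illustrative extractor behavior (values hypothetical):
#   extract_url("Site: https://example.ps/library.") -> "https://example.ps/library"
#   extract_email("Email: info@example.ps")          -> "info@example.ps"
#   extract_phone("Tel: +970 2 123 4567")            -> "+970 2 123 4567"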
def parse_markdown_to_institutions(
markdown_content: str,
conversation_id: str
) -> List[Institution]:
"""
Parse the markdown artifact to extract structured institution data.
"""
institutions = []
lines = markdown_content.split("\n")
current_institution = None
current_section = None
current_subsection = None
for i, line in enumerate(lines):
line_stripped = line.strip()
# Section headers (## Part I: Libraries, ## Part II: Museums, etc.)
if line_stripped.startswith("## Part") or line_stripped.startswith("## Current"):
current_section = line_stripped
current_subsection = None
continue
# Subsection headers (### National Libraries, ### Academic Libraries, etc.)
if line_stripped.startswith("### ") and not line_stripped.startswith("#### "):
current_subsection = line_stripped.lstrip("#").strip()
continue
# Institution headers (#### Name)
if line_stripped.startswith("#### "):
# Save previous institution
if current_institution:
institutions.append(current_institution)
name = line_stripped.lstrip("#").strip()
english_name, arabic_name = parse_arabic_name(name)
current_institution = Institution(
name=english_name,
name_arabic=arabic_name,
institution_type=determine_institution_type(current_section),
section=current_section,
source_line=i + 1,
)
continue
# Extract metadata from bullet points
if line_stripped.startswith("- **") and current_institution:
            field_name, value = parse_metadata_line(line_stripped)
            if field_name and value:
                field_lower = field_name.lower()
# Map fields to institution attributes
if field_lower == "location":
current_institution.location = value
                    # Naive city heuristic: first word of the last comma-separated segment
if "," in value:
parts = value.split(",")
current_institution.city = parts[-1].strip().split()[0]
elif field_lower == "website" or field_lower == "url":
current_institution.website = extract_url(value) or value
elif field_lower == "contact":
current_institution.email = extract_email(value)
current_institution.phone = extract_phone(value)
elif field_lower == "established" or field_lower == "founded":
current_institution.established = value
elif field_lower == "collections":
current_institution.collections = value
# Extract count if present
count_match = re.search(r'([\d,]+)\+?\s*(items|volumes|books|manuscripts|materials)', value, re.I)
if count_match:
current_institution.collections_count = count_match.group(1).replace(",", "")
elif field_lower == "languages":
current_institution.languages = value
elif field_lower == "special collections":
current_institution.special_collections = value
elif field_lower == "digital access" or field_lower == "digital repository":
current_institution.digital_access = value
elif field_lower == "services":
current_institution.services = value
elif field_lower == "hours" or field_lower == "operating hours":
current_institution.hours = value
elif field_lower == "current status" or field_lower == "status":
current_institution.status = value
elif field_lower == "international status":
current_institution.international_status = value
elif field_lower == "unesco status" or field_lower == "unesco":
current_institution.unesco_status = value
# Store raw claim
                current_institution.raw_claims.append({
                    "field": field_name,
                    "value": value,
                    "line": i + 1,
                })
# Don't forget the last institution
if current_institution:
institutions.append(current_institution)
return institutions
def generate_entity_claims(
institutions: List[Institution],
conversation_id: str
) -> List[Dict[str, Any]]:
"""Generate entity claims from institutions."""
claims = []
extraction_date = datetime.now(timezone.utc).isoformat()
for inst in institutions:
# Skip section headers that got parsed as institutions
if inst.name.lower() in ["national libraries", "major public libraries",
"academic libraries", "major national museums",
"regional museums", "major digital platforms",
"international collections", "jerusalem cultural centers"]:
continue
provenance = {
"source_type": "CONVERSATION_NLP",
"conversation_id": conversation_id,
"extraction_date": extraction_date,
"source_line": inst.source_line,
}
# Main institution claim
claims.append({
"hypernym": "GRP",
"hyponym": inst.institution_type,
"text": inst.name,
"text_arabic": inst.name_arabic,
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]",
"confidence": 0.95,
"class_uri": "glam:HeritageCustodian",
"notes": f"Heritage institution in {inst.section}",
"metadata": {
"location": inst.location,
"city": inst.city,
"country": inst.country,
"website": inst.website,
"email": inst.email,
"phone": inst.phone,
"established": inst.established,
"collections": inst.collections,
"collections_count": inst.collections_count,
"languages": inst.languages,
"special_collections": inst.special_collections,
"digital_access": inst.digital_access,
"services": inst.services,
"hours": inst.hours,
"status": inst.status,
"international_status": inst.international_status,
"unesco_status": inst.unesco_status,
},
"provenance": provenance,
})
# Location claim if present
if inst.city:
claims.append({
"hypernym": "TOP",
"hyponym": "TOP.SET",
"text": inst.city,
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/location",
"confidence": 0.90,
"class_uri": "schema:Place",
"notes": f"Location of {inst.name}",
"related_entity": inst.name,
"provenance": provenance,
})
# URL claim if present
if inst.website:
claims.append({
"hypernym": "ID",
"hyponym": "ID.URL",
"text": inst.website,
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/website",
"confidence": 0.95,
"class_uri": "schema:URL",
"notes": f"Website of {inst.name}",
"related_entity": inst.name,
"provenance": provenance,
})
# Date claim if established date present
if inst.established:
claims.append({
"hypernym": "TMP",
"hyponym": "TMP.DAT",
"text": inst.established,
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/established",
"confidence": 0.85,
"class_uri": "schema:Date",
"notes": f"Founding date of {inst.name}",
"related_entity": inst.name,
"provenance": provenance,
})
return claims
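# Triples below follow subject-predicate-object form; main() renders them as,
# e.g. (illustrative): (Khalidi Library) --[located in]--> (Jerusalem)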
def generate_triples(
institutions: List[Institution],
conversation_id: str
) -> List[Dict[str, Any]]:
"""Generate triple statements from institutions."""
triples = []
extraction_date = datetime.now(timezone.utc).isoformat()
for inst in institutions:
# Skip section headers
        if inst.name.lower() in SECTION_HEADER_NAMES:
            continue
provenance = {
"source_type": "CONVERSATION_NLP",
"conversation_id": conversation_id,
"extraction_date": extraction_date,
"source_line": inst.source_line,
}
# Subject (the institution)
subject = {
"entity_type": inst.institution_type,
"text": inst.name,
"uri": f"glam:institution/{inst.name.lower().replace(' ', '_').replace('(', '').replace(')', '')}",
}
# Triple: Institution -> located_in -> City
if inst.city:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.SPA.LOC",
"uri": "schema:location",
"label": "located in",
},
"object": {
"entity_type": "TOP.SET",
"text": inst.city,
"uri": f"geonames:{inst.city.lower()}",
},
"confidence": 0.90,
"provenance": provenance,
})
# Triple: Institution -> located_in -> Country (Palestine)
triples.append({
"subject": subject,
"predicate": {
"type": "REL.SPA.LOC",
"uri": "schema:containedInPlace",
"label": "located in country",
},
"object": {
"entity_type": "TOP.CTY",
"text": "Palestine",
"uri": "wd:Q219060", # Wikidata ID for Palestine
},
"confidence": 0.95,
"provenance": provenance,
})
# Triple: Institution -> instance_of -> Institution Type
type_labels = {
"GRP.HER.LIB": ("Library", "wd:Q7075"),
"GRP.HER.MUS": ("Museum", "wd:Q33506"),
"GRP.HER.ARC": ("Archive", "wd:Q166118"),
"GRP.HER.GAL": ("Gallery", "wd:Q1007870"),
"GRP.HER.DIG": ("Digital Platform", "wd:Q35127"),
"GRP.HER": ("Heritage Institution", "glam:HeritageCustodian"),
}
type_label, type_uri = type_labels.get(inst.institution_type, ("Heritage Institution", "glam:HeritageCustodian"))
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ONT.ISA",
"uri": "rdf:type",
"label": "instance of",
},
"object": {
"entity_type": "THG.CON",
"text": type_label,
"uri": type_uri,
},
"confidence": 0.95,
"provenance": provenance,
})
# Triple: Institution -> has_website -> URL
if inst.website:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ID.URL",
"uri": "schema:url",
"label": "has website",
},
"object": {
"entity_type": "ID.URL",
"text": inst.website,
"literal_value": inst.website,
},
"confidence": 0.95,
"provenance": provenance,
})
# Triple: Institution -> founded_in -> Year
if inst.established:
# Extract year
year_match = re.search(r'\b(1\d{3}|20\d{2})\b', inst.established)
if year_match:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.TMP.CRE",
"uri": "schema:foundingDate",
"label": "founded in",
},
"object": {
"entity_type": "TMP.DAT",
"text": year_match.group(1),
"literal_value": year_match.group(1),
},
"confidence": 0.85,
"provenance": provenance,
})
# Triple: Institution -> has_collection_size -> Count
if inst.collections_count:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.QTY.CNT",
"uri": "schema:size",
"label": "has collection size",
},
"object": {
"entity_type": "QTY.CNT",
"text": f"{inst.collections_count} items",
"literal_value": int(inst.collections_count),
},
"confidence": 0.80,
"provenance": provenance,
})
# Triple: Institution -> has_contact_email -> Email
if inst.email:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ID.CON",
"uri": "schema:email",
"label": "has email",
},
"object": {
"entity_type": "ID.CON",
"text": inst.email,
"literal_value": inst.email,
},
"confidence": 0.90,
"provenance": provenance,
})
# Triple: Institution -> has_languages -> Languages
if inst.languages:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ATT.LNG",
"uri": "schema:inLanguage",
"label": "uses languages",
},
"object": {
"entity_type": "THG.LNG",
"text": inst.languages,
"literal_value": inst.languages,
},
"confidence": 0.85,
"provenance": provenance,
})
# Triple: Institution -> has_status -> Status (for Gaza institutions)
if inst.status:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.STA.CUR",
"uri": "schema:status",
"label": "has current status",
},
"object": {
"entity_type": "THG.STA",
"text": inst.status,
"literal_value": inst.status,
},
"confidence": 0.90,
"provenance": provenance,
})
# Triple: Institution -> member_of -> International Organization
if inst.international_status:
# Parse organizations from international status
if "IFLA" in inst.international_status:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ORG.MEM",
"uri": "org:memberOf",
"label": "member of",
},
"object": {
"entity_type": "GRP.ORG",
"text": "IFLA",
"uri": "wd:Q46550", # International Federation of Library Associations
},
"confidence": 0.90,
"provenance": provenance,
})
if "UNESCO" in inst.international_status or inst.unesco_status:
triples.append({
"subject": subject,
"predicate": {
"type": "REL.ORG.REC",
"uri": "schema:award",
"label": "recognized by",
},
"object": {
"entity_type": "GRP.GOV",
"text": "UNESCO",
"uri": "wd:Q7809",
},
"confidence": 0.85,
"provenance": provenance,
})
return triples
def main():
"""Main extraction workflow."""
logger.info("=" * 80)
logger.info("Palestinian GLAM Claims Extraction - Enhanced with Triples")
logger.info("=" * 80)
# Check input file exists
if not CONVERSATION_PATH.exists():
logger.error(f"Conversation file not found: {CONVERSATION_PATH}")
sys.exit(1)
logger.info(f"Reading conversation: {CONVERSATION_PATH.name}")
# Load conversation JSON
with open(CONVERSATION_PATH, "r", encoding="utf-8") as f:
conversation_data = json.load(f)
conversation_id = conversation_data.get("uuid", "unknown")
conversation_name = conversation_data.get("name", "Unknown")
logger.info(f"Conversation: {conversation_name}")
logger.info(f"UUID: {conversation_id}")
# Extract artifact content
logger.info("Extracting artifact content...")
artifact_content = extract_artifact_content(conversation_data)
if artifact_content:
logger.info(f"Found artifact with {len(artifact_content):,} characters")
else:
logger.error("No artifact content found!")
sys.exit(1)
# Parse markdown to institutions
logger.info("Parsing markdown to extract institutions...")
institutions = parse_markdown_to_institutions(artifact_content, conversation_id)
logger.info(f"Found {len(institutions)} institutions")
# Generate entity claims
logger.info("Generating entity claims...")
claims = generate_entity_claims(institutions, conversation_id)
logger.info(f"Generated {len(claims)} entity claims")
# Generate triples
logger.info("Generating triple statements...")
triples = generate_triples(institutions, conversation_id)
logger.info(f"Generated {len(triples)} triples")
# Count by type
claim_type_counts = {}
for claim in claims:
hyponym = claim.get("hyponym", "UNKNOWN")
claim_type_counts[hyponym] = claim_type_counts.get(hyponym, 0) + 1
triple_type_counts = {}
for triple in triples:
pred_type = triple["predicate"]["type"]
triple_type_counts[pred_type] = triple_type_counts.get(pred_type, 0) + 1
logger.info("Entity claims by type:")
for hyponym, count in sorted(claim_type_counts.items()):
logger.info(f" {hyponym}: {count}")
logger.info("Triples by predicate type:")
for pred_type, count in sorted(triple_type_counts.items()):
logger.info(f" {pred_type}: {count}")
# Prepare output
output = {
"metadata": {
"source_file": str(CONVERSATION_PATH),
"conversation_id": conversation_id,
"conversation_name": conversation_name,
"extraction_date": datetime.now(timezone.utc).isoformat(),
"institution_count": len([i for i in institutions if i.name.lower() not in
["national libraries", "major public libraries", "academic libraries",
"major national museums", "regional museums", "major digital platforms",
"international collections", "jerusalem cultural centers"]]),
"total_entity_claims": len(claims),
"total_triples": len(triples),
"claim_type_counts": claim_type_counts,
"triple_type_counts": triple_type_counts,
},
"institutions": [
{
"name": inst.name,
"name_arabic": inst.name_arabic,
"type": inst.institution_type,
"location": inst.location,
"city": inst.city,
"website": inst.website,
"email": inst.email,
"phone": inst.phone,
"established": inst.established,
"collections": inst.collections,
"collections_count": inst.collections_count,
"status": inst.status,
}
for inst in institutions
            if inst.name.lower() not in SECTION_HEADER_NAMES
],
"entity_claims": claims,
"triples": triples,
}
# Create output directory
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
# Write output
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
json.dump(output, f, indent=2, ensure_ascii=False)
logger.info(f"Output written to: {OUTPUT_PATH}")
logger.info("=" * 80)
# Show sample institutions
logger.info("Sample institutions with metadata:")
for inst in institutions[:3]:
if inst.name.lower() not in ["national libraries", "major public libraries"]:
logger.info(f" - {inst.name}")
if inst.website:
logger.info(f" Website: {inst.website}")
if inst.established:
logger.info(f" Established: {inst.established}")
if inst.collections_count:
logger.info(f" Collections: {inst.collections_count} items")
# Show sample triples
logger.info("Sample triples:")
for triple in triples[:5]:
subj = triple["subject"]["text"]
pred = triple["predicate"]["label"]
obj = triple["object"]["text"]
logger.info(f" ({subj}) --[{pred}]--> ({obj})")
return output
if __name__ == "__main__":
main()
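# Downstream use (illustrative; assumes the script has been run from the repo
# root so OUTPUT_PATH exists):
#   import json
#   from pathlib import Path
#   data = json.loads(Path("data/extracted/palestinian_glam_claims.json").read_text(encoding="utf-8"))
#   print(data["metadata"]["total_entity_claims"], data["metadata"]["total_triples"])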