#!/usr/bin/env python3 """ Extract heritage institution claims from Palestinian GLAM conversation. Uses pattern-based extraction to extract structured claims and triples from the conversation JSON file containing Palestinian GLAM institution data. Features: - Entity claims with GLAM-NER entity types - Metadata extraction (URLs, contacts, collections, dates) - Triple statements (subject-predicate-object) for relationships - Full provenance tracking Usage: python scripts/extract_palestinian_claims.py """ import json import logging import re import sys from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from dataclasses import dataclass, field, asdict # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Input file CONVERSATION_PATH = Path( "docs/reflection/2025-09-23T14-51-08-a150d437-419f-4e91-bfe9-ee2954c213a7-" "Palestinian_GLAM_resources_and_archives.json" ) # Output file OUTPUT_PATH = Path("data/extracted/palestinian_glam_claims.json") @dataclass class Provenance: """Provenance information for claims.""" source_type: str conversation_id: str extraction_date: str source_line: Optional[int] = None confidence: float = 0.9 @dataclass class EntityClaim: """An entity claim with metadata.""" hypernym: str hyponym: str text: str xpath: str confidence: float class_uri: Optional[str] = None notes: Optional[str] = None provenance: Optional[Provenance] = None @dataclass class Triple: """A subject-predicate-object triple representing a relationship.""" subject: Dict[str, Any] # {entity_type, text, uri} predicate: Dict[str, Any] # {type, uri, label} object: Dict[str, Any] # {entity_type, text, uri, literal_value} confidence: float = 0.9 provenance: Optional[Provenance] = None @dataclass class Institution: """A heritage institution with full metadata.""" name: str name_arabic: Optional[str] = None institution_type: str = "GRP.HER" section: Optional[str] = None # Location location: Optional[str] = None city: Optional[str] = None country: str = "PS" # Palestine # Identifiers website: Optional[str] = None email: Optional[str] = None phone: Optional[str] = None # Metadata established: Optional[str] = None collections: Optional[str] = None collections_count: Optional[str] = None languages: Optional[str] = None special_collections: Optional[str] = None digital_access: Optional[str] = None services: Optional[str] = None hours: Optional[str] = None status: Optional[str] = None # International identifiers international_status: Optional[str] = None unesco_status: Optional[str] = None # Provenance source_line: Optional[int] = None # Raw claims raw_claims: List[Dict[str, Any]] = field(default_factory=list) def extract_artifact_content(conversation_data: dict) -> str: """ Extract the markdown artifact content from the conversation. """ for message in conversation_data.get("chat_messages", []): for content_block in message.get("content", []): # Format 1: tool_use with artifacts in input.content if content_block.get("type") == "tool_use": if content_block.get("name") == "artifacts": input_data = content_block.get("input", {}) if input_data.get("type") == "text/markdown": content = input_data.get("content", "") if content: return content # Format 2: tool_result with artifacts output if content_block.get("type") == "tool_result": if content_block.get("name") == "artifacts": output = content_block.get("output", {}) if isinstance(output, list): for artifact in output: if artifact.get("type") == "text/markdown": return artifact.get("content", "") return "" def parse_arabic_name(name: str) -> Tuple[str, Optional[str]]: """Extract Arabic name from parentheses if present.""" match = re.search(r'\(([^\)]+)\)', name) if match: arabic = match.group(1) # Check if it contains Arabic characters if any('\u0600' <= c <= '\u06FF' for c in arabic): english = name.replace(f"({arabic})", "").strip() return english, arabic return name, None def parse_metadata_line(line: str) -> Tuple[Optional[str], Optional[str]]: """Parse a metadata line like '- **Field:** Value'.""" match = re.match(r'-\s*\*\*([^:*]+)\*\*:\s*(.+)', line) if match: return match.group(1).strip(), match.group(2).strip() return None, None def determine_institution_type(section: Optional[str]) -> str: """Determine institution type from section header.""" if not section: return "GRP.HER" section_lower = section.lower() if "libraries" in section_lower or "library" in section_lower: return "GRP.HER.LIB" elif "museums" in section_lower or "museum" in section_lower: return "GRP.HER.MUS" elif "archives" in section_lower or "archive" in section_lower: return "GRP.HER.ARC" elif "galleries" in section_lower or "cultural" in section_lower: return "GRP.HER.GAL" elif "digital" in section_lower: return "GRP.HER.DIG" return "GRP.HER" def extract_url(text: str) -> Optional[str]: """Extract URL from text.""" match = re.search(r'https?://[^\s<>\"\)]+', text) if match: return match.group(0).rstrip('.,;:') return None def extract_email(text: str) -> Optional[str]: """Extract email from text.""" match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text) if match: return match.group(0) return None def extract_phone(text: str) -> Optional[str]: """Extract phone number from text.""" match = re.search(r'\+?[\d\s\-\(\)]{10,}', text) if match: return match.group(0).strip() return None def parse_markdown_to_institutions( markdown_content: str, conversation_id: str ) -> List[Institution]: """ Parse the markdown artifact to extract structured institution data. """ institutions = [] lines = markdown_content.split("\n") current_institution = None current_section = None current_subsection = None for i, line in enumerate(lines): line_stripped = line.strip() # Section headers (## Part I: Libraries, ## Part II: Museums, etc.) if line_stripped.startswith("## Part") or line_stripped.startswith("## Current"): current_section = line_stripped current_subsection = None continue # Subsection headers (### National Libraries, ### Academic Libraries, etc.) if line_stripped.startswith("### ") and not line_stripped.startswith("#### "): current_subsection = line_stripped.lstrip("#").strip() continue # Institution headers (#### Name) if line_stripped.startswith("#### "): # Save previous institution if current_institution: institutions.append(current_institution) name = line_stripped.lstrip("#").strip() english_name, arabic_name = parse_arabic_name(name) current_institution = Institution( name=english_name, name_arabic=arabic_name, institution_type=determine_institution_type(current_section), section=current_section, source_line=i + 1, ) continue # Extract metadata from bullet points if line_stripped.startswith("- **") and current_institution: field, value = parse_metadata_line(line_stripped) if field and value: field_lower = field.lower() # Map fields to institution attributes if field_lower == "location": current_institution.location = value # Try to extract city if "," in value: parts = value.split(",") current_institution.city = parts[-1].strip().split()[0] elif field_lower == "website" or field_lower == "url": current_institution.website = extract_url(value) or value elif field_lower == "contact": current_institution.email = extract_email(value) current_institution.phone = extract_phone(value) elif field_lower == "established" or field_lower == "founded": current_institution.established = value elif field_lower == "collections": current_institution.collections = value # Extract count if present count_match = re.search(r'([\d,]+)\+?\s*(items|volumes|books|manuscripts|materials)', value, re.I) if count_match: current_institution.collections_count = count_match.group(1).replace(",", "") elif field_lower == "languages": current_institution.languages = value elif field_lower == "special collections": current_institution.special_collections = value elif field_lower == "digital access" or field_lower == "digital repository": current_institution.digital_access = value elif field_lower == "services": current_institution.services = value elif field_lower == "hours" or field_lower == "operating hours": current_institution.hours = value elif field_lower == "current status" or field_lower == "status": current_institution.status = value elif field_lower == "international status": current_institution.international_status = value elif field_lower == "unesco status" or field_lower == "unesco": current_institution.unesco_status = value # Store raw claim current_institution.raw_claims.append({ "field": field, "value": value, "line": i + 1, }) # Don't forget the last institution if current_institution: institutions.append(current_institution) return institutions def generate_entity_claims( institutions: List[Institution], conversation_id: str ) -> List[Dict[str, Any]]: """Generate entity claims from institutions.""" claims = [] extraction_date = datetime.now(timezone.utc).isoformat() for inst in institutions: # Skip section headers that got parsed as institutions if inst.name.lower() in ["national libraries", "major public libraries", "academic libraries", "major national museums", "regional museums", "major digital platforms", "international collections", "jerusalem cultural centers"]: continue provenance = { "source_type": "CONVERSATION_NLP", "conversation_id": conversation_id, "extraction_date": extraction_date, "source_line": inst.source_line, } # Main institution claim claims.append({ "hypernym": "GRP", "hyponym": inst.institution_type, "text": inst.name, "text_arabic": inst.name_arabic, "xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]", "confidence": 0.95, "class_uri": "glam:HeritageCustodian", "notes": f"Heritage institution in {inst.section}", "metadata": { "location": inst.location, "city": inst.city, "country": inst.country, "website": inst.website, "email": inst.email, "phone": inst.phone, "established": inst.established, "collections": inst.collections, "collections_count": inst.collections_count, "languages": inst.languages, "special_collections": inst.special_collections, "digital_access": inst.digital_access, "services": inst.services, "hours": inst.hours, "status": inst.status, "international_status": inst.international_status, "unesco_status": inst.unesco_status, }, "provenance": provenance, }) # Location claim if present if inst.city: claims.append({ "hypernym": "TOP", "hyponym": "TOP.SET", "text": inst.city, "xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/location", "confidence": 0.90, "class_uri": "schema:Place", "notes": f"Location of {inst.name}", "related_entity": inst.name, "provenance": provenance, }) # URL claim if present if inst.website: claims.append({ "hypernym": "ID", "hyponym": "ID.URL", "text": inst.website, "xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/website", "confidence": 0.95, "class_uri": "schema:URL", "notes": f"Website of {inst.name}", "related_entity": inst.name, "provenance": provenance, }) # Date claim if established date present if inst.established: claims.append({ "hypernym": "TMP", "hyponym": "TMP.DAT", "text": inst.established, "xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/established", "confidence": 0.85, "class_uri": "schema:Date", "notes": f"Founding date of {inst.name}", "related_entity": inst.name, "provenance": provenance, }) return claims def generate_triples( institutions: List[Institution], conversation_id: str ) -> List[Dict[str, Any]]: """Generate triple statements from institutions.""" triples = [] extraction_date = datetime.now(timezone.utc).isoformat() for inst in institutions: # Skip section headers if inst.name.lower() in ["national libraries", "major public libraries", "academic libraries", "major national museums", "regional museums", "major digital platforms", "international collections", "jerusalem cultural centers"]: continue provenance = { "source_type": "CONVERSATION_NLP", "conversation_id": conversation_id, "extraction_date": extraction_date, "source_line": inst.source_line, } # Subject (the institution) subject = { "entity_type": inst.institution_type, "text": inst.name, "uri": f"glam:institution/{inst.name.lower().replace(' ', '_').replace('(', '').replace(')', '')}", } # Triple: Institution -> located_in -> City if inst.city: triples.append({ "subject": subject, "predicate": { "type": "REL.SPA.LOC", "uri": "schema:location", "label": "located in", }, "object": { "entity_type": "TOP.SET", "text": inst.city, "uri": f"geonames:{inst.city.lower()}", }, "confidence": 0.90, "provenance": provenance, }) # Triple: Institution -> located_in -> Country (Palestine) triples.append({ "subject": subject, "predicate": { "type": "REL.SPA.LOC", "uri": "schema:containedInPlace", "label": "located in country", }, "object": { "entity_type": "TOP.CTY", "text": "Palestine", "uri": "wd:Q219060", # Wikidata ID for Palestine }, "confidence": 0.95, "provenance": provenance, }) # Triple: Institution -> instance_of -> Institution Type type_labels = { "GRP.HER.LIB": ("Library", "wd:Q7075"), "GRP.HER.MUS": ("Museum", "wd:Q33506"), "GRP.HER.ARC": ("Archive", "wd:Q166118"), "GRP.HER.GAL": ("Gallery", "wd:Q1007870"), "GRP.HER.DIG": ("Digital Platform", "wd:Q35127"), "GRP.HER": ("Heritage Institution", "glam:HeritageCustodian"), } type_label, type_uri = type_labels.get(inst.institution_type, ("Heritage Institution", "glam:HeritageCustodian")) triples.append({ "subject": subject, "predicate": { "type": "REL.ONT.ISA", "uri": "rdf:type", "label": "instance of", }, "object": { "entity_type": "THG.CON", "text": type_label, "uri": type_uri, }, "confidence": 0.95, "provenance": provenance, }) # Triple: Institution -> has_website -> URL if inst.website: triples.append({ "subject": subject, "predicate": { "type": "REL.ID.URL", "uri": "schema:url", "label": "has website", }, "object": { "entity_type": "ID.URL", "text": inst.website, "literal_value": inst.website, }, "confidence": 0.95, "provenance": provenance, }) # Triple: Institution -> founded_in -> Year if inst.established: # Extract year year_match = re.search(r'\b(1\d{3}|20\d{2})\b', inst.established) if year_match: triples.append({ "subject": subject, "predicate": { "type": "REL.TMP.CRE", "uri": "schema:foundingDate", "label": "founded in", }, "object": { "entity_type": "TMP.DAT", "text": year_match.group(1), "literal_value": year_match.group(1), }, "confidence": 0.85, "provenance": provenance, }) # Triple: Institution -> has_collection_size -> Count if inst.collections_count: triples.append({ "subject": subject, "predicate": { "type": "REL.QTY.CNT", "uri": "schema:size", "label": "has collection size", }, "object": { "entity_type": "QTY.CNT", "text": f"{inst.collections_count} items", "literal_value": int(inst.collections_count), }, "confidence": 0.80, "provenance": provenance, }) # Triple: Institution -> has_contact_email -> Email if inst.email: triples.append({ "subject": subject, "predicate": { "type": "REL.ID.CON", "uri": "schema:email", "label": "has email", }, "object": { "entity_type": "ID.CON", "text": inst.email, "literal_value": inst.email, }, "confidence": 0.90, "provenance": provenance, }) # Triple: Institution -> has_languages -> Languages if inst.languages: triples.append({ "subject": subject, "predicate": { "type": "REL.ATT.LNG", "uri": "schema:inLanguage", "label": "uses languages", }, "object": { "entity_type": "THG.LNG", "text": inst.languages, "literal_value": inst.languages, }, "confidence": 0.85, "provenance": provenance, }) # Triple: Institution -> has_status -> Status (for Gaza institutions) if inst.status: triples.append({ "subject": subject, "predicate": { "type": "REL.STA.CUR", "uri": "schema:status", "label": "has current status", }, "object": { "entity_type": "THG.STA", "text": inst.status, "literal_value": inst.status, }, "confidence": 0.90, "provenance": provenance, }) # Triple: Institution -> member_of -> International Organization if inst.international_status: # Parse organizations from international status if "IFLA" in inst.international_status: triples.append({ "subject": subject, "predicate": { "type": "REL.ORG.MEM", "uri": "org:memberOf", "label": "member of", }, "object": { "entity_type": "GRP.ORG", "text": "IFLA", "uri": "wd:Q46550", # International Federation of Library Associations }, "confidence": 0.90, "provenance": provenance, }) if "UNESCO" in inst.international_status or inst.unesco_status: triples.append({ "subject": subject, "predicate": { "type": "REL.ORG.REC", "uri": "schema:award", "label": "recognized by", }, "object": { "entity_type": "GRP.GOV", "text": "UNESCO", "uri": "wd:Q7809", }, "confidence": 0.85, "provenance": provenance, }) return triples def main(): """Main extraction workflow.""" logger.info("=" * 80) logger.info("Palestinian GLAM Claims Extraction - Enhanced with Triples") logger.info("=" * 80) # Check input file exists if not CONVERSATION_PATH.exists(): logger.error(f"Conversation file not found: {CONVERSATION_PATH}") sys.exit(1) logger.info(f"Reading conversation: {CONVERSATION_PATH.name}") # Load conversation JSON with open(CONVERSATION_PATH, "r", encoding="utf-8") as f: conversation_data = json.load(f) conversation_id = conversation_data.get("uuid", "unknown") conversation_name = conversation_data.get("name", "Unknown") logger.info(f"Conversation: {conversation_name}") logger.info(f"UUID: {conversation_id}") # Extract artifact content logger.info("Extracting artifact content...") artifact_content = extract_artifact_content(conversation_data) if artifact_content: logger.info(f"Found artifact with {len(artifact_content):,} characters") else: logger.error("No artifact content found!") sys.exit(1) # Parse markdown to institutions logger.info("Parsing markdown to extract institutions...") institutions = parse_markdown_to_institutions(artifact_content, conversation_id) logger.info(f"Found {len(institutions)} institutions") # Generate entity claims logger.info("Generating entity claims...") claims = generate_entity_claims(institutions, conversation_id) logger.info(f"Generated {len(claims)} entity claims") # Generate triples logger.info("Generating triple statements...") triples = generate_triples(institutions, conversation_id) logger.info(f"Generated {len(triples)} triples") # Count by type claim_type_counts = {} for claim in claims: hyponym = claim.get("hyponym", "UNKNOWN") claim_type_counts[hyponym] = claim_type_counts.get(hyponym, 0) + 1 triple_type_counts = {} for triple in triples: pred_type = triple["predicate"]["type"] triple_type_counts[pred_type] = triple_type_counts.get(pred_type, 0) + 1 logger.info("Entity claims by type:") for hyponym, count in sorted(claim_type_counts.items()): logger.info(f" {hyponym}: {count}") logger.info("Triples by predicate type:") for pred_type, count in sorted(triple_type_counts.items()): logger.info(f" {pred_type}: {count}") # Prepare output output = { "metadata": { "source_file": str(CONVERSATION_PATH), "conversation_id": conversation_id, "conversation_name": conversation_name, "extraction_date": datetime.now(timezone.utc).isoformat(), "institution_count": len([i for i in institutions if i.name.lower() not in ["national libraries", "major public libraries", "academic libraries", "major national museums", "regional museums", "major digital platforms", "international collections", "jerusalem cultural centers"]]), "total_entity_claims": len(claims), "total_triples": len(triples), "claim_type_counts": claim_type_counts, "triple_type_counts": triple_type_counts, }, "institutions": [ { "name": inst.name, "name_arabic": inst.name_arabic, "type": inst.institution_type, "location": inst.location, "city": inst.city, "website": inst.website, "email": inst.email, "phone": inst.phone, "established": inst.established, "collections": inst.collections, "collections_count": inst.collections_count, "status": inst.status, } for inst in institutions if inst.name.lower() not in ["national libraries", "major public libraries", "academic libraries", "major national museums", "regional museums", "major digital platforms", "international collections", "jerusalem cultural centers"] ], "entity_claims": claims, "triples": triples, } # Create output directory OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) # Write output with open(OUTPUT_PATH, "w", encoding="utf-8") as f: json.dump(output, f, indent=2, ensure_ascii=False) logger.info(f"Output written to: {OUTPUT_PATH}") logger.info("=" * 80) # Show sample institutions logger.info("Sample institutions with metadata:") for inst in institutions[:3]: if inst.name.lower() not in ["national libraries", "major public libraries"]: logger.info(f" - {inst.name}") if inst.website: logger.info(f" Website: {inst.website}") if inst.established: logger.info(f" Established: {inst.established}") if inst.collections_count: logger.info(f" Collections: {inst.collections_count} items") # Show sample triples logger.info("Sample triples:") for triple in triples[:5]: subj = triple["subject"]["text"] pred = triple["predicate"]["label"] obj = triple["object"]["text"] logger.info(f" ({subj}) --[{pred}]--> ({obj})") return output if __name__ == "__main__": main()