- Added `extract_palestinian_claims.py`, a script for extracting heritage-institution claims and subject-predicate-object triples from a Palestinian GLAM conversation export. - Uses pattern-based (regex) extraction over the conversation's markdown artifact. - Includes functions for locating the markdown artifact, parsing institution sections and metadata bullets (URLs, contacts, collections, dates), and generating entity claims with GLAM-NER entity types. - Implements full provenance tracking and comprehensive logging of extraction counts by claim and predicate type. - Results and statistics are saved in JSON format for further analysis.
792 lines · 29 KiB · Python
#!/usr/bin/env python3
"""
Extract heritage institution claims from Palestinian GLAM conversation.

Uses pattern-based extraction to extract structured claims and triples
from the conversation JSON file containing Palestinian GLAM institution data.

Features:
- Entity claims with GLAM-NER entity types
- Metadata extraction (URLs, contacts, collections, dates)
- Triple statements (subject-predicate-object) for relationships
- Full provenance tracking

Usage:
    python scripts/extract_palestinian_claims.py
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass, field, asdict
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Input file
|
|
CONVERSATION_PATH = Path(
|
|
"docs/reflection/2025-09-23T14-51-08-a150d437-419f-4e91-bfe9-ee2954c213a7-"
|
|
"Palestinian_GLAM_resources_and_archives.json"
|
|
)
|
|
|
|
# Output file
|
|
OUTPUT_PATH = Path("data/extracted/palestinian_glam_claims.json")
|
|
|
|
|
|
@dataclass
|
|
class Provenance:
|
|
"""Provenance information for claims."""
|
|
source_type: str
|
|
conversation_id: str
|
|
extraction_date: str
|
|
source_line: Optional[int] = None
|
|
confidence: float = 0.9
|
|
|
|
|
|
@dataclass
|
|
class EntityClaim:
|
|
"""An entity claim with metadata."""
|
|
hypernym: str
|
|
hyponym: str
|
|
text: str
|
|
xpath: str
|
|
confidence: float
|
|
class_uri: Optional[str] = None
|
|
notes: Optional[str] = None
|
|
provenance: Optional[Provenance] = None
|
|
|
|
|
|
@dataclass
|
|
class Triple:
|
|
"""A subject-predicate-object triple representing a relationship."""
|
|
subject: Dict[str, Any] # {entity_type, text, uri}
|
|
predicate: Dict[str, Any] # {type, uri, label}
|
|
object: Dict[str, Any] # {entity_type, text, uri, literal_value}
|
|
confidence: float = 0.9
|
|
provenance: Optional[Provenance] = None
|
|
|
|
|
|
@dataclass
|
|
class Institution:
|
|
"""A heritage institution with full metadata."""
|
|
name: str
|
|
name_arabic: Optional[str] = None
|
|
institution_type: str = "GRP.HER"
|
|
section: Optional[str] = None
|
|
|
|
# Location
|
|
location: Optional[str] = None
|
|
city: Optional[str] = None
|
|
country: str = "PS" # Palestine
|
|
|
|
# Identifiers
|
|
website: Optional[str] = None
|
|
email: Optional[str] = None
|
|
phone: Optional[str] = None
|
|
|
|
# Metadata
|
|
established: Optional[str] = None
|
|
collections: Optional[str] = None
|
|
collections_count: Optional[str] = None
|
|
languages: Optional[str] = None
|
|
special_collections: Optional[str] = None
|
|
digital_access: Optional[str] = None
|
|
services: Optional[str] = None
|
|
hours: Optional[str] = None
|
|
status: Optional[str] = None
|
|
|
|
# International identifiers
|
|
international_status: Optional[str] = None
|
|
unesco_status: Optional[str] = None
|
|
|
|
# Provenance
|
|
source_line: Optional[int] = None
|
|
|
|
# Raw claims
|
|
raw_claims: List[Dict[str, Any]] = field(default_factory=list)
|
|
|
|
|
|
def extract_artifact_content(conversation_data: dict) -> str:
|
|
"""
|
|
Extract the markdown artifact content from the conversation.
|
|
"""
|
|
for message in conversation_data.get("chat_messages", []):
|
|
for content_block in message.get("content", []):
|
|
# Format 1: tool_use with artifacts in input.content
|
|
if content_block.get("type") == "tool_use":
|
|
if content_block.get("name") == "artifacts":
|
|
input_data = content_block.get("input", {})
|
|
if input_data.get("type") == "text/markdown":
|
|
content = input_data.get("content", "")
|
|
if content:
|
|
return content
|
|
|
|
# Format 2: tool_result with artifacts output
|
|
if content_block.get("type") == "tool_result":
|
|
if content_block.get("name") == "artifacts":
|
|
output = content_block.get("output", {})
|
|
if isinstance(output, list):
|
|
for artifact in output:
|
|
if artifact.get("type") == "text/markdown":
|
|
return artifact.get("content", "")
|
|
return ""
|
|
|
|
|
|
def parse_arabic_name(name: str) -> Tuple[str, Optional[str]]:
|
|
"""Extract Arabic name from parentheses if present."""
|
|
match = re.search(r'\(([^\)]+)\)', name)
|
|
if match:
|
|
arabic = match.group(1)
|
|
# Check if it contains Arabic characters
|
|
if any('\u0600' <= c <= '\u06FF' for c in arabic):
|
|
english = name.replace(f"({arabic})", "").strip()
|
|
return english, arabic
|
|
return name, None
|
|
|
|
|
|
def parse_metadata_line(line: str) -> Tuple[Optional[str], Optional[str]]:
|
|
"""Parse a metadata line like '- **Field:** Value'."""
|
|
match = re.match(r'-\s*\*\*([^:*]+)\*\*:\s*(.+)', line)
|
|
if match:
|
|
return match.group(1).strip(), match.group(2).strip()
|
|
return None, None
|
|
|
|
|
|
def determine_institution_type(section: Optional[str]) -> str:
|
|
"""Determine institution type from section header."""
|
|
if not section:
|
|
return "GRP.HER"
|
|
|
|
section_lower = section.lower()
|
|
if "libraries" in section_lower or "library" in section_lower:
|
|
return "GRP.HER.LIB"
|
|
elif "museums" in section_lower or "museum" in section_lower:
|
|
return "GRP.HER.MUS"
|
|
elif "archives" in section_lower or "archive" in section_lower:
|
|
return "GRP.HER.ARC"
|
|
elif "galleries" in section_lower or "cultural" in section_lower:
|
|
return "GRP.HER.GAL"
|
|
elif "digital" in section_lower:
|
|
return "GRP.HER.DIG"
|
|
return "GRP.HER"
|
|
|
|
|
|
def extract_url(text: str) -> Optional[str]:
|
|
"""Extract URL from text."""
|
|
match = re.search(r'https?://[^\s<>\"\)]+', text)
|
|
if match:
|
|
return match.group(0).rstrip('.,;:')
|
|
return None
|
|
|
|
|
|
def extract_email(text: str) -> Optional[str]:
|
|
"""Extract email from text."""
|
|
match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
|
|
if match:
|
|
return match.group(0)
|
|
return None
|
|
|
|
|
|
def extract_phone(text: str) -> Optional[str]:
|
|
"""Extract phone number from text."""
|
|
match = re.search(r'\+?[\d\s\-\(\)]{10,}', text)
|
|
if match:
|
|
return match.group(0).strip()
|
|
return None
|
|
|
|
|
|
def parse_markdown_to_institutions(
|
|
markdown_content: str,
|
|
conversation_id: str
|
|
) -> List[Institution]:
|
|
"""
|
|
Parse the markdown artifact to extract structured institution data.
|
|
"""
|
|
institutions = []
|
|
lines = markdown_content.split("\n")
|
|
|
|
current_institution = None
|
|
current_section = None
|
|
current_subsection = None
|
|
|
|
for i, line in enumerate(lines):
|
|
line_stripped = line.strip()
|
|
|
|
# Section headers (## Part I: Libraries, ## Part II: Museums, etc.)
|
|
if line_stripped.startswith("## Part") or line_stripped.startswith("## Current"):
|
|
current_section = line_stripped
|
|
current_subsection = None
|
|
continue
|
|
|
|
# Subsection headers (### National Libraries, ### Academic Libraries, etc.)
|
|
if line_stripped.startswith("### ") and not line_stripped.startswith("#### "):
|
|
current_subsection = line_stripped.lstrip("#").strip()
|
|
continue
|
|
|
|
# Institution headers (#### Name)
|
|
if line_stripped.startswith("#### "):
|
|
# Save previous institution
|
|
if current_institution:
|
|
institutions.append(current_institution)
|
|
|
|
name = line_stripped.lstrip("#").strip()
|
|
english_name, arabic_name = parse_arabic_name(name)
|
|
|
|
current_institution = Institution(
|
|
name=english_name,
|
|
name_arabic=arabic_name,
|
|
institution_type=determine_institution_type(current_section),
|
|
section=current_section,
|
|
source_line=i + 1,
|
|
)
|
|
continue
|
|
|
|
# Extract metadata from bullet points
|
|
if line_stripped.startswith("- **") and current_institution:
|
|
field, value = parse_metadata_line(line_stripped)
|
|
if field and value:
|
|
field_lower = field.lower()
|
|
|
|
# Map fields to institution attributes
|
|
if field_lower == "location":
|
|
current_institution.location = value
|
|
# Try to extract city
|
|
if "," in value:
|
|
parts = value.split(",")
|
|
current_institution.city = parts[-1].strip().split()[0]
|
|
elif field_lower == "website" or field_lower == "url":
|
|
current_institution.website = extract_url(value) or value
|
|
elif field_lower == "contact":
|
|
current_institution.email = extract_email(value)
|
|
current_institution.phone = extract_phone(value)
|
|
elif field_lower == "established" or field_lower == "founded":
|
|
current_institution.established = value
|
|
elif field_lower == "collections":
|
|
current_institution.collections = value
|
|
# Extract count if present
|
|
count_match = re.search(r'([\d,]+)\+?\s*(items|volumes|books|manuscripts|materials)', value, re.I)
|
|
if count_match:
|
|
current_institution.collections_count = count_match.group(1).replace(",", "")
|
|
elif field_lower == "languages":
|
|
current_institution.languages = value
|
|
elif field_lower == "special collections":
|
|
current_institution.special_collections = value
|
|
elif field_lower == "digital access" or field_lower == "digital repository":
|
|
current_institution.digital_access = value
|
|
elif field_lower == "services":
|
|
current_institution.services = value
|
|
elif field_lower == "hours" or field_lower == "operating hours":
|
|
current_institution.hours = value
|
|
elif field_lower == "current status" or field_lower == "status":
|
|
current_institution.status = value
|
|
elif field_lower == "international status":
|
|
current_institution.international_status = value
|
|
elif field_lower == "unesco status" or field_lower == "unesco":
|
|
current_institution.unesco_status = value
|
|
|
|
# Store raw claim
|
|
current_institution.raw_claims.append({
|
|
"field": field,
|
|
"value": value,
|
|
"line": i + 1,
|
|
})
|
|
|
|
# Don't forget the last institution
|
|
if current_institution:
|
|
institutions.append(current_institution)
|
|
|
|
return institutions
|
|
|
|
|
|
def generate_entity_claims(
|
|
institutions: List[Institution],
|
|
conversation_id: str
|
|
) -> List[Dict[str, Any]]:
|
|
"""Generate entity claims from institutions."""
|
|
claims = []
|
|
extraction_date = datetime.now(timezone.utc).isoformat()
|
|
|
|
for inst in institutions:
|
|
# Skip section headers that got parsed as institutions
|
|
if inst.name.lower() in ["national libraries", "major public libraries",
|
|
"academic libraries", "major national museums",
|
|
"regional museums", "major digital platforms",
|
|
"international collections", "jerusalem cultural centers"]:
|
|
continue
|
|
|
|
provenance = {
|
|
"source_type": "CONVERSATION_NLP",
|
|
"conversation_id": conversation_id,
|
|
"extraction_date": extraction_date,
|
|
"source_line": inst.source_line,
|
|
}
|
|
|
|
# Main institution claim
|
|
claims.append({
|
|
"hypernym": "GRP",
|
|
"hyponym": inst.institution_type,
|
|
"text": inst.name,
|
|
"text_arabic": inst.name_arabic,
|
|
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]",
|
|
"confidence": 0.95,
|
|
"class_uri": "glam:HeritageCustodian",
|
|
"notes": f"Heritage institution in {inst.section}",
|
|
"metadata": {
|
|
"location": inst.location,
|
|
"city": inst.city,
|
|
"country": inst.country,
|
|
"website": inst.website,
|
|
"email": inst.email,
|
|
"phone": inst.phone,
|
|
"established": inst.established,
|
|
"collections": inst.collections,
|
|
"collections_count": inst.collections_count,
|
|
"languages": inst.languages,
|
|
"special_collections": inst.special_collections,
|
|
"digital_access": inst.digital_access,
|
|
"services": inst.services,
|
|
"hours": inst.hours,
|
|
"status": inst.status,
|
|
"international_status": inst.international_status,
|
|
"unesco_status": inst.unesco_status,
|
|
},
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Location claim if present
|
|
if inst.city:
|
|
claims.append({
|
|
"hypernym": "TOP",
|
|
"hyponym": "TOP.SET",
|
|
"text": inst.city,
|
|
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/location",
|
|
"confidence": 0.90,
|
|
"class_uri": "schema:Place",
|
|
"notes": f"Location of {inst.name}",
|
|
"related_entity": inst.name,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# URL claim if present
|
|
if inst.website:
|
|
claims.append({
|
|
"hypernym": "ID",
|
|
"hyponym": "ID.URL",
|
|
"text": inst.website,
|
|
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/website",
|
|
"confidence": 0.95,
|
|
"class_uri": "schema:URL",
|
|
"notes": f"Website of {inst.name}",
|
|
"related_entity": inst.name,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Date claim if established date present
|
|
if inst.established:
|
|
claims.append({
|
|
"hypernym": "TMP",
|
|
"hyponym": "TMP.DAT",
|
|
"text": inst.established,
|
|
"xpath": f"/section['{inst.section}']/institution[@line={inst.source_line}]/established",
|
|
"confidence": 0.85,
|
|
"class_uri": "schema:Date",
|
|
"notes": f"Founding date of {inst.name}",
|
|
"related_entity": inst.name,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
return claims
|
|
|
|
|
|
def generate_triples(
|
|
institutions: List[Institution],
|
|
conversation_id: str
|
|
) -> List[Dict[str, Any]]:
|
|
"""Generate triple statements from institutions."""
|
|
triples = []
|
|
extraction_date = datetime.now(timezone.utc).isoformat()
|
|
|
|
for inst in institutions:
|
|
# Skip section headers
|
|
if inst.name.lower() in ["national libraries", "major public libraries",
|
|
"academic libraries", "major national museums",
|
|
"regional museums", "major digital platforms",
|
|
"international collections", "jerusalem cultural centers"]:
|
|
continue
|
|
|
|
provenance = {
|
|
"source_type": "CONVERSATION_NLP",
|
|
"conversation_id": conversation_id,
|
|
"extraction_date": extraction_date,
|
|
"source_line": inst.source_line,
|
|
}
|
|
|
|
# Subject (the institution)
|
|
subject = {
|
|
"entity_type": inst.institution_type,
|
|
"text": inst.name,
|
|
"uri": f"glam:institution/{inst.name.lower().replace(' ', '_').replace('(', '').replace(')', '')}",
|
|
}
|
|
|
|
# Triple: Institution -> located_in -> City
|
|
if inst.city:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.SPA.LOC",
|
|
"uri": "schema:location",
|
|
"label": "located in",
|
|
},
|
|
"object": {
|
|
"entity_type": "TOP.SET",
|
|
"text": inst.city,
|
|
"uri": f"geonames:{inst.city.lower()}",
|
|
},
|
|
"confidence": 0.90,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> located_in -> Country (Palestine)
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.SPA.LOC",
|
|
"uri": "schema:containedInPlace",
|
|
"label": "located in country",
|
|
},
|
|
"object": {
|
|
"entity_type": "TOP.CTY",
|
|
"text": "Palestine",
|
|
"uri": "wd:Q219060", # Wikidata ID for Palestine
|
|
},
|
|
"confidence": 0.95,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> instance_of -> Institution Type
|
|
type_labels = {
|
|
"GRP.HER.LIB": ("Library", "wd:Q7075"),
|
|
"GRP.HER.MUS": ("Museum", "wd:Q33506"),
|
|
"GRP.HER.ARC": ("Archive", "wd:Q166118"),
|
|
"GRP.HER.GAL": ("Gallery", "wd:Q1007870"),
|
|
"GRP.HER.DIG": ("Digital Platform", "wd:Q35127"),
|
|
"GRP.HER": ("Heritage Institution", "glam:HeritageCustodian"),
|
|
}
|
|
type_label, type_uri = type_labels.get(inst.institution_type, ("Heritage Institution", "glam:HeritageCustodian"))
|
|
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ONT.ISA",
|
|
"uri": "rdf:type",
|
|
"label": "instance of",
|
|
},
|
|
"object": {
|
|
"entity_type": "THG.CON",
|
|
"text": type_label,
|
|
"uri": type_uri,
|
|
},
|
|
"confidence": 0.95,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> has_website -> URL
|
|
if inst.website:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ID.URL",
|
|
"uri": "schema:url",
|
|
"label": "has website",
|
|
},
|
|
"object": {
|
|
"entity_type": "ID.URL",
|
|
"text": inst.website,
|
|
"literal_value": inst.website,
|
|
},
|
|
"confidence": 0.95,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> founded_in -> Year
|
|
if inst.established:
|
|
# Extract year
|
|
year_match = re.search(r'\b(1\d{3}|20\d{2})\b', inst.established)
|
|
if year_match:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.TMP.CRE",
|
|
"uri": "schema:foundingDate",
|
|
"label": "founded in",
|
|
},
|
|
"object": {
|
|
"entity_type": "TMP.DAT",
|
|
"text": year_match.group(1),
|
|
"literal_value": year_match.group(1),
|
|
},
|
|
"confidence": 0.85,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> has_collection_size -> Count
|
|
if inst.collections_count:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.QTY.CNT",
|
|
"uri": "schema:size",
|
|
"label": "has collection size",
|
|
},
|
|
"object": {
|
|
"entity_type": "QTY.CNT",
|
|
"text": f"{inst.collections_count} items",
|
|
"literal_value": int(inst.collections_count),
|
|
},
|
|
"confidence": 0.80,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> has_contact_email -> Email
|
|
if inst.email:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ID.CON",
|
|
"uri": "schema:email",
|
|
"label": "has email",
|
|
},
|
|
"object": {
|
|
"entity_type": "ID.CON",
|
|
"text": inst.email,
|
|
"literal_value": inst.email,
|
|
},
|
|
"confidence": 0.90,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> has_languages -> Languages
|
|
if inst.languages:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ATT.LNG",
|
|
"uri": "schema:inLanguage",
|
|
"label": "uses languages",
|
|
},
|
|
"object": {
|
|
"entity_type": "THG.LNG",
|
|
"text": inst.languages,
|
|
"literal_value": inst.languages,
|
|
},
|
|
"confidence": 0.85,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> has_status -> Status (for Gaza institutions)
|
|
if inst.status:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.STA.CUR",
|
|
"uri": "schema:status",
|
|
"label": "has current status",
|
|
},
|
|
"object": {
|
|
"entity_type": "THG.STA",
|
|
"text": inst.status,
|
|
"literal_value": inst.status,
|
|
},
|
|
"confidence": 0.90,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
# Triple: Institution -> member_of -> International Organization
|
|
if inst.international_status:
|
|
# Parse organizations from international status
|
|
if "IFLA" in inst.international_status:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ORG.MEM",
|
|
"uri": "org:memberOf",
|
|
"label": "member of",
|
|
},
|
|
"object": {
|
|
"entity_type": "GRP.ORG",
|
|
"text": "IFLA",
|
|
"uri": "wd:Q46550", # International Federation of Library Associations
|
|
},
|
|
"confidence": 0.90,
|
|
"provenance": provenance,
|
|
})
|
|
if "UNESCO" in inst.international_status or inst.unesco_status:
|
|
triples.append({
|
|
"subject": subject,
|
|
"predicate": {
|
|
"type": "REL.ORG.REC",
|
|
"uri": "schema:award",
|
|
"label": "recognized by",
|
|
},
|
|
"object": {
|
|
"entity_type": "GRP.GOV",
|
|
"text": "UNESCO",
|
|
"uri": "wd:Q7809",
|
|
},
|
|
"confidence": 0.85,
|
|
"provenance": provenance,
|
|
})
|
|
|
|
return triples
|
|
|
|
|
|
def main():
|
|
"""Main extraction workflow."""
|
|
logger.info("=" * 80)
|
|
logger.info("Palestinian GLAM Claims Extraction - Enhanced with Triples")
|
|
logger.info("=" * 80)
|
|
|
|
# Check input file exists
|
|
if not CONVERSATION_PATH.exists():
|
|
logger.error(f"Conversation file not found: {CONVERSATION_PATH}")
|
|
sys.exit(1)
|
|
|
|
logger.info(f"Reading conversation: {CONVERSATION_PATH.name}")
|
|
|
|
# Load conversation JSON
|
|
with open(CONVERSATION_PATH, "r", encoding="utf-8") as f:
|
|
conversation_data = json.load(f)
|
|
|
|
conversation_id = conversation_data.get("uuid", "unknown")
|
|
conversation_name = conversation_data.get("name", "Unknown")
|
|
|
|
logger.info(f"Conversation: {conversation_name}")
|
|
logger.info(f"UUID: {conversation_id}")
|
|
|
|
# Extract artifact content
|
|
logger.info("Extracting artifact content...")
|
|
artifact_content = extract_artifact_content(conversation_data)
|
|
|
|
if artifact_content:
|
|
logger.info(f"Found artifact with {len(artifact_content):,} characters")
|
|
else:
|
|
logger.error("No artifact content found!")
|
|
sys.exit(1)
|
|
|
|
# Parse markdown to institutions
|
|
logger.info("Parsing markdown to extract institutions...")
|
|
institutions = parse_markdown_to_institutions(artifact_content, conversation_id)
|
|
logger.info(f"Found {len(institutions)} institutions")
|
|
|
|
# Generate entity claims
|
|
logger.info("Generating entity claims...")
|
|
claims = generate_entity_claims(institutions, conversation_id)
|
|
logger.info(f"Generated {len(claims)} entity claims")
|
|
|
|
# Generate triples
|
|
logger.info("Generating triple statements...")
|
|
triples = generate_triples(institutions, conversation_id)
|
|
logger.info(f"Generated {len(triples)} triples")
|
|
|
|
# Count by type
|
|
claim_type_counts = {}
|
|
for claim in claims:
|
|
hyponym = claim.get("hyponym", "UNKNOWN")
|
|
claim_type_counts[hyponym] = claim_type_counts.get(hyponym, 0) + 1
|
|
|
|
triple_type_counts = {}
|
|
for triple in triples:
|
|
pred_type = triple["predicate"]["type"]
|
|
triple_type_counts[pred_type] = triple_type_counts.get(pred_type, 0) + 1
|
|
|
|
logger.info("Entity claims by type:")
|
|
for hyponym, count in sorted(claim_type_counts.items()):
|
|
logger.info(f" {hyponym}: {count}")
|
|
|
|
logger.info("Triples by predicate type:")
|
|
for pred_type, count in sorted(triple_type_counts.items()):
|
|
logger.info(f" {pred_type}: {count}")
|
|
|
|
# Prepare output
|
|
output = {
|
|
"metadata": {
|
|
"source_file": str(CONVERSATION_PATH),
|
|
"conversation_id": conversation_id,
|
|
"conversation_name": conversation_name,
|
|
"extraction_date": datetime.now(timezone.utc).isoformat(),
|
|
"institution_count": len([i for i in institutions if i.name.lower() not in
|
|
["national libraries", "major public libraries", "academic libraries",
|
|
"major national museums", "regional museums", "major digital platforms",
|
|
"international collections", "jerusalem cultural centers"]]),
|
|
"total_entity_claims": len(claims),
|
|
"total_triples": len(triples),
|
|
"claim_type_counts": claim_type_counts,
|
|
"triple_type_counts": triple_type_counts,
|
|
},
|
|
"institutions": [
|
|
{
|
|
"name": inst.name,
|
|
"name_arabic": inst.name_arabic,
|
|
"type": inst.institution_type,
|
|
"location": inst.location,
|
|
"city": inst.city,
|
|
"website": inst.website,
|
|
"email": inst.email,
|
|
"phone": inst.phone,
|
|
"established": inst.established,
|
|
"collections": inst.collections,
|
|
"collections_count": inst.collections_count,
|
|
"status": inst.status,
|
|
}
|
|
for inst in institutions
|
|
if inst.name.lower() not in ["national libraries", "major public libraries", "academic libraries",
|
|
"major national museums", "regional museums", "major digital platforms",
|
|
"international collections", "jerusalem cultural centers"]
|
|
],
|
|
"entity_claims": claims,
|
|
"triples": triples,
|
|
}
|
|
|
|
# Create output directory
|
|
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Write output
|
|
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
|
json.dump(output, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Output written to: {OUTPUT_PATH}")
|
|
logger.info("=" * 80)
|
|
|
|
# Show sample institutions
|
|
logger.info("Sample institutions with metadata:")
|
|
for inst in institutions[:3]:
|
|
if inst.name.lower() not in ["national libraries", "major public libraries"]:
|
|
logger.info(f" - {inst.name}")
|
|
if inst.website:
|
|
logger.info(f" Website: {inst.website}")
|
|
if inst.established:
|
|
logger.info(f" Established: {inst.established}")
|
|
if inst.collections_count:
|
|
logger.info(f" Collections: {inst.collections_count} items")
|
|
|
|
# Show sample triples
|
|
logger.info("Sample triples:")
|
|
for triple in triples[:5]:
|
|
subj = triple["subject"]["text"]
|
|
pred = triple["predicate"]["label"]
|
|
obj = triple["object"]["text"]
|
|
logger.info(f" ({subj}) --[{pred}]--> ({obj})")
|
|
|
|
return output
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|