76 lines
2.8 KiB
Python
76 lines
2.8 KiB
Python
"""
|
|
Integration example: Extract identifiers from conversation files.
|
|
|
|
This demonstrates how to combine ConversationParser and IdentifierExtractor
|
|
to process conversation JSON files and extract institution identifiers.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from glam_extractor.parsers.conversation import ConversationParser
|
|
from glam_extractor.extractors.identifiers import IdentifierExtractor
|
|
|
|
|
|
def extract_identifiers_from_conversation(conversation_path: str | Path) -> dict:
    """
    Parse a conversation JSON file and collect the identifiers found in it.

    Args:
        conversation_path: Location of the conversation JSON file.

    Returns:
        A dictionary with four entries:
            "conversation": metadata reported by the parser,
            "identifiers": the identifier objects returned by the extractor,
            "identifiers_by_scheme": identifier values grouped under their scheme,
            "total_identifiers": how many identifiers were extracted.
    """
    conversation_parser = ConversationParser()
    parsed = conversation_parser.parse_file(conversation_path)

    # Descriptive metadata about the parsed conversation.
    summary = conversation_parser.get_conversation_metadata(parsed)

    # Text the parser selects as institution context (per the original note,
    # this is drawn from assistant messages -- confirm against ConversationParser).
    context_text = conversation_parser.extract_institutions_context(parsed)

    # Run every extraction pass over that text, URLs included.
    found = IdentifierExtractor().extract_all(context_text, include_urls=True)

    # Bucket the raw values under their identifier scheme.
    grouped = {}
    for item in found:
        grouped.setdefault(item.identifier_scheme, []).append(item.identifier_value)

    return {
        "conversation": summary,
        "identifiers": found,
        "identifiers_by_scheme": grouped,
        "total_identifiers": len(found),
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run against the sample fixture, located two directories above this
    # file under tests/fixtures.
    sample_path = Path(__file__).parent.parent.joinpath(
        "tests", "fixtures", "sample_conversation.json"
    )

    if not sample_path.exists():
        # Nothing to process: tell the user where the fixture was expected.
        print(f"Sample file not found at: {sample_path}")
        print("Please run from the project root directory.")
    else:
        print("Processing sample conversation...")
        result = extract_identifiers_from_conversation(sample_path)

        # Summary header built from the conversation metadata.
        info = result["conversation"]
        print(f"\n=== Conversation: {info['conversation_name']} ===")
        print(f"Messages: {info['message_count']}")
        print(f"Total identifiers found: {result['total_identifiers']}\n")

        # One line per scheme with all of its values joined together.
        print("Identifiers by scheme:")
        for scheme, values in result["identifiers_by_scheme"].items():
            joined = ", ".join(values)
            print(f" {scheme}: {joined}")

        # Full listing, including the URL when the identifier carries one.
        print("\nDetailed identifiers:")
        for identifier in result["identifiers"]:
            print(f" - {identifier.identifier_scheme}: {identifier.identifier_value}")
            if identifier.identifier_url:
                print(f" URL: {identifier.identifier_url}")
|