""" Integration example: Extract identifiers from conversation files. This demonstrates how to combine ConversationParser and IdentifierExtractor to process conversation JSON files and extract institution identifiers. """ from pathlib import Path from glam_extractor.parsers.conversation import ConversationParser from glam_extractor.extractors.identifiers import IdentifierExtractor def extract_identifiers_from_conversation(conversation_path: str | Path) -> dict: """ Extract all identifiers from a conversation JSON file. Args: conversation_path: Path to conversation JSON file Returns: Dictionary with conversation metadata and extracted identifiers """ # Parse conversation parser = ConversationParser() conversation = parser.parse_file(conversation_path) # Get conversation metadata metadata = parser.get_conversation_metadata(conversation) # Extract text from assistant messages (most likely to contain institution info) text = parser.extract_institutions_context(conversation) # Extract identifiers extractor = IdentifierExtractor() identifiers = extractor.extract_all(text, include_urls=True) # Group identifiers by scheme by_scheme = {} for identifier in identifiers: scheme = identifier.identifier_scheme if scheme not in by_scheme: by_scheme[scheme] = [] by_scheme[scheme].append(identifier.identifier_value) return { "conversation": metadata, "identifiers": identifiers, "identifiers_by_scheme": by_scheme, "total_identifiers": len(identifiers), } if __name__ == "__main__": # Example usage with the sample conversation sample_path = Path(__file__).parent.parent / "tests" / "fixtures" / "sample_conversation.json" if sample_path.exists(): print("Processing sample conversation...") result = extract_identifiers_from_conversation(sample_path) print(f"\n=== Conversation: {result['conversation']['conversation_name']} ===") print(f"Messages: {result['conversation']['message_count']}") print(f"Total identifiers found: {result['total_identifiers']}\n") print("Identifiers by scheme:") for scheme, values in result['identifiers_by_scheme'].items(): print(f" {scheme}: {', '.join(values)}") print("\nDetailed identifiers:") for identifier in result['identifiers']: print(f" - {identifier.identifier_scheme}: {identifier.identifier_value}") if identifier.identifier_url: print(f" URL: {identifier.identifier_url}") else: print(f"Sample file not found at: {sample_path}") print("Please run from the project root directory.")