glam/examples/extract_identifiers.py
2025-11-19 23:25:22 +01:00

76 lines
2.8 KiB
Python

"""
Integration example: Extract identifiers from conversation files.

This demonstrates how to combine ConversationParser and IdentifierExtractor
to process conversation JSON files and extract institution identifiers.
"""
from pathlib import Path
from glam_extractor.parsers.conversation import ConversationParser
from glam_extractor.extractors.identifiers import IdentifierExtractor
def extract_identifiers_from_conversation(conversation_path: str | Path) -> dict:
    """
    Parse a conversation JSON file and collect the identifiers found in it.

    Args:
        conversation_path: Location of the conversation JSON file.

    Returns:
        A dictionary with four keys:
            "conversation": metadata reported by the parser,
            "identifiers": the identifier objects returned by the extractor,
            "identifiers_by_scheme": identifier values grouped by scheme,
            "total_identifiers": the number of extracted identifiers.
    """
    conv_parser = ConversationParser()
    parsed = conv_parser.parse_file(conversation_path)
    conv_metadata = conv_parser.get_conversation_metadata(parsed)

    # Context text the parser selects for institution extraction
    # (the assistant messages, according to the original author's note).
    context_text = conv_parser.extract_institutions_context(parsed)

    found = IdentifierExtractor().extract_all(context_text, include_urls=True)

    # Bucket the identifier values under their scheme; the first value seen
    # for a scheme creates that scheme's list.
    grouped = {}
    for item in found:
        grouped.setdefault(item.identifier_scheme, []).append(item.identifier_value)

    summary = {
        "conversation": conv_metadata,
        "identifiers": found,
        "identifiers_by_scheme": grouped,
        "total_identifiers": len(found),
    }
    return summary
if __name__ == "__main__":
    # Demo run against the sample conversation fixture, located relative to
    # this file: <parent of this file's directory>/tests/fixtures/.
    project_dir = Path(__file__).parent.parent
    sample_path = project_dir / "tests" / "fixtures" / "sample_conversation.json"

    if not sample_path.exists():
        print(f"Sample file not found at: {sample_path}")
        print("Please run from the project root directory.")
    else:
        print("Processing sample conversation...")
        report = extract_identifiers_from_conversation(sample_path)
        conversation_info = report["conversation"]

        print(f"\n=== Conversation: {conversation_info['conversation_name']} ===")
        print(f"Messages: {conversation_info['message_count']}")
        print(f"Total identifiers found: {report['total_identifiers']}\n")

        # One line per scheme with all of its values, comma-separated.
        print("Identifiers by scheme:")
        for scheme_name, scheme_values in report["identifiers_by_scheme"].items():
            joined_values = ", ".join(scheme_values)
            print(f" {scheme_name}: {joined_values}")

        # One entry per identifier; the URL line appears only when a URL is set.
        print("\nDetailed identifiers:")
        for entry in report["identifiers"]:
            print(f" - {entry.identifier_scheme}: {entry.identifier_value}")
            if entry.identifier_url:
                print(f" URL: {entry.identifier_url}")