# TextChatAPI / helpers.py
import time
from typing import Dict, List, Union

from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


def store_doc(fullText, chunkLen, embeddingModel):
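    """Split fullText into chunkLen-token chunks, embed them, and index them in FAISS.

    Returns (db, splits): the in-memory FAISS index and the list of Document
    chunks, each initialized with idx/highlight/similarity metadata defaults.
    """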
    # Token-aware splitting, so chunk_size is measured in gpt-4o tokens.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4o",
        chunk_size=chunkLen,
        chunk_overlap=0,
        is_separator_regex=False,
    )
    splits = text_splitter.create_documents([fullText])
    for i, split in enumerate(splits):
        split.metadata['idx'] = i
        split.metadata['highlight'] = False
        split.metadata['similarity_rank'] = None
        split.metadata['similarity'] = None
    print(f"SPLIT LENGTH: {len(splits)}")
    print("Embedding and storing documents in memory...")
    if len(splits) > 100:
        # Embed in batches of 100, pausing between batches (likely to stay
        # under embedding-API rate limits).
        db = FAISS.from_documents(splits[:100], embeddingModel)
        print(f"Docs 1 - 100 added to db. Total docs: {len(splits)}")
        for i in range(100, len(splits), 100):
            db.add_documents(splits[i:i + 100])
            print(f"Docs {i + 1} - {min(i + 100, len(splits))} added to db. Total docs: {len(splits)}")
            time.sleep(2)
    else:
        db = FAISS.from_documents(splits, embeddingModel)
        print(f"Docs 1 - {len(splits)} added to db. Total docs: {len(splits)}")
    return db, splits


def transform_documents(splits: List[Document]) -> List[Dict[str, Union[int, str, None]]]:
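    """Collapse splits into a render-ready list for the document viewer.

    Consecutive non-highlighted chunks are merged into a single block, while
    each highlighted chunk is emitted on its own with its similarity rank.
    """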
    result = []
    combined_non_highlight_content = ""
    for doc in splits:
        highlight_idx = doc.metadata['similarity_rank'] if doc.metadata['highlight'] else None
        if highlight_idx is not None:
            # Flush any accumulated non-highlighted text before the highlight.
            if combined_non_highlight_content:
                result.append(
                    {"highlight_idx": None, "page_content": combined_non_highlight_content.strip()}
                )
                combined_non_highlight_content = ""
            result.append({"highlight_idx": highlight_idx, "page_content": doc.page_content.strip()})
        else:
            # Accumulate non-highlighted chunks so adjacent ones render as one block.
            combined_non_highlight_content += doc.page_content + " "
    if combined_non_highlight_content:
        result.append(
            {"highlight_idx": None, "page_content": combined_non_highlight_content.strip()}
        )
    return result


def get_relevant_docs(splits, userQuery, db, topK):
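    """Retrieve the topK most similar chunks and flag them as highlights.

    Note: the Documents returned by the FAISS search share metadata dicts with
    the entries in splits, so setting 'highlight' here is what makes
    transform_documents(splits) pick these chunks up.
    """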
print("Searching for relevant documents...")
docs = db.similarity_search_with_relevance_scores(query=userQuery, k=topK)
highlights = []
for i, doc in enumerate(docs):
doc[0].metadata['similarity'] = doc[1]
doc[0].metadata['similarity_rank'] = i
doc[0].metadata['highlight'] = True
highlights.append({
'page_content': doc[0].page_content,
'similarity': doc[1],
'similarity_rank': i,
'highlight': True})
docviewer_text = transform_documents(splits)
return highlights, docviewer_text
def get_answer(highlights, question, model_pipe):
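    """Compose a grounded prompt from the highlights and return the model's answer."""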
instructions = """
# INSTRUCTIONS\n\nYou are a helpful assistant that reviews relevant sections of clinical notes to answer user questions. First, review the text provided under the # Highlighted Sections in the user message to familiarize yourself with the content. Then, read the user question under the # Question section and think step by step through what information you need to answer their question. Next, review the provided Highlighted Sections for context again and find the relevant information for the user's question. Finally, synthesize that relevant information to answer the user's question. Keep your answer fully grounded in the facts from the Highlight Sections and reply at a 10th grade reading level. Keep your answer as concise as possible and only use relevant information from the provided documents. If the Highlighted Sections do not contain the necessary facts to answer the user's question, please respond with 'I didn't find the necessary information. Please try rephrasing your question or providing additional text.' Provide your summary in markdown format but do not use H1 (#) or H2 (##) headers.
"""
documents = "# Highlighted Sections\n\n"
for i, highlight in enumerate(highlights):
documents += f"## Highlight {i+1}\n\n"
documents += highlight['page_content'] + "\n\n"
question = "# Question\n\n" + question + "\n\n"
reminder = "REMEMBER: Please keep your answer concise and fully grounded in the facts from the provided Highlighted Sections. Do not provide your own opinion or add information that is not supported by the Highlighted Sections. Provide your answer in markdown format but do not use H1 (#) or H2 (##) headers."
messages = [
{"role": "system", "content": instructions},
{"role": "user", "content": documents + question + reminder}
]
response = model_pipe(messages, max_length=4096, temperature=0.7, num_return_sequences=1)
return response[0]['generated_text'][-1]['content']
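

# Illustrative usage -- a minimal sketch, not part of the original app wiring.
# The embedding model and chat pipeline below are assumptions (any LangChain
# embeddings object and any transformers chat-capable text-generation pipeline
# should work); swap in whatever the API actually constructs.
if __name__ == "__main__":
    from langchain_openai import OpenAIEmbeddings
    from transformers import pipeline

    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")  # assumed model
    model_pipe = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")  # assumed model

    note_text = "Chief complaint: chest pain. History: ..."  # placeholder clinical note
    db, splits = store_doc(note_text, chunkLen=256, embeddingModel=embedding_model)
    query = "What is the chief complaint?"
    highlights, viewer_blocks = get_relevant_docs(splits, query, db, topK=3)
    print(get_answer(highlights, query, model_pipe))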