|
import logging
import os
import uuid

import pandas as pd
import PyPDF2
import pytesseract
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    CSVLoader,
    Docx2txtLoader,
    TextLoader,
    UnstructuredExcelLoader,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
|
|
|
def get_text_chunks_langchain(text):
    """Split raw text into overlapping chunks wrapped as LangChain Documents."""
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = [Document(page_content=x) for x in text_splitter.split_text(text)]
    return docs


def get_text_img(path):
    """Extract text from an image file via Tesseract OCR."""
    return pytesseract.image_to_string(Image.open(path))
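# Example usage (hypothetical file path): OCR a scanned report and chunk the
# recognized text for indexing.
#   chunks = get_text_chunks_langchain(get_text_img("scan_page1.png"))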
|
|
|
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

base_path = os.path.join(os.getcwd(), "db")
# Read the OpenAI key from the environment rather than hard-coding a secret.
key_openai = os.environ["OPENAI_API_KEY"]
embedding = OpenAIEmbeddings(openai_api_key=key_openai)
|
|
|
# Load the locally stored DISC-MedLLM checkpoint; the generation config is
# pulled from the upstream Hugging Face repo.
model_path = "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")
|
|
|
data_llm_16k = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    openai_api_key=key_openai,
)

data_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=key_openai,
)

# "stuff" chain: concatenates all documents into a single prompt and
# summarizes them in one call, hence the 16k-context model.
chain = load_summarize_chain(data_llm_16k, chain_type="stuff")
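# Example usage (hypothetical): `docs` stands in for any list of LangChain
# Documents, e.g. the output of get_text() below.
#   summary = chain.run(docs)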
|
|
|
def get_qa_chain_answers_llm(question, email):
    """Answer a patient question using the user's Chroma index plus DISC-MedLLM."""
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    k_tops = db.similarity_search(question, k=3)

    # Prime the model, keep its reply in the history, then ask the actual
    # question together with the retrieved context.
    messages = [
        {
            "role": "user",
            "content": (
                "Hello, the patient will provide you with reports and other "
                "information about themselves. Answer the questions in simple "
                "language, based on the provided information and your own "
                "knowledge. Next you will talk with the patient."
            ),
        }
    ]
    response = model.chat(tokenizer, messages)
    messages.append({"role": "assistant", "content": response})
    messages.append({"role": "user", "content": f"Details: {k_tops} & User question: {question}"})
    return model.chat(tokenizer, messages)
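# Example usage (hypothetical email): assumes documents for this user were
# already indexed via upload_chroma() below.
#   answer = get_qa_chain_answers_llm("What does my blood report indicate?", "user@example.com")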
def get_text(doc, file_name):
    """Load a supported document type and return it as split LangChain Documents."""
    file_extension = os.path.splitext(file_name)[1].lower()

    if file_extension == ".pdf":
        # Extract raw text page by page, then wrap it into chunked Documents.
        pdf = PyPDF2.PdfReader(doc)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()
        return get_text_chunks_langchain(pdf_text)

    elif file_extension in [".md", ".txt"]:
        loader = TextLoader(doc)
    elif file_extension in [".docx", ".doc"]:
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    elif file_extension in [".xls", ".xlsx"]:
        try:
            # Convert the spreadsheet to CSV first; fall back to the
            # unstructured loader if pandas cannot parse it.
            df = pd.read_excel(doc, engine="openpyxl")
            csv_name = f"{uuid.uuid1()}.csv"
            df.to_csv(csv_name)
            loader = CSVLoader(file_path=csv_name)
        except Exception as e:
            logging.warning(e)
            loader = UnstructuredExcelLoader(doc, mode="elements")
    elif file_extension in [".png", ".jpg", ".jpeg"]:
        # OCR the image, then chunk the recognized text.
        return get_text_chunks_langchain(get_text_img(doc))
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(documents)
|
|
|
|
|
|
def upload_chroma(book_file, filename, email):
    """Index an uploaded file into the user's persistent Chroma collection."""
    pbar = tqdm(total=100)
    final_texts = get_text(book_file, filename)
    pbar.update(40)
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma.from_documents(final_texts, embedding, persist_directory=persist_directory)
    pbar.update(40)
    db.persist()
    logging.info(f"Successfully uploaded and indexed the file for: {title}")
    pbar.update(20)
    pbar.close()
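
# Minimal end-to-end sketch (hypothetical file path and email; assumes
# OPENAI_API_KEY is set and the DISC-MedLLM checkpoint is available locally).
if __name__ == "__main__":
    with open("sample_report.pdf", "rb") as f:
        upload_chroma(f, "sample_report.pdf", "user@example.com")
    print(get_qa_chain_answers_llm("Summarize my report.", "user@example.com"))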