dtrejopizzo committed on
Commit eec05aa · 1 Parent(s): 9d16735

Create app.py

Files changed (1)
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
# fixing unicode error in google colab
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# import dependencies
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

import os
import gradio as gr

import chromadb
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# specify huggingface model name
model_name = "anakin87/zephyr-7b-alpha-sharded"

# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    Load the model in 4-bit NF4 quantization using bitsandbytes.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
    return model

# function for initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Initialize the tokenizer with the specified model_name.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.bos_token_id = 1  # set beginning-of-sentence token id
    return tokenizer

# load model
model = load_quantized_model(model_name)

# initialize tokenizer
tokenizer = initialize_tokenizer(model_name)

# specify stop token ids
stop_token_ids = [0]

# specify folder path
folder_path = '/docs/'

# load pdf files
loader = PyPDFDirectoryLoader(folder_path)
documents = loader.load()

# split the documents into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # change chunk_size and chunk_overlap as needed
all_splits = text_splitter.split_documents(documents)

# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

# embed document chunks and persist them in a chroma vector store
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

# specify the retriever
retriever = vectordb.as_retriever()

# build huggingface pipeline for using zephyr-7b-alpha
# (named pipe so it does not shadow the imported pipeline factory)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# specify the llm
llm = HuggingFacePipeline(pipeline=pipe)

# build conversational retrieval chain with memory (rag) using langchain
def create_conversation(query: str, chat_history: list) -> tuple:
    try:
        memory = ConversationBufferMemory(
            memory_key='chat_history',
            return_messages=False
        )
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory,
            get_chat_history=lambda h: h,
        )

        result = qa_chain({'question': query, 'chat_history': chat_history})
        chat_history.append((query, result['answer']))
        return '', chat_history

    except Exception as e:
        # append the error text so the UI shows it instead of crashing
        chat_history.append((query, str(e)))
        return '', chat_history

# build gradio ui
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label='Chat with your data')
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    msg.submit(create_conversation, [msg, chatbot], [msg, chatbot])

demo.launch()
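
For quick local debugging, the retrieval chain can be exercised without launching the Gradio UI by calling create_conversation directly. A minimal sketch, assuming the code above has already run (model loaded, PDFs under /docs/ indexed); the questions are placeholders:

# hypothetical smoke test for the RAG chain; run after the setup above
history = []

# first turn: empty history, as the Gradio callback starts with
_, history = create_conversation("What topics do these documents cover?", history)
print(history[-1][1])  # the model's answer

# second turn: pass the accumulated history back in, as msg.submit does
_, history = create_conversation("Summarize the first topic in two sentences.", history)
print(history[-1][1])

The empty string returned alongside the history is what clears the Textbox in the UI, while the updated history list is rendered by the Chatbot component.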
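
Because the vector store is created with persist_directory="chroma_db", the embedding step does not have to be repeated on every restart once the index exists on disk. A possible variant, shown only as an illustrative sketch and not part of this commit (the directory check is an assumption):

# reuse the persisted chroma index if it already exists, otherwise build it
import os
from langchain.vectorstores import Chroma

if os.path.isdir("chroma_db"):
    vectordb = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")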