Sathyapriyaa committed on
Commit
58cb744
1 Parent(s): 6185eca

Upload 3 files

Browse files
Files changed (3) hide show
  1. Production-Table - Sheet1 (2).pdf +0 -0
  2. app.py +142 -0
  3. requirements.txt +9 -0
Production-Table - Sheet1 (2).pdf ADDED
Binary file (32.5 kB). View file
 
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import together
from langchain.llms import Together

# Read the Together API key from the environment rather than committing it to
# the repository. A key that has been published in source control must be
# treated as leaked and rotated.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
if not TOGETHER_API_KEY:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before starting the app."
    )
together.api_key = TOGETHER_API_KEY

# Single source of truth for the hosted model used for both start-up and inference.
MODEL_NAME = "togethercomputer/llama-2-7b-chat"

# List available models and descriptions.
models = together.Models.list()

together.Models.start(MODEL_NAME)

llm = Together(
    model=MODEL_NAME,
    temperature=0.7,
    max_tokens=128,
    top_k=1,
    together_api_key=TOGETHER_API_KEY,
)
21
+
22
from pathlib import Path

import torch
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# The PDF is uploaded alongside this script, so look for it there first and
# only fall back to the original filesystem-root location if it is missing.
PDF_NAME = "Production-Table - Sheet1 (2).pdf"
pdf_path = Path(__file__).resolve().parent / PDF_NAME
if not pdf_path.exists():
    pdf_path = Path("/") / PDF_NAME

loader = PyPDFLoader(str(pdf_path))
documents = loader.load()

# Split the loaded pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

model_name = "BAAI/bge-base-en"
encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity

# Use the GPU when one is present; otherwise run the embedding model on CPU
# instead of failing at start-up on hosts without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device},
    encode_kwargs=encode_kwargs,
)

# Embed and store the texts.
# Supplying a persist_directory will store the embeddings on disk.
# NOTE(review): this indexes the documents on every start; confirm whether the
# persisted collection accumulates duplicate chunks across restarts.
persist_directory = "db"

# Embedding model used to build the index.
embedding = model_norm

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

retriever = vectordb.as_retriever(search_kwargs={"k": 5})
61
## Default LLaMA-2 prompt style
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant of a production company. You should honestly answer the user's query using the knowledge of the company's production documents uploaded.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


def get_prompt(instruction: str, new_system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap an instruction and a system prompt in the Llama-2 chat markers.

    Args:
        instruction: The user-turn text; it may contain template placeholders.
        new_system_prompt: System prompt placed between the ``<<SYS>>`` markers.

    Returns:
        ``[INST]<<SYS>>\\n{system}\\n<</SYS>>\\n\\n{instruction}[/INST]`` as one string.
    """
    system_section = B_SYS + new_system_prompt + E_SYS
    return B_INST + system_section + instruction + E_INST
73
+
74
from langchain.prompts import PromptTemplate
from langchain.schema import prompt

sys_prompt = """You are a helpful, respectful and honest assistant of a production company. You should honestly answer the user's query using the knowledge of the company's production documents uploaded.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

# The original template contained literal "/n" sequences; real newlines are
# what separates the retrieved context from the question.
instruction = """CONTEXT:\n\n {context}\n

Question: {question}"""

prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Hand the Llama-2 formatted prompt to the "stuff" chain. Without this the
# name `chain_type_kwargs` is undefined and the script fails with NameError.
chain_type_kwargs = {"prompt": llama_prompt}

# Create the chain to answer questions.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)
96
+ ## Cite sources
97
+
98
import textwrap


def wrap_text_preserve_newlines(text: str, width: int = 110) -> str:
    """Wrap each line of *text* to *width* columns, keeping existing newlines.

    Args:
        text: The text to wrap; it is split on ``"\\n"`` so blank lines survive.
        width: Maximum line length passed to ``textwrap.fill``.

    Returns:
        The individually wrapped lines joined back together with ``"\\n"``.
    """
    # Wrapping line by line keeps the author's paragraph breaks; wrapping the
    # whole string at once would fold them into a single paragraph.
    return "\n".join(textwrap.fill(line, width=width) for line in text.split("\n"))
111
+
112
def process_llm_response(llm_response):
    """Print the wrapped answer followed by the source of each cited document.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.
    """
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])
117
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Clear the textbox and append the new user turn with no reply yet."""
        return "", history + [[user_message, None]]

    def bot(history):
        """Answer the latest user turn with the QA chain and store the reply."""
        question = history[-1][0]
        print("Question: ", question)
        bot_message = wrap_text_preserve_newlines(qa_chain(question)['result'])
        print("Response: ", bot_message)
        history[-1][1] = bot_message
        return history

    # Record the user's message first, then let the bot fill in the answer.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ huggingface_hub
3
+ tiktoken
4
+ chromadb
5
+ PyPDF2
6
+ pypdf
7
+ sentence_transformers
8
+ together
9
+ gradio