Krishnachaitanya2004 committed on
Commit
df59db4
1 Parent(s): e4be66c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -107
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from langchain.vectorstores import Chroma
2
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
  from transformers import pipeline
@@ -5,111 +9,76 @@ import torch
5
  from langchain.llms import HuggingFacePipeline
6
  from langchain.embeddings import SentenceTransformerEmbeddings
7
  from langchain.chains import RetrievalQA
8
- from langchain_community.document_loaders import UnstructuredFileLoader
9
- from langchain.text_splitter import CharacterTextSplitter
10
  import streamlit as st
11
- import os
12
-
13
-
14
- def main_process(uploaded_file):
15
- file_name = uploaded_file.name
16
-
17
- # Create a temporary directory
18
- temp_dir = "temp"
19
- os.makedirs(temp_dir, exist_ok=True)
20
-
21
- # Save the uploaded file to the temporary directory
22
- temp_path = os.path.join(temp_dir, file_name)
23
- with open(temp_path, "wb") as temp_file:
24
- temp_file.write(uploaded_file.getvalue())
25
-
26
- # Process the uploaded file
27
- loader = UnstructuredFileLoader(temp_path)
28
- documents = loader.load()
29
- for document in documents:
30
- print(document.page_content)
31
- # We can't load the whole pdf into the program so we split the pdf into chunks
32
- # We use CharacterTextSplitter to split the pdf into chunks
33
- # Each chunk is up to 1000 characters long and the chunks overlap by 400 characters (You can change this according to your needs)
34
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
35
- texts = text_splitter.split_documents(documents)
36
-
37
- # We use SentenceTransformerEmbeddings to embed the text chunks
38
- # Embeddings are used to find the similarity between the query and the text chunks
39
- # We use multi-qa-mpnet-base-dot-v1 model to embed the text chunks
40
- # We need to save the embeddings to disk so we use persist_directory to save the embeddings to disk
41
- embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
42
- persist_directory = "chroma/"
43
-
44
- # Chroma is used to store the embeddings
45
- # We use from_documents to store the embeddings
46
- # We use the persist_directory to save the embeddings to disk
47
- db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
48
-
49
- # To save and load the saved vector db (if needed in the future)
50
- # Persist the database to disk
51
- # db.persist()
52
- # db = Chroma(persist_directory="db", embedding_function=embeddings)
53
-
54
- checkpoint = "MBZUAI/LaMini-Flan-T5-783M"
55
-
56
- # Initialize the tokenizer and base model for text generation
57
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
58
- base_model = AutoModelForSeq2SeqLM.from_pretrained(
59
- checkpoint,
60
- device_map="auto",
61
- torch_dtype=torch.float32
62
- )
63
-
64
- pipe = pipeline(
65
- 'text2text-generation',
66
- model = base_model,
67
- tokenizer = tokenizer,
68
- max_length = 512,
69
- do_sample = True,
70
- temperature = 0.3,
71
- top_p= 0.95
72
- )
73
-
74
- # Initialize a local language model pipeline
75
- local_llm = HuggingFacePipeline(pipeline=pipe)
76
- # Create a RetrievalQA chain
77
- qa_chain = RetrievalQA.from_chain_type(
78
- llm=local_llm,
79
- chain_type='stuff',
80
- retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
81
- return_source_documents=True,
82
- )
83
- return qa_chain
84
-
85
- st.title("Document Chatbot")
86
- st.write("Upload a pdf file to get started")
87
-
88
- uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
89
-
90
- if uploaded_file is not None:
91
- qa_chain = main_process(uploaded_file)
92
- if "messages" not in st.session_state:
93
- st.session_state.messages = []
94
-
95
- # Display chat messages from history on app rerun
96
- for message in st.session_state.messages:
97
- with st.chat_message(message["role"]):
98
- st.markdown(message["content"])
99
-
100
- # Accept user input
101
- if prompt := st.chat_input("What is up?"):
102
- # Add user message to chat history
103
- st.session_state.messages.append({"role": "user", "content": prompt})
104
- # Display user message in chat message container
105
- with st.chat_message("user"):
106
- st.markdown(prompt)
107
- # Get response from chatbot
108
- with st.chat_message("assitant"):
109
- response = qa_chain(prompt)
110
- st.markdown(response)
111
- st.session_state.messages.append({"role": "assistant", "content": response})
112
-
113
-
114
-
115
-
 
1
+ # !pip install accelerate
2
+ # !pip install chromadb
3
+ # !pip install "unstructured[all-docs]"
4
+
5
  from langchain.vectorstores import Chroma
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
  from transformers import pipeline
 
9
  from langchain.llms import HuggingFacePipeline
10
  from langchain.embeddings import SentenceTransformerEmbeddings
11
  from langchain.chains import RetrievalQA
 
 
12
  import streamlit as st
13
+
14
+
15
+ embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
16
+ persist_directory = "chroma"
17
+
18
+ # Persist the database to disk
19
+ db = Chroma(persist_directory,embeddings)
20
+
21
+ # To save and load the saved vector db (if needed in the future)
22
+ # Persist the database to disk
23
+ # db.persist()
24
+ # db = Chroma(persist_directory="db", embedding_function=embeddings)
25
+
26
+ checkpoint = "MBZUAI/LaMini-Flan-T5-783M"
27
+
28
+ # Initialize the tokenizer and base model for text generation
29
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
30
+ base_model = AutoModelForSeq2SeqLM.from_pretrained(
31
+ checkpoint,
32
+ device_map="auto",
33
+ torch_dtype=torch.float32
34
+ )
35
+
36
+
37
+
38
+ pipe = pipeline(
39
+ 'text2text-generation',
40
+ model = base_model,
41
+ tokenizer = tokenizer,
42
+ max_length = 512,
43
+ do_sample = True,
44
+ temperature = 0.3,
45
+ top_p= 0.95
46
+ )
47
+
48
+
49
+ # Initialize a local language model pipeline
50
+ local_llm = HuggingFacePipeline(pipeline=pipe)
51
+ # Create a RetrievalQA chain
52
+ qa_chain = RetrievalQA.from_chain_type(
53
+ llm=local_llm,
54
+ chain_type='stuff',
55
+ retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
56
+ return_source_documents=True,
57
+ )
58
+
59
+
60
+ st.title("Lawyer Bot")
61
+ st.subheader("A chatbot to answer your legal questions trained on IPC")
62
+ if "messages" not in st.session_state:
63
+ st.session_state.messages = []
64
+
65
+ # Display chat messages from history on app rerun
66
+ for message in st.session_state.messages:
67
+ with st.chat_message(message["role"]):
68
+ st.markdown(message["content"])
69
+
70
+ # Accept user input
71
+ if prompt := st.chat_input("What is up?"):
72
+ # Add user message to chat history
73
+ st.session_state.messages.append({"role": "user", "content": prompt})
74
+ # Display user message in chat message container
75
+ with st.chat_message("user"):
76
+ st.markdown(prompt)
77
+ # Get response from chatbot
78
+ with st.chat_message("assistant"):
79
+ response = qa_chain(prompt)
80
+ print(response['result'])
81
+ st.markdown(response["result"])
82
+ st.session_state.messages.append({"role": "assistant", "content": response})
83
+
84
+