anishde committed on
Commit
ec86e95
·
verified ·
1 Parent(s): 5deb75f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -0
app.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ from langchain.document_loaders import PDFMinerLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain import HuggingFaceHub
7
+ from langchain.chains.summarize import load_summarize_chain
8
+ from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain
9
+ from langchain.prompts import PromptTemplate
10
+ import os
11
+ import gradio as gr
12
+ import shutil
13
+ import re
14
+ import tempfile
15
+ import cache
16
+ from pathlib import Path
17
+
18
# Resolve the Hugging Face API token. Inside Colab it is read from the
# notebook secret named 'api'; anywhere else `google.colab` cannot be
# imported, so fall back to the process environment instead of crashing.
try:
    from google.colab import userdata
except ImportError:
    api = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
else:
    api = userdata.get('api')
api_token = api

# Export the token under the correctly spelled variable name (see the
# LangChain HuggingFaceHub documentation). os.environ only accepts strings,
# so the export is skipped when no token was found.
if api_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

temp_dir = "/content/sample_data"
25
+
26
def data_ingestion(file_path):
    """Load a document from disk and split it into chunks for summarization.

    The loader is picked from the file extension (case-insensitive):
    ``.docx``/``.doc`` use ``Docx2txtLoader``, ``.txt`` uses ``TextLoader``
    and every other extension, including ``.pdf``, uses ``PDFMinerLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The value of ``RecursiveCharacterTextSplitter.split_documents`` for
        the loaded document, using 500-character chunks with no overlap.

    Raises:
        ValueError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    file_ext = Path(file_path).suffix.lower()
    if file_ext in {".docx", ".doc"}:
        loader = Docx2txtLoader(file_path)
    elif file_ext == ".txt":
        loader = TextLoader(file_path)
    else:
        # PDF and unknown extensions keep going through the PDF loader.
        loader = PDFMinerLoader(file_path)
    document = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    return text_splitter.split_documents(document)
63
+
64
+ # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
65
+ # chunk_size=2000, chunk_overlap=0
66
+ # )
67
+ # split_docs = text_splitter.split_documents(document)
68
+
69
+ # documents=split_text_into_batches(str(document),400)
70
+ # len(documents)
71
+ # documents[0]
72
+ # #
73
+ # from langchain.text_splitter import CharacterTextSplitter
74
+ # text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
75
+ # documents = text_splitter.split_documents(document)
76
+ # Embeddings
77
+
78
+ # from langchain.chains.question_answering import load_qa_chain
79
+
80
+ ########## CHAIN 1 norm text
81
+
82
def chain1():
    """Build the summarization chain for general text.

    ``chain_function`` selects this chain for the "Study Material" option.

    Returns:
        The chain object returned by ``load_summarize_chain`` configured with
        ``chain_type="refine"``, the prompt below as ``question_prompt``,
        ``"input_documents"`` as input key and ``"output_text"`` as output key.
    """
    prompt = PromptTemplate.from_template(
        "Write a concise summary of the following:\n"
        "{text}\n"
        "SUMMARY:"
    )
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )
    # No refine_prompt is passed, so load_summarize_chain falls back to
    # whatever default refine prompt it defines.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
112
+
113
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
114
+
115
+ ########## CHAIN 2 research paper
116
+
117
def chain2():
    """Build the summarization chain whose prompt targets research-paper text.

    NOTE(review): ``chain_function`` selects this chain for the
    "Legal Document" option although the prompt says "Research Paper" -
    confirm which of the two is intended.

    Returns:
        The chain object returned by ``load_summarize_chain`` configured with
        ``chain_type="refine"``, the prompt below as ``question_prompt``,
        ``"input_documents"`` as input key and ``"output_text"`` as output key.
    """
    prompt = PromptTemplate.from_template(
        "This is a Research Paper,your job is to summarise the text portion "
        "without any symbols or special characters, skip the mathematical "
        "equations for now:\n"
        "{text}\n"
        "SUMMARY:"
    )
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )
    # No refine_prompt is passed, so load_summarize_chain falls back to
    # whatever default refine prompt it defines.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
147
+
148
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
149
+
150
+ ########## CHAIN 3 arxiv_paper_1
151
+
152
def chain3():
    """Build the summarization chain for sections of an arXiv paper.

    ``chain_function`` selects this chain for the "Research Paper" option.

    Returns:
        The chain object returned by ``load_summarize_chain`` configured with
        ``chain_type="refine"``, the prompt below as ``question_prompt``,
        ``"input_documents"`` as input key and ``"output_text"`` as output key.
    """
    prompt = PromptTemplate.from_template(
        "You are being given a markdown document with headers, this is part "
        "of a larger arxiv paper. Your job is to write a summary of the "
        "document.\n"
        "here is the content of the section:\n"
        '"{text}"\n'
        "\n"
        "SUMMARY:"
    )
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )
    # No refine_prompt is passed, so load_summarize_chain falls back to
    # whatever default refine prompt it defines.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
185
+ # result = chain({"input_documents":split_docs}, return_only_outputs=True)
186
+ # chain.run(document)
187
+ # print(result["output_text"])
188
+
189
def chain_function(checkbox_values):
    """Pick the summarization chain that matches the selected document type.

    When several types are ticked the first match wins, in the order
    "Research Paper", "Legal Document", "Study Material".

    Args:
        checkbox_values: Collection of selected labels. ``None`` is treated
            as "nothing selected".

    Returns:
        The chain built by ``chain3``, ``chain2`` or ``chain1``, or the
        string "Please select a document type to run." when no known label
        is selected.
    """
    selected = checkbox_values or ()

    if "Research Paper" in selected:
        return chain3()
    if "Legal Document" in selected:
        return chain2()
    if "Study Material" in selected:
        return chain1()
    return "Please select a document type to run."
200
+
201
def result(chain, split_docs):
    """Summarize every chunk separately and concatenate the summaries.

    Args:
        chain: Callable invoked as ``chain({"input_documents": [doc]})`` that
            returns a mapping containing ``"output_text"``. A non-callable
            value, such as the message string ``chain_function`` returns when
            no document type is selected, is returned unchanged instead of
            being called.
        split_docs: Iterable of document chunks.

    Returns:
        The per-chunk ``"output_text"`` values joined in order with no
        separator, or ``chain`` itself when it is not callable.
    """
    if not callable(chain):
        return chain

    return "".join(
        chain({"input_documents": [doc]})["output_text"] for doc in split_docs
    )
212
+
213
# HTML heading shown at the top of the page; it is passed to gr.Markdown.
title = (
    '<p style="font-family:Century Gothic; text-align:center; '
    'font-size: 100px">S I M P L I F Y</p>'
)
214
+
215
+ # description = r"""<p style="font-family: Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>
216
+ # """
217
+
218
+ # article = r"""
219
+ # If PhotoMaker is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/PhotoMaker' target='_blank'>Github Repo</a>. Thanks!
220
+ # [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/PhotoMaker?style=social)](https://github.com/TencentARC/PhotoMaker)
221
+ # ---
222
+ # 📝 **Citation**
223
+ # <br>
224
+ # If our work is useful for your research, please consider citing:
225
+ # ```bibtex
226
+ # @article{li2023photomaker,
227
+ # title={PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding},
228
+ # author={Li, Zhen and Cao, Mingdeng and Wang, Xintao and Qi, Zhongang and Cheng, Ming-Ming and Shan, Ying},
229
+ # booktitle={arXiv preprint arxiv:2312.04461},
230
+ # year={2023}
231
+ # }
232
+ # ```
233
+ # 📋 **License**
234
+ # <br>
235
+ # Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/TencentARC/PhotoMaker/blob/main/LICENSE) for details.
236
+ # 📧 **Contact**
237
+ # <br>
238
+ # If you have any questions, please feel free to reach me out at <b>zhenli1031@gmail.com</b>.
239
+ # """
240
+
241
+ # tips = r"""
242
+ # ### Usage tips of PhotoMaker
243
+ # 1. Upload more photos of the person to be customized to **improve ID fidelty**. If the input is Asian face(s), maybe consider adding 'asian' before the class word, e.g., `asian woman img`
244
+ # 2. When stylizing, does the generated face look too realistic? Adjust the **Style strength** to 30-50, the larger the number, the less ID fidelty, but the stylization ability will be better.
245
+ # 3. If you want to generate realistic photos, you could try switching to our other gradio application [PhotoMaker](https://huggingface.co/spaces/TencentARC/PhotoMaker).
246
+ # 4. For **faster** speed, reduce the number of generated images and sampling steps. However, please note that reducing the sampling steps may compromise the ID fidelity.
247
+ # """
248
+
249
+ # def process_file(file_obj):
250
+ # destination_path = "/content/sample_data" # Replace with your desired path
251
+ # shutil.copy(file_obj, destination_path) # Save file to specified path
252
+ # return os.path.join(destination_path, file_obj)
253
def process_file(list_file_obj):
    """Return the filesystem path of the first uploaded file.

    Only the first usable entry is used; any further uploads are ignored.

    Args:
        list_file_obj: Sequence of uploaded files. Each entry is either an
            object exposing the path as ``.name`` or a plain path string;
            ``None`` entries are skipped.

    Returns:
        The path of the first non-``None`` entry.

    Raises:
        ValueError: If nothing was uploaded.
    """
    for file_obj in list_file_obj or ():
        if file_obj is not None:
            # Plain strings have no ``name`` attribute and are already paths.
            return getattr(file_obj, "name", file_obj)
    raise ValueError("No file was uploaded.")
260
+
261
def inference(checkbox_values, uploaded_file):
    """Summarize the uploaded file with the chain for the chosen document type.

    Args:
        checkbox_values: Selected document-type labels from the UI.
        uploaded_file: Uploaded files from the UI; only the first is used.

    Returns:
        The concatenated summary text, or a short instruction message when
        no file was uploaded or no known document type was selected.
    """
    if not uploaded_file:
        return "Please upload a file to summarize."

    # Resolve the chain first so a missing document type is reported before
    # any loading or splitting work is done.
    chain = chain_function(checkbox_values)
    if not callable(chain):
        return chain

    file_path = process_file(uploaded_file)
    split_docs = data_ingestion(file_path)
    return result(chain, split_docs)
267
+
268
# Gradio page: document-type selector, file upload and submit button in the
# left column, summary text box in the right column.
# NOTE(review): the nesting of the button and columns was reconstructed from
# flattened text - confirm against the intended layout.
with gr.Blocks(theme="monochrome") as demo:
    gr.Markdown(title)

    with gr.Row():
        with gr.Column():
            checkbox_values = gr.CheckboxGroup(
                ["Research Paper", "Legal Document", "Study Material"],
                label="Choose the document type",
            )
            uploaded_file = gr.Files(
                height=100,
                file_count="multiple",
                # Extensions need a leading dot; bare words are reserved for
                # Gradio's file-type categories such as "text".
                file_types=["text", ".docx", ".pdf"],
                interactive=True,
                label="Upload your File.",
            )
            btn = gr.Button("Submit")
        with gr.Column():
            txt = gr.Textbox(show_label=False, scale=2)

    # Event listeners must be registered inside the Blocks context.
    btn.click(
        fn=inference,
        inputs=[checkbox_values, uploaded_file],
        outputs=[txt],
        queue=False,
    )

demo.launch(debug=True)
291
+