Belemort committed on
Commit
3f9c489
·
verified ·
1 Parent(s): f23ad63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +251 -15
app.py CHANGED
@@ -5,13 +5,22 @@ import concurrent.futures
5
  import json
6
  import os
7
  import arxiv
8
- from docx import Document
9
  from PIL import Image
10
  import io
11
  import base64
 
 
 
 
 
 
12
 
 
 
13
 
14
-
 
 
15
 
16
  # Set environment variables for Tavily API
17
  os.environ["TAVILY_API_KEY"] = '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'
@@ -20,6 +29,8 @@ os.environ["TAVILY_API_KEY"] = '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'
20
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
21
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
22
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
 
 
23
 
24
  # Function to encode images in base64
25
  def encode_image_bytes(image_bytes):
@@ -79,6 +90,73 @@ def extract_key_topics(content, images=[]):
79
  )
80
  return response.choices[0].message.content
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
83
  articles_by_topic = {}
84
  final_topics = []
@@ -117,13 +195,20 @@ def search_relevant_articles_arxiv(key_topics, max_articles=100):
117
 
118
  return articles_by_topic, list(set(final_topics))
119
 
120
- # Initialize process for text analysis
121
  def init(content, images=[]):
122
- key_topics = extract_key_topics(content, images)
123
- key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
124
- articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
125
- result_json = json.dumps(articles_by_topic, indent=4)
126
- return final_topics, result_json
 
 
 
 
 
 
 
127
 
128
  # Summarization function
129
  def process_article_for_summary(text, images=[], compression_percentage=30):
@@ -146,6 +231,76 @@ def process_article_for_summary(text, images=[], compression_percentage=30):
146
  )
147
  return response.choices[0].message.content
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # Question answering function
150
  def ask_question_to_mistral(text, question, images=[]):
151
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
@@ -169,19 +324,100 @@ def ask_question_to_mistral(text, question, images=[]):
169
  )
170
  return response.choices[0].message.content
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  # Gradio interface
173
  def gradio_interface(text_input, images_base64, task, question, compression_percentage):
174
  text, images = process_input(text_input, images_base64)
175
 
176
- topics, articles_json = init(text, images)
177
-
178
  if task == "Summarization":
179
- summary = process_article_for_summary(text, images, compression_percentage)
180
- return {"Topics": topics, "Summary": summary, "Articles": articles_json}
 
 
 
 
 
 
 
 
181
  elif task == "Question Answering":
 
182
  if question:
183
- answer = ask_question_to_mistral(text, question, images)
184
- return {"Topics": topics, "Answer": answer, "Articles": articles_json}
 
 
 
 
 
 
185
  else:
186
  return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}
187
 
@@ -204,4 +440,4 @@ with gr.Blocks() as demo:
204
  submit_button = gr.Button("Submit")
205
  submit_button.click(gradio_interface, [text_input, images_base64, task_choice, question_input, compression_input], result_output)
206
 
207
- demo.launch()
 
5
  import json
6
  import os
7
  import arxiv
 
8
  from PIL import Image
9
  import io
10
  import base64
11
+ from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain_mistralai import ChatMistralAI
14
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
15
+ from langchain.chains.llm import LLMChain
16
+ from langchain_core.prompts import PromptTemplate
17
 
18
+ from transformers import AutoTokenizer
19
+ tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")
20
 
21
def count_tokens_in_text(text):
    """Return how many tokens the module-level ``tokenizer`` yields for *text*.

    Special tokens are included and truncation is disabled, so the result
    can be compared directly against a context-window limit.

    Args:
        text: The string to tokenize.

    Returns:
        The number of token ids produced for ``text``.
    """
    # Plain Python lists are sufficient for counting. Requesting "pt" tensors
    # would require PyTorch and allocate a tensor only to take its length.
    encoded = tokenizer(text, truncation=False, add_special_tokens=True)
    return len(encoded["input_ids"])
24
 
25
  # Set environment variables for Tavily API
26
  os.environ["TAVILY_API_KEY"] = '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'
 
29
  client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
30
  client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
31
  client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
32
+ api_key_4 = 'eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ'
33
+ client_4 = ChatMistralAI(api_key=api_key_4, model="pixtral-12b-2409")
34
 
35
  # Function to encode images in base64
36
  def encode_image_bytes(image_bytes):
 
90
  )
91
  return response.choices[0].message.content
92
 
93
def extract_key_topics_with_large_text(content, images=[]):
    """Extract key themes from a text that is too long for one model call.

    The text is cut into token-bounded chunks, themes are requested for each
    chunk (map step), and the per-chunk answers are condensed into a single
    list (reduce step). Both steps use ``client_4``.

    Args:
        content: Full text to analyse.
        images: Sequence of mappings with an ``"image_url"`` entry; each value
            is rendered into the map prompt as plain text.

    Returns:
        Whatever ``MapReduceDocumentsChain.run`` returns for the chunks
        (expected to be the model's theme list as text).
    """
    # NOTE(review): the prompt bodies are written flush-left; confirm this
    # matches the leading whitespace of the templates in the deployed file.
    per_chunk_prompt = PromptTemplate.from_template("""
Текст: {docs}
Изображения: {images}

Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
LIST IN ENGLISH:
-

:""")
    merge_prompt = PromptTemplate.from_template("""Следующий текст состоит из нескольких кратких итогов:
{docs}

Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
LIST IN ENGLISH:
-

:""")

    # Reduce side: the same "stuff" chain both combines and collapses.
    stuff_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=client_4, prompt=merge_prompt),
        document_variable_name="docs",
    )
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_chain,
        collapse_documents_chain=stuff_chain,
        token_max=128000,
    )

    # Map side feeds every chunk through the per-chunk prompt.
    pipeline = MapReduceDocumentsChain(
        llm_chain=LLMChain(llm=client_4, prompt=per_chunk_prompt),
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    # Chunk sizes are measured with the module-level tokenizer.
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    chunks = splitter.create_documents([content])

    # One numbered line per image, starting at 1.
    image_lines = []
    for number, img in enumerate(images, start=1):
        image_lines.append(f"Изображение {number}: {img['image_url']}")
    image_descriptions = "\n".join(image_lines)

    return pipeline.run({"input_documents": chunks, "images": image_descriptions})
159
+
160
  def search_relevant_articles_arxiv(key_topics, max_articles=100):
161
  articles_by_topic = {}
162
  final_topics = []
 
195
 
196
  return articles_by_topic, list(set(final_topics))
197
 
198
+
199
def init(content, images=None):
    """Extract key topics from *content* and look up related arXiv articles.

    Texts below 128,000 tokens go through ``extract_key_topics``; longer
    ones go through ``extract_key_topics_with_large_text``. The resulting
    topic list is passed to ``search_relevant_articles_arxiv``.

    Args:
        content: Text to analyse.
        images: Optional images forwarded to the topic extractor. ``None``
            is treated as an empty list.

    Returns:
        A ``(final_topics, result_json)`` tuple, where ``result_json`` is the
        articles-by-topic mapping serialized as indented JSON.
    """
    images = [] if images is None else images

    # Only the extractor differs between the two size regimes.
    if count_tokens_in_text(text=content) < 128_000:
        raw_topics = extract_key_topics(content, images)
    else:
        raw_topics = extract_key_topics_with_large_text(content, images)

    # Strip bullet markers first, then drop blanks. Filtering before
    # stripping lets lines such as "-" through as empty search queries.
    stripped = (line.strip("- ") for line in raw_topics.split("\n"))
    key_topics = [topic for topic in stripped if topic]

    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
    result_json = json.dumps(articles_by_topic, indent=4)
    return final_topics, result_json
212
 
213
  # Summarization function
214
  def process_article_for_summary(text, images=[], compression_percentage=30):
 
231
  )
232
  return response.choices[0].message.content
233
 
234
def process_large_article_for_summary(text, images=[], compression_percentage=30):
    """Summarize a text too long for one model call, and fetch its topics.

    The text is split into token-bounded chunks, each chunk is summarized
    (map step), and the partial summaries are merged (reduce step), all with
    ``client_4``. While the chain runs, ``init`` is executed on a worker
    thread to produce the topic list and related-articles JSON.

    Args:
        text: Full text to summarize.
        images: Sequence of mappings with an ``"image_url"`` entry; each value
            is rendered into the map prompt as plain text.
        compression_percentage: Number interpolated into both prompts as the
            requested compression level.

    Returns:
        A ``(summary, key_topics, result_article_json)`` tuple.
    """
    # NOTE(review): the prompt bodies are written flush-left; confirm this
    # matches the leading whitespace of the templates in the deployed file.
    chunk_prompt_text = f"""Следующий текст состоит из текста и изображений:
Текст: {{docs}}
Изображения: {{images}}

На основе приведенного материала, выполните сжатие текста, выделяя основные темы и важные моменты.
Уровень сжатия: {compression_percentage}%.
Ответ предоставьте на русском языке в формате Markdown.

Полезный ответ:"""
    merge_prompt_text = f"""Следующий текст состоит из нескольких кратких итогов:
{{docs}}

На основе этих кратких итогов, выполните финальное сжатие текста, объединяя основные темы и ключевые моменты.
Уровень сжатия: {compression_percentage}%.
Результат предоставьте на русском языке в формате Markdown.

Полезный ответ:"""

    chunk_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(chunk_prompt_text)
    )
    merge_chain = LLMChain(
        llm=client_4, prompt=PromptTemplate.from_template(merge_prompt_text)
    )

    # Reduce side: the same "stuff" chain both combines and collapses.
    stuff_chain = StuffDocumentsChain(
        llm_chain=merge_chain, document_variable_name="docs"
    )
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_chain,
        collapse_documents_chain=stuff_chain,
        token_max=128000,
    )
    pipeline = MapReduceDocumentsChain(
        llm_chain=chunk_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    # Chunk sizes are measured with the module-level tokenizer.
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    chunks = splitter.create_documents([text])

    # One numbered line per image, starting at 1.
    image_descriptions = "\n".join(
        f"Изображение {number}: {img['image_url']}"
        for number, img in enumerate(images, start=1)
    )

    # Topic extraction runs on a worker thread while this thread summarizes.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        topics_job = pool.submit(init, text, images)
        summary = pipeline.run(
            {"input_documents": chunks, "images": image_descriptions}
        )
        key_topics, result_article_json = topics_job.result()

    return summary, key_topics, result_article_json
303
+
304
  # Question answering function
305
  def ask_question_to_mistral(text, question, images=[]):
306
  prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
 
324
  )
325
  return response.choices[0].message.content
326
 
327
def ask_question_to_mistral_with_large_text(text, question, images=[]):
    """Answer *question* about a text too long for one model call.

    The text is split into token-bounded chunks, the question is answered
    against each chunk (map step), and the partial answers are merged into
    one (reduce step), all with ``client_4``. While the chain runs, ``init``
    is executed on a worker thread to produce the topic list and
    related-articles JSON.

    Args:
        text: Full text the question refers to.
        question: The user's question, substituted into the map prompt.
        images: Sequence of mappings with an ``"image_url"`` entry; each value
            is rendered into the map prompt as plain text.

    Returns:
        An ``(answer, key_topics, result_article_json)`` tuple.
    """
    # These are plain strings, not f-strings, so placeholders need single
    # braces. Doubled braces would be treated by PromptTemplate as escaped
    # literals, leaving "docs" undeclared and making the document chains
    # reject document_variable_name="docs" at construction time.
    # NOTE(review): the prompt bodies are written flush-left; confirm this
    # matches the leading whitespace of the templates in the deployed file.
    map_template = """Следующий текст содержит статью/произведение:
Текст: {docs}
Изображения: {images}
На основе приведенного текста, ответьте на следующий вопрос:

Вопрос: {question}

Ответ должен быть точным. Пожалуйста, ответьте на русском языке в формате Markdown.

Полезный ответ:"""

    reduce_template = """Следующий текст содержит несколько кратких ответов на вопрос:
{docs}

Объедините их в финальный ответ. Ответ предоставьте на русском языке в формате Markdown.

Полезный ответ:"""

    map_prompt = PromptTemplate.from_template(map_template)
    map_chain = LLMChain(llm=client_4, prompt=map_prompt)

    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)

    # Reduce side: the same "stuff" chain both combines and collapses.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="docs"
    )
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=128000,
    )

    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    # Chunk sizes are measured with the module-level tokenizer.
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100000,
        chunk_overlap=14000,
    )
    split_docs = text_splitter.create_documents([text])

    # One numbered line per image, starting at 1.
    image_descriptions = "\n".join(
        f"Изображение {i + 1}: {img['image_url']}" for i, img in enumerate(images)
    )

    # Topic extraction runs on a worker thread while this thread answers.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        extract_future = executor.submit(init, text, images)
        answer = map_reduce_chain.run(
            {
                "input_documents": split_docs,
                "question": question,
                "images": image_descriptions,
            }
        )
        key_topics, result_article_json = extract_future.result()

    return answer, key_topics, result_article_json
393
+
394
+
395
  # Gradio interface
396
def gradio_interface(text_input, images_base64, task, question, compression_percentage):
    """Handle one Gradio submission: summarize the input or answer a question.

    Inputs below 128,000 tokens use the single-call helpers; longer inputs
    use the map-reduce helpers, which also return topics and articles.

    Args:
        text_input: Raw text input, passed to ``process_input``.
        images_base64: Raw image input, passed to ``process_input``.
        task: ``"Summarization"`` or ``"Question Answering"``.
        question: Question text; only used for ``"Question Answering"``.
        compression_percentage: Requested compression level for summaries.

    Returns:
        A dict with ``"Topics"``, ``"Articles"`` and either ``"Summary"`` or
        ``"Answer"``; ``None`` when ``task`` is not one of the two known values.
    """
    text, images = process_input(text_input, images_base64)

    if task == "Summarization":
        if count_tokens_in_text(text=text) < 128_000:
            topics, articles_json = init(text, images)
            summary = process_article_for_summary(text, images, compression_percentage)
        else:
            summary, topics, articles_json = process_large_article_for_summary(
                text, images, compression_percentage
            )
        return {"Topics": topics, "Summary": summary, "Articles": articles_json}

    if task == "Question Answering":
        if not question:
            # Topics and articles are still reported when no question is given,
            # so they must be computed here before being returned.
            topics, articles_json = init(text, images)
            return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}

        if count_tokens_in_text(text=text) < 128_000:
            topics, articles_json = init(text, images)
            answer = ask_question_to_mistral(text, question, images)
        else:
            # The large-text helper returns the answer first; bind it to the
            # name that is actually returned below.
            answer, topics, articles_json = ask_question_to_mistral_with_large_text(
                text, question, images
            )
        return {"Topics": topics, "Answer": answer, "Articles": articles_json}

    # Unknown task: nothing to return.
    return None
423
 
 
440
  submit_button = gr.Button("Submit")
441
  submit_button.click(gradio_interface, [text_input, images_base64, task_choice, question_input, compression_input], result_output)
442
 
443
+ demo.launch(show_error=True)