Belemort committed
Commit e139162
1 Parent(s): a495646

Update app.py

Files changed (1)
  1. app.py +253 -18
app.py CHANGED
@@ -5,10 +5,22 @@ import concurrent.futures
 import json
 import os
 import arxiv
-from docx import Document
 from PIL import Image
 import io
 import base64
+from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_mistralai import ChatMistralAI
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain_core.prompts import PromptTemplate
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")
+
+def count_tokens_in_text(text):
+    tokens = tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=True)
+    return len(tokens["input_ids"][0])
+
 
 # Set environment variables for Tavily API
 os.environ["TAVILY_API_KEY"] = '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'
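
The new tokenizer and `count_tokens_in_text` helper are what the rest of this diff branches on: inputs that measure under 128,000 tokens keep the original single-call Mistral path, while longer inputs are routed through the map-reduce chains added further down. A minimal standalone sketch of that gate, assuming the same `mistral-community/pixtral-12b` tokenizer; the sample string and printed messages are illustrative only:

```python
# Standalone sketch of the token-count gate used throughout this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")

def count_tokens_in_text(text):
    # Count tokens without truncation, matching the helper added in app.py.
    tokens = tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=True)
    return len(tokens["input_ids"][0])

sample = "A short abstract about attention mechanisms in transformers."  # illustrative
if count_tokens_in_text(sample) < 128_000:
    print("small-document path: single model call")
else:
    print("large-document path: map-reduce over chunks")
```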
@@ -17,6 +29,8 @@ os.environ["TAVILY_API_KEY"] = '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'
 client_1 = Mistral(api_key='eLES5HrVqduOE1OSWG6C5XyEUeR7qpXQ')
 client_2 = Mistral(api_key='VPqG8sCy3JX5zFkpdiZ7bRSnTLKwngFJ')
 client_3 = Mistral(api_key='cvyu5Rdk2lS026epqL4VB6BMPUcUMSgt')
+api_key_4 = 'lCZWDjyQSEc5gJsATEcKjP9cCjWsB7lg'
+client_4 = ChatMistralAI(api_key=api_key_4, model="pixtral-12b-2409")
 
 # Function to encode images in base64
 def encode_image_bytes(image_bytes):
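
Unlike `client_1`-`client_3`, which are raw Mistral SDK clients, `client_4` is a LangChain chat-model wrapper, which is what allows it to be passed as `llm=` to the `LLMChain` objects introduced below. A minimal invocation sketch, assuming a valid API key; the placeholder key and prompt text are not from the commit:

```python
from langchain_mistralai import ChatMistralAI

# Stand-in key; the diff hard-codes its own value.
llm = ChatMistralAI(api_key="YOUR_MISTRAL_API_KEY", model="pixtral-12b-2409")

# Runnable interface: invoke() returns a chat message whose text is in .content
reply = llm.invoke("Summarize map-reduce summarization in one sentence.")
print(reply.content)
```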
@@ -65,9 +79,7 @@ def setup_search(question):
 def extract_key_topics(content, images=[]):
     prompt = f"""
     Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
-
     ```{content}```
-
     LIST IN ENGLISH:
     -
     """
@@ -78,6 +90,73 @@ def extract_key_topics(content, images=[]):
     )
     return response.choices[0].message.content
 
+def extract_key_topics_with_large_text(content, images=[]):
+    # Map prompt template for extracting key themes
+    map_template = f"""
+    Текст: {{docs}}
+    Изображения: {{images}}
+
+    Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
+    LIST IN ENGLISH:
+    -
+
+    :"""
+
+    map_prompt = PromptTemplate.from_template(map_template)
+    map_chain = LLMChain(llm=client_4, prompt=map_prompt)
+
+    # Reduce prompt template to further refine and extract key themes
+    reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
+    {{docs}}
+
+    Extract the primary themes from the text below. List each theme in as few words as possible, focusing on essential concepts only. Format as a concise, unordered list with no extraneous words.
+    LIST IN ENGLISH:
+    -
+
+    :"""
+
+    reduce_prompt = PromptTemplate.from_template(reduce_template)
+    reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
+
+    # Combine documents chain for Reduce step
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain, document_variable_name="docs"
+    )
+
+    # ReduceDocumentsChain configuration
+    reduce_documents_chain = ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=128000,
+    )
+
+    # MapReduceDocumentsChain combining Map and Reduce
+    map_reduce_chain = MapReduceDocumentsChain(
+        llm_chain=map_chain,
+        reduce_documents_chain=reduce_documents_chain,
+        document_variable_name="docs",
+        return_intermediate_steps=False,
+    )
+
+    # Text splitter configuration
+    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
+        tokenizer,
+        chunk_size=100000,
+        chunk_overlap=14000,
+    )
+
+    # Split the text into documents
+    split_docs = text_splitter.create_documents([content])
+
+    # Include image descriptions (optional, if required by the prompt)
+    image_descriptions = "\n".join(
+        [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
+    )
+
+    # Run the summarization chain to extract key themes
+    key_topics = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
+    return key_topics
+
 def search_relevant_articles_arxiv(key_topics, max_articles=100):
     articles_by_topic = {}
     final_topics = []
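
All three map-reduce chains added in this commit share the same splitter settings: chunks of up to 100,000 tokens with a 14,000-token overlap, measured with the Pixtral tokenizer, feeding a reduce step capped at `token_max=128000`. A rough sketch of the splitter on its own, assuming the same tokenizer; the synthetic text exists only to force more than one chunk:

```python
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")

text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=100000,   # maximum tokens handed to each map prompt
    chunk_overlap=14000, # tokens repeated between neighbouring chunks
)

# Synthetic long document: enough paragraphs to exceed a single 100k-token chunk.
long_text = "\n\n".join(f"Paragraph {i}: placeholder article text." for i in range(40000))
split_docs = text_splitter.create_documents([long_text])

print(len(split_docs), "chunks")
print(split_docs[0].page_content[:80])
```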
@@ -116,13 +195,20 @@ def search_relevant_articles_arxiv(key_topics, max_articles=100):
 
     return articles_by_topic, list(set(final_topics))
 
-# Initialize process for text analysis
+
 def init(content, images=[]):
-    key_topics = extract_key_topics(content, images)
-    key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
-    articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
-    result_json = json.dumps(articles_by_topic, indent=4)
-    return final_topics, result_json
+    if count_tokens_in_text(text=content) < 128_000:
+        key_topics = extract_key_topics(content, images)
+        key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
+        articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
+        result_json = json.dumps(articles_by_topic, indent=4)
+        return final_topics, result_json
+    else:
+        key_topics = extract_key_topics_with_large_text(content, images)
+        key_topics = [topic.strip("- ") for topic in key_topics.split("\n") if topic]
+        articles_by_topic, final_topics = search_relevant_articles_arxiv(key_topics)
+        result_json = json.dumps(articles_by_topic, indent=4)
+        return final_topics, result_json
 
 # Summarization function
 def process_article_for_summary(text, images=[], compression_percentage=30):
@@ -130,11 +216,9 @@ def process_article_for_summary(text, images=[], compression_percentage=30):
     You are a commentator.
     # article:
     {text}
-
     # Instructions:
     ## Summarize IN RUSSIAN:
     In clear and concise language, summarize the key points and themes presented in the article by cutting it by {compression_percentage} percent in the markdown format.
-
     """
 
     if len(images) >= 8 :
@@ -147,6 +231,76 @@ def process_article_for_summary(text, images=[], compression_percentage=30):
     )
     return response.choices[0].message.content
 
+def process_large_article_for_summary(text, images=[], compression_percentage=30):
+    # Map prompt template
+    map_template = f"""Следующий текст состоит из текста и изображений:
+    Текст: {{docs}}
+    Изображения: {{images}}
+
+    На основе приведенного материала, выполните сжатие текста, выделяя основные темы и важные моменты.
+    Уровень сжатия: {compression_percentage}%.
+    Ответ предоставьте на русском языке в формате Markdown.
+
+    Полезный ответ:"""
+
+    map_prompt = PromptTemplate.from_template(map_template)
+    map_chain = LLMChain(llm=client_4, prompt=map_prompt)
+
+    # Reduce prompt template
+    reduce_template = f"""Следующий текст состоит из нескольких кратких итогов:
+    {{docs}}
+
+    На основе этих кратких итогов, выполните финальное сжатие текста, объединяя основные темы и ключевые моменты.
+    Уровень сжатия: {compression_percentage}%.
+    Результат предоставьте на русском языке в формате Markdown.
+
+    Полезный ответ:"""
+
+    reduce_prompt = PromptTemplate.from_template(reduce_template)
+    reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
+
+    # Combine documents chain for Reduce step
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain, document_variable_name="docs"
+    )
+
+    # ReduceDocumentsChain configuration
+    reduce_documents_chain = ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=128000,
+    )
+
+    # MapReduceDocumentsChain combining Map and Reduce
+    map_reduce_chain = MapReduceDocumentsChain(
+        llm_chain=map_chain,
+        reduce_documents_chain=reduce_documents_chain,
+        document_variable_name="docs",
+        return_intermediate_steps=False,
+    )
+
+    # Text splitter configuration
+    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
+        tokenizer,
+        chunk_size=100000,
+        chunk_overlap=14000,
+    )
+
+    # Split the text into documents
+    split_docs = text_splitter.create_documents([text])
+    # Include image descriptions
+    image_descriptions = "\n".join(
+        [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
+    )
+
+    # Run the summarization chain while the arXiv lookup runs in a background thread
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        extract_future = executor.submit(init, text, images)
+        summary = map_reduce_chain.run({"input_documents": split_docs, "images": image_descriptions})
+        key_topics, result_article_json = extract_future.result()
+    return summary, key_topics, result_article_json
+
 # Question answering function
 def ask_question_to_mistral(text, question, images=[]):
     prompt = f"Answer the following question without mentioning it or repeating the original text on which the question is asked in style markdown.IN RUSSIAN:\nQuestion: {question}\n\nText:\n{text}"
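
Both large-text paths in this commit (`process_large_article_for_summary` above and `ask_question_to_mistral_with_large_text` in the next hunk) use the same concurrency trick: the topic/arXiv lookup in `init` is submitted to a thread pool so it runs while the map-reduce chain call is in flight. A stripped-down sketch of that pattern with stand-in functions; `fetch_topics` and `run_chain` are placeholders, not names from app.py:

```python
import concurrent.futures
import time

def fetch_topics(text):
    time.sleep(1)                 # stands in for init(): topic extraction + arXiv search
    return ["example topic"], "{}"

def run_chain(text):
    time.sleep(1)                 # stands in for map_reduce_chain.run(...)
    return "summary or answer"

with concurrent.futures.ThreadPoolExecutor() as executor:
    future = executor.submit(fetch_topics, "long article text")
    result = run_chain("long article text")   # runs while the lookup is still in flight
    topics, articles_json = future.result()

print(result, topics, articles_json)
```

Wall-clock time then comes out to roughly the slower of the two calls rather than their sum.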
@@ -170,19 +324,100 @@ def ask_question_to_mistral(text, question, images=[]):
     )
     return response.choices[0].message.content
 
+def ask_question_to_mistral_with_large_text(text, question, images=[]):
+    # Prompts for QA
+    map_template = """Следующий текст содержит статью/произведение:
+    Текст: {docs}
+    Изображения: {images}
+    На основе приведенного текста, ответьте на следующий вопрос:
+
+    Вопрос: {question}
+
+    Ответ должен быть точным. Пожалуйста, ответьте на русском языке в формате Markdown.
+
+    Полезный ответ:"""
+
+    reduce_template = """Следующий текст содержит несколько кратких ответов на вопрос:
+    {docs}
+
+    Объедините их в финальный ответ. Ответ предоставьте на русском языке в формате Markdown.
+
+    Полезный ответ:"""
+
+    map_prompt = PromptTemplate.from_template(map_template)
+    map_chain = LLMChain(llm=client_4, prompt=map_prompt)
+
+    reduce_prompt = PromptTemplate.from_template(reduce_template)
+    reduce_chain = LLMChain(llm=client_4, prompt=reduce_prompt)
+
+    # Combine documents chain for Reduce step
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain, document_variable_name="docs"
+    )
+
+    # ReduceDocumentsChain configuration
+    reduce_documents_chain = ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=128000,
+    )
+
+    # MapReduceDocumentsChain combining Map and Reduce
+    map_reduce_chain = MapReduceDocumentsChain(
+        llm_chain=map_chain,
+        reduce_documents_chain=reduce_documents_chain,
+        document_variable_name="docs",
+        return_intermediate_steps=False,
+    )
+
+    # Text splitter configuration
+    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
+        tokenizer,
+        chunk_size=100000,
+        chunk_overlap=14000,
+    )
+
+    # Split the text into documents
+    split_docs = text_splitter.create_documents([text])
+
+    # Include image descriptions
+    image_descriptions = "\n".join(
+        [f"Изображение {i+1}: {img['image_url']}" for i, img in enumerate(images)]
+    )
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        extract_future = executor.submit(init, text, images)
+        answer = map_reduce_chain.run({"input_documents": split_docs, "question": question, "images": image_descriptions})
+        key_topics, result_article_json = extract_future.result()
+    return answer, key_topics, result_article_json
+
+
 # Gradio interface
 def gradio_interface(text_input, images_base64, task, question, compression_percentage):
     text, images = process_input(text_input, images_base64)
 
-    topics, articles_json = init(text, images)
-
     if task == "Summarization":
-        summary = process_article_for_summary(text, images, compression_percentage)
-        return {"Topics": topics, "Summary": summary, "Articles": articles_json}
+
+        if count_tokens_in_text(text=text) < 128_000:
+            topics, articles_json = init(text, images)
+            summary = process_article_for_summary(text, images, compression_percentage)
+            return {"Topics": topics, "Summary": summary, "Articles": articles_json}
+
+        else:
+            summary, key_topics, result_article_json = process_large_article_for_summary(text, images, compression_percentage)
+            return {"Topics": key_topics, "Summary": summary, "Articles": result_article_json}
+
     elif task == "Question Answering":
+
         if question:
+
+            if count_tokens_in_text(text=text) < 128_000:
+                topics, articles_json = init(text, images)
+                answer = ask_question_to_mistral(text, question, images)
+                return {"Topics": topics, "Answer": answer, "Articles": articles_json}
+            else:
+                answer, key_topics, result_article_json = ask_question_to_mistral_with_large_text(text, question, images)
+                return {"Topics": key_topics, "Answer": answer, "Articles": result_article_json}
         else:
             return {"Topics": topics, "Answer": "No question provided.", "Articles": articles_json}
 
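One easy-to-miss detail in these templates: the summarization templates are f-strings, so their placeholders must be written as `{{docs}}` to survive formatting as a literal `{docs}` for `PromptTemplate`, whereas the QA templates in the hunk above are plain strings, so `docs`, `images` and `question` need single braces to be picked up as input variables. A small sketch of the difference; the strings and printed lists are illustrative:

```python
from langchain_core.prompts import PromptTemplate

compression_percentage = 30

# Plain string: single-brace names become PromptTemplate input variables.
plain = PromptTemplate.from_template("Question: {question}\nText: {docs}")
print(sorted(plain.input_variables))   # ['docs', 'question']

# f-string: doubled braces survive formatting as {docs}; single braces are filled in now.
rendered = f"Compression level: {compression_percentage}%\nText: {{docs}}"
from_f = PromptTemplate.from_template(rendered)
print(sorted(from_f.input_variables))  # ['docs']
```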
@@ -205,4 +440,4 @@ with gr.Blocks() as demo:
     submit_button = gr.Button("Submit")
     submit_button.click(gradio_interface, [text_input, images_base64, task_choice, question_input, compression_input], result_output)
 
-demo.launch()
+demo.launch()