from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain
from langchain.prompts import PromptTemplate
import os
import gradio as gr
import shutil
import re
import tempfile
import cache
from pathlib import Path
from google.colab import userdata

# Hugging Face API token, read from the Colab secret named "api".
api = userdata.get('api')
api_token = api
# The original key was misspelled ("HUGGINFACEHUB"). The correctly spelled
# variable is exported as well; the legacy key is kept so nothing that may
# read it elsewhere breaks.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
os.environ["HUGGINFACEHUB_API_TOKEN"] = api_token
temp_dir = "/content/sample_data"


def data_ingestion(file_path, chunk_size=500, chunk_overlap=0):
    """Load the document at *file_path* and split it into text chunks.

    The file is always read with ``PDFMinerLoader`` and then split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Path of the document to load.
        chunk_size: Maximum chunk size handed to the splitter (default 500,
            the value that used to be hard-coded).
        chunk_overlap: Overlap between consecutive chunks (default 0).

    Returns:
        The list of chunk documents returned by ``split_documents``.

    Raises:
        ValueError: If *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    # Extension-based loader selection is currently disabled; every input is
    # handed to PDFMinerLoader. The previous dispatch is kept for reference:
    # file_ext = os.path.splitext(file_path)[-1]
    # if file_ext == ".pdf":
    #     # loader = PyPDFLoader(file_path)
    #     loader = PDFMinerLoader(file_path)
    #     document= loader.load()
    # elif file_ext in {".docx", ".doc"}:
    #     loader = Docx2txtLoader(file_path)
    #     document= loader.load()
    # elif file_ext == ".txt":
    #     loader = TextLoader(file_path)
    #     document= loader.load()
    loader = PDFMinerLoader(file_path)
    document = loader.load()

    # Replace CharacterTextSplitter with RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # No embeddings model or LLM client is built here: the summarize chains
    # create their own LLM, and nothing in this function used either object.
    return text_splitter.split_documents(document)


# text_splitter =
# CharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=2000, chunk_overlap=0
# )
# split_docs = text_splitter.split_documents(document)
# documents=split_text_into_batches(str(document),400)
# len(documents)
# documents[0]
# #
# # from langchain.text_splitter import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
# documents = text_splitter.split_documents(document)
# Embeddings
# from langchain.chains.question_answering import load_qa_chain


# Model used by every summarize chain in this file.
_LLM_REPO_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
_LLM_MODEL_KWARGS = {"temperature": 1, "max_length": 10000}


def _build_refine_chain(question_template):
    """Build a ``chain_type="refine"`` summarize chain for *question_template*.

    Args:
        question_template: Prompt text containing a ``{text}`` placeholder,
            used as the chain's ``question_prompt``.

    Returns:
        The chain object returned by ``load_summarize_chain``; it reads its
        documents from ``"input_documents"`` and writes the summary to
        ``"output_text"``.
    """
    return load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id=_LLM_REPO_ID,
            # Copy so no chain can mutate the shared settings dict.
            model_kwargs=dict(_LLM_MODEL_KWARGS),
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=PromptTemplate.from_template(question_template),
        # No custom refine_prompt is passed (it was disabled in the original
        # code), so the refine-prompt templates that used to be built and then
        # discarded in each chain function have been removed.
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )


########## CHAIN 1 norm text
def chain1():
    """Return the summarize chain that uses the generic summary prompt."""
    prompt_template = (
        "Write a concise summary of the following: {text} SUMMARY:"
    )
    return _build_refine_chain(prompt_template)


########## CHAIN 2 research paper
def chain2():
    """Return the summarize chain that uses the research-paper prompt.

    NOTE(review): ``chain_function`` selects this chain for the
    "Legal Document" option although the prompt talks about a research
    paper - confirm that this mapping is intended.
    """
    prompt_template = (
        "This is a Research Paper,your job is to summarise the text portion "
        "without any symbols or special characters, skip the mathematical "
        "equations for now: {text} SUMMARY:"
    )
    return _build_refine_chain(prompt_template)


########## CHAIN 3 arxiv_paper_1
def chain3():
    """Return the summarize chain that uses the arXiv-section prompt."""
    prompt_template = (
        "You are being given a markdown document with headers, this is part "
        "of a larger arxiv paper. Your job is to write a summary of the "
        "document. \n"
        "here is the content of the section: \"{text}\" SUMMARY:"
    )
    return _build_refine_chain(prompt_template)


def chain_function(checkbox_values):
    """Pick the summarize chain matching the selected document type.

    The options are checked in a fixed priority order: "Research Paper",
    then "Legal Document", then "Study Material".

    Args:
        checkbox_values: Container of the selected option labels.

    Returns:
        A freshly built chain for the first matching option, or the string
        "Please select a document type to run." when none matches. Callers
        must handle the string case before invoking the result as a chain.
    """
    factories = (
        ("Research Paper", chain3),
        ("Legal Document", chain2),
        ("Study Material", chain1),
    )
    for label, build_chain in factories:
        if label in checkbox_values:
            return build_chain()
    return "Please select a document type to run."


def result(chain, split_docs):
    """Summarize every chunk on its own and concatenate the summaries.

    Args:
        chain: Callable taking ``{"input_documents": [doc]}`` and returning a
            mapping with an ``"output_text"`` entry.
        split_docs: Iterable of chunk documents, processed in order.

    Returns:
        All ``"output_text"`` values joined without a separator; an empty
        string when *split_docs* is empty.
    """
    # The chain is called once per chunk with a single-document list.
    summaries = [
        chain({"input_documents": [doc]})["output_text"] for doc in split_docs
    ]
    return "".join(summaries)


title = """
S I M P L I F Y
"""
# description = r"""S I M P L I F Y
# """ # article = r""" # If PhotoMaker is helpful, please help to ⭐ the Github Repo. Thanks! # [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/PhotoMaker?style=social)](https://github.com/TencentARC/PhotoMaker) # --- # 📝 **Citation** #