"""Gradio app that answers questions about an uploaded PDF.

Flow: the uploaded PDF is re-written, its text is extracted and kept in the
module-level ``file`` variable. For each question the app finds the sentence
most similar to the question, then the text chunk most similar to that
sentence, and passes that chunk to the chat model as the only context.
"""
import os
from functools import lru_cache

import gradio as gr
import numpy as np
import spacy
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.embeddings import CohereEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cohere import ChatCohere
from PyPDF2 import PdfReader, PdfWriter
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv('COHERE_API_KEY')

# Download the spaCy model only when it is not installed yet, instead of
# re-running the download on every start of the app.
_SPACY_MODEL = 'en_core_web_md'
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    spacy.cli.download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
embedding = CohereEmbeddings(
    model='embed-multilingual-v3.0',
    cohere_api_key=COHERE_API_KEY,
)

# Text of the most recently uploaded PDF. ``None`` until an upload succeeds,
# so ``chatbot`` can tell "no document yet" apart from a real document.
file = None

# Not used by the handlers below yet: the upload handler currently returns a
# fixed status message rather than a generated summary.
summary_template = """
You an article summarizer and have been provided with this file
{document}
provide a one line summary of the content of the provides file.
"""
summary_prompt = PromptTemplate(
    input_variables=['document'],
    template=summary_template,
)

template = """
You are a knowledgeable chatbot that gently answers questions.

You know the following context information.

{context}

Answer to the following question from a user. Use only information from the previous context. Do not invent or assume stuff.

Question: {query}

Answer:"""
prompt = PromptTemplate(input_variables=['context', 'query'], template=template)

llm = ChatCohere(cohere_api_key=COHERE_API_KEY)


def recieve_pdf(filename):
    """Store the text of an uploaded PDF for later questions.

    The PDF is copied page by page into ``processed_file.pdf`` and the text is
    extracted from that copy. The result is stored in the module-level
    ``file`` variable, which ``chatbot`` reads.

    Args:
        filename: The uploaded file as handed over by the ``gr.File``
            component; it is passed to ``PdfReader`` unchanged.

    Returns:
        A status message for the output textbox.
    """
    global file

    reader = PdfReader(filename)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    # NOTE(review): fixed output path and a module-level ``file`` mean that
    # simultaneous uploads overwrite each other -- confirm this app is meant
    # for a single user at a time.
    with open('processed_file.pdf', 'wb') as f:
        writer.write(f)

    read = PdfReader('processed_file.pdf')
    # NOTE(review): the positional ``0`` is kept from the original call; it
    # looks like PyPDF2 reads it as the text-orientation filter -- confirm.
    extracted_text = ''.join(page.extract_text(0) for page in read.pages)

    if not extracted_text.strip():
        # e.g. a scanned PDF: nothing to embed, so do not keep a document
        # around that would make every later question fail.
        file = None
        return 'No extractable text was found in the uploaded document'

    file = extracted_text
    return 'Document succesfully uploaded'


@lru_cache(maxsize=1)
def _index_document(text):
    """Split *text* into sentences and chunks and embed both, once per text.

    Cached on the text so that repeated questions about the same document do
    not re-run spaCy and the embedding calls.

    Returns:
        ``(sentences, sentence_vectors, chunks, chunk_vectors)``.
    """
    # Whitespace-only sentences carry nothing to match against.
    sentences = tuple(
        str(sentence) for sentence in nlp(text).sents if str(sentence).strip()
    )
    chunks = tuple(text_splitter.split_text(text))
    sentence_vectors = embedding.embed_documents(list(sentences))
    chunk_vectors = embedding.embed_documents(list(chunks))
    return sentences, sentence_vectors, chunks, chunk_vectors


def chatbot(query, history):
    """Answer *query* from the uploaded document and append it to the chat.

    Args:
        query: The user's question from the textbox.
        history: The chat history as a list of ``(question, answer)`` tuples;
            ``None`` is treated as an empty history.

    Returns:
        ``('', history)``: an empty string to clear the textbox and the
        updated history for the chat component.
    """
    history = history if history is not None else []

    if not query or not query.strip():
        return '', history

    if not file:
        history.append(
            (query, 'Please upload a PDF document before asking questions.')
        )
        return '', history

    sentences, sentence_vectors, chunks, chunk_vectors = _index_document(file)
    if not sentences or not chunks:
        history.append(
            (query, 'The uploaded document does not contain any usable text.')
        )
        return '', history

    # Step 1: sentence of the document closest to the question.
    query_vector = embedding.embed_documents([query])
    sentence_scores = cosine_similarity(query_vector, sentence_vectors)
    best_sentence_index = int(np.argmax(sentence_scores))

    # Step 2: chunk closest to that sentence. The sentence's vector is already
    # in ``sentence_vectors``, so it is reused instead of embedded again.
    chunk_scores = cosine_similarity(
        [sentence_vectors[best_sentence_index]], chunk_vectors
    )
    final_document = chunks[int(np.argmax(chunk_scores))]

    prompt_formated = prompt.format(context=final_document, query=query)
    response = llm.invoke(prompt_formated).content
    history.append((query, response))
    return '', history


with gr.Blocks(theme='finlaymacklon/smooth_slate') as demo:
    signal = gr.Markdown(
        '# Welcome to Chat with Docs\n'
        'I am an AI that recieves a **PDF** and can answer questions on the '
        'content of the document.'
    )
    inp = gr.File()
    out = gr.Textbox(label='Summary')
    inp.upload(fn=recieve_pdf, inputs=inp, outputs=out, show_progress=True)
    signal_1 = gr.Markdown(
        'Use the Textbox below to chat. **Ask** questions regarding the pdf '
        'you uploaded'
    )
    chat = gr.Chatbot()
    msg = gr.Textbox(info='input your chat')
    with gr.Row():
        submit = gr.Button('Send')
        clear = gr.ClearButton([msg, chat])
    # Pressing Enter and clicking "Send" run the same handler.
    msg.submit(chatbot, [msg, chat], [msg, chat])
    submit.click(chatbot, [msg, chat], [msg, chat])
    feedback = gr.Markdown(
        '# [Please use this to provide feedback](https://forms.gle/oNZKx4nL7DmmJ64g8)'
    )

demo.launch()