import subprocess

# Reinstall llama-cpp-python with cuBLAS enabled so model layers can be offloaded to the GPU.
bash_command = '''\
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose
'''
subprocess.run(bash_command, shell=True)

from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import torch
from langchain.chains import LLMChain
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import gradio as gr
import time
from transformers import pipeline
from gtts import gTTS

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '55f52f29-11e3-4b87-a6ba-9a5494dfdb58')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'asia-southeast1-gcp-free')

# Embedding model used to vectorize queries for retrieval from Pinecone.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Initialize Pinecone and connect to the existing index that already holds the document embeddings.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment='gcp-starter'  # next to the API key in the console
)
index_name = "rpl-llama"  # put in the name of your Pinecone index here
docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Stream generated tokens to stdout as the model produces them.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Download the quantized Llama 2 7B Chat model (GGUF) from the Hugging Face Hub.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF"
model_basename = "llama-2-7b-chat.Q4_0.gguf"
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

prompt_template = """
Use the embeddings to summarize and generate answers to the user's questions. Don't repeat sentences.

Context: {docs}
Question: {query}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["docs", "query"])
llm_chain = LLMChain(prompt=PROMPT, llm=llm)

# Speech-to-text model for transcribing voice input.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Latest user query and generated answer, shared across the Gradio callbacks below.
query = ""
bot_message = ""

# Create the Gradio interface.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        avatar_images=("human.png", "bot.png"),
        value=[[None, "Welcome to the Indore-Ekk Number Superstore family! We're thrilled to have you on board.\nHow can I assist you today?"]],
    )
    with gr.Row(label="Voice Input and Output"):
        with gr.Column(variant="panel"):
            audio_file = gr.Audio(label='Voice based Input', source="microphone", type="filepath", optional=True)
        with gr.Column(variant="panel"):
            play_audio = gr.Audio(label='Output Audio', autoplay=True)
    # Hidden textbox that carries the bot's answer to the text-to-speech step.
    audio_out = gr.Textbox(visible=False)
    with gr.Row(label="Text Input and Controls"):
        with gr.Column(label='Text Based Input', variant="panel"):
            msg = gr.Textbox(placeholder="Ask me your doubts")
        with gr.Column(variant="panel"):
            with gr.Row():
                clear = gr.Button("Clear the Chatbot Conversation")

    def text_to_speech(text):
        # Convert the bot's answer to speech with gTTS and play it back.
        tts = gTTS(text=text, lang='en')
        tts.save('eng.mp3')
        return gr.Audio.update(value='eng.mp3')

    def user(user_message, history):
        # Generate the answer once and cache it so bot() and the audio output can reuse it.
        global query, bot_message
        query = user_message
        bot_message = model_response(query)
        print(user_message, bot_message)
        return '', history + [[user_message, None]], gr.Textbox.update(value=bot_message)

    def model_response(query):
        # Retrieve the most relevant chunks from Pinecone and pass them to the LLM.
        docs = docsearch.similarity_search(query)
        context = "".join(doc.page_content for doc in docs[:3])
        return llm_chain.run({'docs': context, 'query': query})

    def bot(history):
        # Stream the cached answer character by character for a typing effect.
        history[-1][1] = ""
        for character in bot_message:
            history[-1][1] += character
            time.sleep(0.05)
            yield history

    def speech_to_text(audio_file, history):
        global query, bot_message
        if audio_file is None:
            # Nothing was recorded; clear the audio input and leave the chat unchanged.
            return None, history, gr.Textbox.update()
        # Transcribe the recording with Whisper, then answer it like a typed question.
        text = asr(audio_file)["text"]
        query = text
        bot_message = model_response(query)
        print(text)
        return None, history + [[text, None]], gr.Textbox.update(value=bot_message)

    # Voice input: transcribe, answer, then stream the reply into the chat window.
    audio_file.stop_recording(
        speech_to_text, [audio_file, chatbot], [audio_file, chatbot, audio_out],
        queue=False, show_progress=False,
    ).then(bot, chatbot, chatbot)
    # Text input: answer the typed question, then stream the reply into the chat window.
    msg.submit(user, [msg, chatbot], [msg, chatbot, audio_out], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
    # Whenever a new answer lands in the hidden textbox, speak it aloud.
    audio_out.change(text_to_speech, inputs=[audio_out], outputs=play_audio)

demo.queue()
demo.launch(debug=True)