"""Gradio chatbot that answers questions over the text files in a local folder.

The documents are embedded with a sentence-transformers model, indexed through
LangChain's ``VectorstoreIndexCreator`` and queried with a ``RetrievalQA``
chain backed by the OpenAI completion model.

Credentials are read from the environment and must never be committed:

* ``OPENAI_API_KEY`` - required by the OpenAI LLM used for answering.
* ``HUGGINGFACEHUB_API_TOKEN`` - only needed if the LLM is switched to a model
  hosted on the Hugging Face Hub; nothing in the active code path reads it.
"""

import os
import textwrap

import gradio as gr
import requests
from langchain import HuggingFaceHub, HuggingFacePipeline, OpenAI
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM


def _require_env(name: str) -> str:
    """Return the value of environment variable *name* or fail with a clear message."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. Export it before starting "
            "the app; API keys must not be hard-coded in the source file."
        )
    return value


class ChatGPT:
    """Question answering over a folder of text documents.

    Building an instance loads every regular file in the documents folder,
    splits it into chunks, embeds the chunks and stores them in the vector
    store created by ``VectorstoreIndexCreator``. ``query`` then retrieves the
    most similar chunks and lets the OpenAI LLM answer from them.
    """

    def __init__(self, docs_dir: str = './docs', top_k: int = 3) -> None:
        """Build the document index and the retrieval chain.

        Args:
            docs_dir: Folder containing the UTF-8 text files to index.
            top_k: Number of chunks handed to the LLM for each question.

        Raises:
            RuntimeError: If ``OPENAI_API_KEY`` is not set in the environment.
        """
        # Fail fast, before the (slow) embedding step, if the key is missing.
        _require_env("OPENAI_API_KEY")

        # Only regular files are loaded: a sub-directory inside docs_dir would
        # make TextLoader fail. Sorting keeps the indexing order deterministic.
        paths = sorted(os.path.join(docs_dir, fn) for fn in os.listdir(docs_dir))
        loaders = [TextLoader(path, encoding='utf8')
                   for path in paths if os.path.isfile(path)]

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

        # The splitter keeps its default separator; only the chunk size and the
        # overlap are configured (unchanged from the previous behavior).
        index = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
        ).from_loaders(loaders)

        # "stuff" puts all retrieved chunks into a single prompt.
        self.chain = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0.0),
            chain_type="stuff",
            retriever=index.vectorstore.as_retriever(search_kwargs={"k": top_k}),
            input_key="question",
        )

    def query(self, question: str) -> str:
        """Run *question* through the retrieval chain and return its answer."""
        return self.chain.run(question)


def wrap_text_preserve_newlines(text, width=200):
    """Wrap *text* to *width* columns while keeping its line breaks.

    Both real newline characters and the escaped two-character sequence
    ``\\n`` (as found in the ``str()`` of an object whose repr escapes
    newlines) are treated as line breaks. Each resulting line is wrapped on
    its own and the lines are joined back with real newlines.

    Args:
        text: The text to wrap.
        width: Maximum length of every output line.

    Returns:
        The wrapped text.
    """
    # Normalise escaped "\n" sequences to real newlines first; splitting only
    # on the escaped form would let textwrap.fill collapse real newlines.
    lines = text.replace('\\n', '\n').split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    return '\n'.join(wrapped_lines)


# Location of a sample corpus. Nothing in the active code downloads it; fetch
# it manually into the documents folder if it is needed.
url = "https://raw.githubusercontent.com/hwchase17/langchain/master/docs/modules/state_of_the_union.txt"

# Built once at start-up: indexing the documents is expensive.
chatgpt = ChatGPT()


def chatbot(input_text):
    """Gradio callback: answer *input_text* using the shared ChatGPT instance."""
    response = chatgpt.query(input_text)
    return response


iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your text"),
                     outputs="text",
                     title="Custom-trained AI Chatbot")
# NOTE(review): share=True asks Gradio for a public share link - confirm that
# exposing this app (and the OpenAI usage behind it) publicly is intended.
iface.launch(share=True)