import os
import textwrap

import gradio as gr
import requests
from langchain import HuggingFaceHub, HuggingFacePipeline, OpenAI
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
# SECURITY: API keys must never be committed to source control. The OpenAI key
# is expected in the OPENAI_API_KEY environment variable (shell export,
# deployment secret, ...). Fail fast with a clear message when it is missing.
# NOTE(review): the key that used to be hard-coded on this line is exposed and
# should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "starting the app."
    )
class ChatGPT:
    """Question answering over the text files of a folder.

    On construction every regular file in ``docs_dir`` is handed to a
    ``TextLoader``, split into chunks of at most 1000 characters, embedded
    with the ``paraphrase-multilingual-mpnet-base-v2`` sentence-transformers
    model and indexed through ``VectorstoreIndexCreator``. Questions are then
    answered by a ``RetrievalQA`` "stuff" chain backed by an OpenAI LLM.
    """

    def __init__(self, docs_dir='./docs'):
        """Build the document index and the retrieval QA chain.

        Args:
            docs_dir: Directory whose files are indexed. Defaults to
                ``'./docs'``.

        Raises:
            FileNotFoundError: If ``docs_dir`` does not exist (raised by
                ``os.listdir``).
        """
        # Sort for a deterministic build order and skip sub-directories
        # (e.g. editor/notebook checkpoint folders), which cannot be read
        # as text files.
        paths = [os.path.join(docs_dir, fn) for fn in sorted(os.listdir(docs_dir))]
        loaders = [
            TextLoader(path, encoding='utf8')
            for path in paths
            if os.path.isfile(path)
        ]

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )
        # NOTE(review): the splitter relies on CharacterTextSplitter's default
        # separator -- confirm that is the intended chunk boundary.
        index = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
        ).from_loaders(loaders)

        # temperature=0.0 keeps answers as deterministic as the model allows;
        # the retriever is configured with k=3 chunks per question.
        self.chain = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0.0),
            chain_type="stuff",
            retriever=index.vectorstore.as_retriever(search_kwargs={"k": 3}),
            input_key="question",
        )

    def query(self, question):
        """Answer ``question`` from the indexed documents.

        Args:
            question: The user's question as plain text.

        Returns:
            The result of running the retrieval QA chain on ``question``.
        """
        return self.chain.run(question)
def wrap_text_preserve_newlines(text: str, width: int = 200) -> str:
    """Wrap ``text`` to ``width`` columns while keeping its line breaks.

    Both real newline characters and the escaped two-character sequence
    backslash + ``n`` (as found in the ``str()``/``repr()`` of an object
    holding multi-line text) are treated as line breaks. Each resulting line
    is wrapped on its own, so existing breaks survive and blank lines stay
    blank.

    Args:
        text: The text to wrap.
        width: Maximum length of a wrapped line.

    Returns:
        The wrapped text, with lines joined by real newline characters.
    """
    # Normalise escaped "\n" sequences first so a single split handles both
    # forms; otherwise textwrap.fill would collapse real newlines to spaces.
    lines = text.replace('\\n', '\n').split('\n')
    return '\n'.join(textwrap.fill(line, width=width) for line in lines)
# SECURITY: a Hugging Face Hub token must be supplied through the
# HUGGINGFACEHUB_API_TOKEN environment variable, never hard-coded in source.
# NOTE(review): the token that used to be committed on this line is exposed
# and should be revoked. No active code in this file reads it.

# Sample corpus location; only referenced by the commented-out download code.
url = "https://raw.githubusercontent.com/hwchase17/langchain/master/docs/modules/state_of_the_union.txt"
# url = "https://raw.githubusercontent.com/NTT123/Vietnamese-Text-To-Speech-Dataset/master/collections.txt" | |
# url = "https://raw.githubusercontent.com/NTT123/Vietnamese-Text-To-Speech-Dataset/master/collections.txt" | |
# res = requests.get(url) | |
# with open("state_of_the_union.txt", "w") as f: | |
# f.write(res.text) | |
# Document Loader | |
# loader = TextLoader('./collections.txt', encoding='utf-8') | |
# # loader = TextLoader('./state_of_the_union.txt', encoding='utf-8') | |
# documents = loader.load() | |
# # print(wrap_text_preserve_newlines(str(documents[0]))) | |
# text_splitter = CharacterTextSplitter( | |
# separator = "\n", | |
# chunk_size = 1000, | |
# chunk_overlap = 200, | |
# length_function = len, | |
# ) | |
# docs = text_splitter.split_documents(documents) | |
# # print(len(docs)) | |
# # embeddings = HuggingFaceEmbeddings(model_name='keepitreal/vietnamese-sbert') | |
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2") | |
# # embeddings = HuggingFaceEmbeddings() | |
# | |
# | |
# db = FAISS.from_documents(docs, embeddings) | |
# | |
# # query = "giờ làm việc của công ty" | |
# # docs = db.similarity_search(query) | |
# | |
# | |
# # print(wrap_text_preserve_newlines(str(docs[0].page_content))) | |
# | |
# | |
# # print(wrap_text_preserve_newlines(str(docs[1].page_content))) | |
# # model_id = 'google/flan-t5-base'# go for a smaller model if you dont have the VRAM | |
# model_id = 'VietAI/gpt-neo-1.3B-vietnamese-news'# go for a smaller model if you dont have the VRAM | |
# # tokenizer = AutoTokenizer.from_pretrained(model_id) | |
# # # model = AutoModelForSeq2SeqLM.from_pretrained(model_id,load_in_8bit=False,low_cpu_mem_usage=True) | |
# # model = AutoModelForCausalLM.from_pretrained(model_id,load_in_8bit=False,low_cpu_mem_usage=True) | |
# # | |
# # pipe = pipeline( | |
# # "text-generation", | |
# # model=model, | |
# # tokenizer=tokenizer, | |
# # pad_token_id=20000, | |
# # temperature=0.9, | |
# # max_length=500 | |
# # ) | |
# # llm=HuggingFaceHub(repo_id="VietAI/gpt-neo-1.3B-vietnamese-news", model_kwargs={"temperature":0.9,"top_k":20,"do_sample":True,"max_length":500}) | |
# # local_llm = HuggingFacePipeline(pipeline=pipe) | |
# # llm=HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature":0, "max_length":512}) | |
# # llm=HuggingFaceHub(repo_id="VietAI/gpt-neo-1.3B-vietnamese-news", model_kwargs={"temperature":0.1, "max_length":500}) | |
# chain = load_qa_chain(llm=OpenAI(temperature=0.5), chain_type="stuff") | |
# query = "được quyền lợi gì khi đẻ con" | |
# # query = "What did the president say about the Economy" | |
# docs = db.similarity_search(query) | |
# | |
# print(chain.run(input_documents=docs,question=query)) | |
# Single shared backend instance; constructing it builds the document index
# and QA chain once, at start-up.
chatgpt = ChatGPT()


def chatbot(input_text):
    """Return the backend's answer for the text entered in the UI."""
    return chatgpt.query(input_text)
# Gradio UI: a multi-line textbox feeds `chatbot`; the answer is shown as text.
iface = gr.Interface(
    fn=chatbot,
    inputs=gr.components.Textbox(lines=7, label="Enter your text"),
    outputs="text",
    title="Custom-trained AI Chatbot",
)

# Start the web server only when executed as a script, so that importing this
# module (tests, tooling) does not launch a publicly shared app as a side
# effect.
if __name__ == "__main__":
    iface.launch(share=True)