Fastchat generating truncated/Incomplete answers
#10
by
kvmukilan
- opened
I have trained LLM on my PDF file now I am asking questions related to same. But the output which is being generated is always truncated and stops in
between . Model giving incomplete sentences.
I have used follwing embeddings:
- sentence-transformers/all-mpnet-base-v2
- hkunlp/instructor-xl
to get embedding
def getEmbedding():
device = "cuda" if torch.cuda.is_available() else "cpu"
return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": device})
and tried with following LLMs:
- lmsys/fastchat-t5-3b-v1.0
- google/flan-t5-base
to get LLM
def getLLM():
return pipeline(
task="text2text-generation",
model = "lmsys/fastchat-t5-3b-v1.0",
min_new_tokens=100,
max_new_tokens=256,
model_kwargs={"device_map": "auto", "load_in_8bit": False, "max_length": 512, "temperature": 0.}
)
# to get the text
def get_pdf_text(pdf_path):
text = ""
documents = []
for pdf in pdf_path:
with NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
shutil.copyfileobj(pdf, tmp)
tmp_path = Path(tmp.name)
#print(tmp_path)
loader = PyPDFLoader(str(tmp_path))
documents.extend(loader.load())
return documents
# to split the document which we have gotten from the pdfs into tokens
def get_text_chunks(documents):
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10) # This the encoding for text-embedding-ada-002
texts = text_splitter.split_documents(texts)
return texts
# Creating Chroma vector DB and persisting it
def vector_db_pdf(pdf_path):
#if PDF is not present then load from persist directory else condition otherwise use pdf to generate persist vector DB
if len(pdf_path)>0:
documents=get_pdf_text(pdf_path)
texts =get_text_chunks(documents)
vector_db=Chroma.from_documents(documents=texts, embedding=getEmbedding(), persist_directory="storage")
vector_db.persist()
else:
#Use from persist
vector_db=Chroma(persist_directory="storage", embedding_function=getEmbedding())
return vector_db
def retreival_qa_chain():
llm=getLLM()
vectordb=vector_db_pdf([])
hf_llm = HuggingFacePipeline(pipeline=llm,model_id="lmsys/fastchat-t5-3b-v1.0")
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff",retriever=retriever)
retriever = vectordb.as_retriever(search_kwargs={"k":3})
In LLM pipeline I have tried parameters like early_stopping=False
setting min_new tokens
and increasing max_new_tokens
but nothing seems to work. Kindly explain how these parameters affect length of output.
Please access full code here
Some extra info :
Input: a legal containing 8-10 pages
transformers==4.29.2 , sentence-transformers==2.2.2 , lang chain= 0.0.189 , huggingface-hub==0.14.1 ,
Yes it is not providing full response.
[{'generated_text': ' My name is ___'}]
Prompt: What is your name?
Response: My name is ___