Fastchat generating truncated/incomplete answers #10


I have trained an LLM on my PDF file and am now asking questions about its contents. But the generated output is always truncated and stops mid-sentence; the model gives incomplete answers.

I have used the following embeddings:

  1. sentence-transformers/all-mpnet-base-v2
  2. hkunlp/instructor-xl

To get the embedding:

import torch
from langchain.embeddings import HuggingFaceEmbeddings

def getEmbedding():
    # Use the GPU if one is available, otherwise fall back to CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": device},
    )

and tried the following LLMs:

  1. lmsys/fastchat-t5-3b-v1.0
  2. google/flan-t5-base

To get the LLM:

from transformers import pipeline

def getLLM():
    return pipeline(
        "text2text-generation",  # fastchat-t5 is a T5-style encoder-decoder model
        model="lmsys/fastchat-t5-3b-v1.0",
        model_kwargs={"device_map": "auto", "load_in_8bit": False},
        # generation kwargs go to the pipeline itself, not model_kwargs
        max_length=512,
        temperature=0.0,
    )

import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
from langchain.document_loaders import PyPDFLoader

# To extract text from the uploaded PDFs
def get_pdf_text(pdf_path):
    documents = []
    for pdf in pdf_path:
        # Copy each uploaded file to a temporary .pdf so PyPDFLoader can read it from disk
        with NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            shutil.copyfileobj(pdf, tmp)
            tmp_path = Path(tmp.name)
        loader = PyPDFLoader(str(tmp_path))
        documents.extend(loader.load())
    return documents
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter

# To split the documents loaded from the PDFs into chunks
def get_text_chunks(documents):
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    # This uses the encoding for text-embedding-ada-002
    text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10)
    texts = text_splitter.split_documents(texts)
    return texts
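For intuition, here is a minimal pure-Python sketch of what chunking with overlap does. The helper name is hypothetical and the real splitters operate on tokenizer output rather than raw words, but the windowing idea is the same:

```python
# Illustrative sketch of fixed-size chunking with overlap (hypothetical helper,
# not LangChain's implementation). Each chunk repeats `overlap` items from the
# previous chunk so context is not lost at chunk boundaries.
# Assumes overlap < chunk_size.
def chunk_with_overlap(items, chunk_size, overlap):
    step = chunk_size - overlap
    return [items[i:i + chunk_size] for i in range(0, len(items), step)]

words = "the quick brown fox jumps over the lazy dog".split()
chunks = chunk_with_overlap(words, chunk_size=4, overlap=1)
# First chunk: ['the', 'quick', 'brown', 'fox']; the second starts at 'fox'
```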
from langchain.vectorstores import Chroma

# Creating the Chroma vector DB and persisting it
def vector_db_pdf(pdf_path):
    # If PDFs are provided, build and persist the vector DB;
    # otherwise load it from the persist directory
    if len(pdf_path) > 0:
        documents = get_pdf_text(pdf_path)
        texts = get_text_chunks(documents)
        vector_db = Chroma.from_documents(documents=texts, embedding=getEmbedding(), persist_directory="storage")
    else:
        # Use from persist
        vector_db = Chroma(persist_directory="storage", embedding_function=getEmbedding())
    return vector_db

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

def retreival_qa_chain():
    retriever = vector_db.as_retriever(search_kwargs={"k": 3})
    hf_llm = HuggingFacePipeline(pipeline=llm, model_id="lmsys/fastchat-t5-3b-v1.0")
    qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff", retriever=retriever)
    return qa
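Conceptually, `as_retriever(search_kwargs={"k": 3})` scores every stored chunk against the query embedding and keeps the 3 closest. A minimal cosine-similarity sketch of that ranking (hypothetical helper names, not Chroma's actual code):

```python
import math

# Hypothetical sketch of top-k dense retrieval: rank document vectors by
# cosine similarity to the query vector and return the indices of the k best.
def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb)

def top_k(query_vec, doc_vecs, k=3):
    ranked = sorted(range(len(doc_vecs)),
                    key=lambda i: cosine(query_vec, doc_vecs[i]),
                    reverse=True)
    return ranked[:k]

docs = [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.5, 0.5]]
print(top_k([1.0, 0.0], docs, k=3))  # vectors closest to the query come first
```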

In the LLM pipeline I have tried parameters like early_stopping=False, setting min_new_tokens, and increasing max_new_tokens, but nothing seems to work. Kindly explain how these parameters affect the length of the output.
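My rough understanding of these knobs, hedged: `max_length` caps the total output sequence (for decoder-only models the prompt counts toward it; for encoder-decoder models like fastchat-t5 it caps the decoder output), `max_new_tokens` caps only the newly generated tokens, `min_new_tokens` forces a floor, and `early_stopping` only matters for beam search. A toy sketch of the budget arithmetic (hypothetical, simplified function, not the transformers implementation):

```python
# Toy sketch of how many NEW tokens generation may produce under each cap.
# Simplifying assumption: for decoder-only models the prompt counts toward
# max_length; max_new_tokens always counts new tokens only.
def new_token_budget(prompt_tokens, max_length=None, max_new_tokens=None, decoder_only=True):
    budgets = []
    if max_new_tokens is not None:
        budgets.append(max_new_tokens)
    if max_length is not None:
        used = prompt_tokens if decoder_only else 0
        budgets.append(max_length - used)
    # The tightest cap wins; None means unconstrained by these two settings
    return min(budgets) if budgets else None

# A 400-token stuffed prompt with max_length=512 leaves only 112 new tokens
# on a decoder-only model, so answers get cut off mid-sentence.
print(new_token_budget(400, max_length=512))      # 112
print(new_token_budget(400, max_new_tokens=256))  # 256
```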

Please access the full code here

Some extra info:
Input: a legal document of 8-10 pages
transformers==4.29.2, sentence-transformers==2.2.2, langchain==0.0.189, huggingface-hub==0.14.1
