import tempfile
import io
import os

import streamlit as st
from streamlit_chat import message
import torch
import torch.nn
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import pandas as pd
import numpy as np
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain


def pdf_loader(file_path):
    '''Load a PDF into LangChain documents (one document per page).

    Params:
        file_path: The path of the PDF file.
    '''
    loader = PyPDFLoader(file_path)
    pdf_file_as_loaded_docs = loader.load()
    return pdf_file_as_loaded_docs


def splitDoc(loaded_docs):
    '''Split the loaded document into overlapping chunks.

    Params:
        loaded_docs: The loaded documents returned by pdf_loader.
    '''
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_docs = splitter.split_documents(loaded_docs)
    return chunked_docs


def makeEmbeddings(chunked_docs):
    '''Embed the chunked documents and index them in a FAISS vector store.

    Params:
        chunked_docs: The chunked docs returned by splitDoc.
    '''
    embedder = HuggingFaceEmbeddings()
    # Build a FAISS-backed vector store from the embedded chunks
    vector_store = FAISS.from_documents(chunked_docs, embedder)
    return vector_store


def create_flan_t5_base(load_in_8bit=False):
    '''Load Flan-T5 Base as a Hugging Face pipeline for use with LangChain.'''
    model = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # model_kwargs are forwarded to from_pretrained / the model config:
        # load_in_8bit enables 8-bit quantization (requires bitsandbytes);
        # max_length and temperature become generation defaults on the config.
        model_kwargs={
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.0,
        },
    )
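
# --- Usage sketch (not part of the original script) ---
# A minimal, hypothetical example of wiring the helpers above into a
# retrieval-augmented QA chain. The file name "sample.pdf", the k=3
# retriever setting, and the sample question are illustrative assumptions,
# not values taken from the original code.
def _demo_qa_over_pdf(pdf_path="sample.pdf"):
    loaded_docs = pdf_loader(pdf_path)
    chunked_docs = splitDoc(loaded_docs)
    vector_store = makeEmbeddings(chunked_docs)

    # Wrap the raw Hugging Face pipeline so LangChain can treat it as an LLM.
    llm = HuggingFacePipeline(pipeline=create_flan_t5_base())

    # "stuff" chain type: the retrieved chunks are stuffed into one prompt.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    )
    return qa_chain.run("What is this document about?")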