# PDF_QA / helper.py
from transformers import AutoTokenizer, pipeline

from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def pdf_loader(file_path):
    '''This is a function for loading a PDF into LangChain Documents
    Params:
        file_path: The path of the PDF file
    '''
    loader = PyPDFLoader(file_path)
    pdf_file_as_loaded_docs = loader.load()
    return pdf_file_as_loaded_docs
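
# A minimal usage sketch (the file name below is a hypothetical placeholder,
# not part of the original app):
# loaded_docs = pdf_loader("example.pdf")  # PyPDFLoader yields one Document per page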


def splitDoc(loaded_docs):
    '''This is a function that creates the chunks of our loaded document
    Params:
        loaded_docs: The loaded document from the pdf_loader function
    '''
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_docs = splitter.split_documents(loaded_docs)
    return chunked_docs
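
# A minimal usage sketch building on pdf_loader (variable names are assumed):
# chunked_docs = splitDoc(loaded_docs)
# print(len(chunked_docs), chunked_docs[0].page_content[:80])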


def makeEmbeddings(chunked_docs):
    '''This is a function for making the embeddings of the chunked document
    Params:
        chunked_docs: The chunked docs
    '''
    embedder = HuggingFaceEmbeddings()
    vector_store = FAISS.from_documents(chunked_docs, embedder)  # build a FAISS-backed vector store
    return vector_store
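
# A minimal retrieval sketch (the query string is a hypothetical example):
# vector_store = makeEmbeddings(chunked_docs)
# hits = vector_store.similarity_search("What is this document about?", k=3)
# for doc in hits:
#     print(doc.page_content[:100])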


def create_flan_t5_base(load_in_8bit=False):
    '''Loads Flan-T5 base in the form of a pipeline'''
    # Wrap it in an HF pipeline for use with LangChain
    model = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        model_kwargs={"load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.0},
    )
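

# A minimal end-to-end sketch tying the helpers above into a RetrievalQA chain.
# `build_qa_chain` and its defaults are assumptions for illustration, not part
# of the original app; they show one plausible way to combine these functions.
def build_qa_chain(file_path, load_in_8bit=False):
    '''Builds a RetrievalQA chain over a single PDF (illustrative sketch).'''
    loaded_docs = pdf_loader(file_path)
    chunked_docs = splitDoc(loaded_docs)
    vector_store = makeEmbeddings(chunked_docs)
    # Wrap the transformers pipeline so LangChain can drive it as an LLM
    llm = HuggingFacePipeline(pipeline=create_flan_t5_base(load_in_8bit=load_in_8bit))
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # stuff all retrieved chunks into a single prompt
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    )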