import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tempfile

# Persist the Chroma vector store in a throwaway temporary directory
PERSIST_DIRECTORY = tempfile.mkdtemp()

# Load model and tokenizer
checkpoint = "MBZUAI/LaMini-Flan-T5-783M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint, device_map="cpu", torch_dtype=torch.float32
)  # 783M parameters; float32 weights need roughly 3 GB of RAM on CPU

# Define functions
def data_ingestion(file_path):
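    """Load a PDF, split it into overlapping chunks, embed the chunks with a
    SentenceTransformer model, and persist them to a Chroma vector store."""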
    loader = PDFMinerLoader(file_path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)  # overlap must stay below chunk_size
    texts = text_splitter.split_documents(documents)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
    db.persist()
    return db

def llm_pipeline():
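    """Wrap the local LaMini-Flan-T5 checkpoint in a text2text-generation
    pipeline so LangChain can use it as an LLM."""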
    pipe = pipeline(
        "text2text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    return local_llm

def qa_llm():
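    """Build a RetrievalQA chain over the persisted Chroma store, using the
    local pipeline as the LLM and a 'stuff' chain that packs the retrieved
    chunks into the prompt."""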
    llm = llm_pipeline()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
    retriever = vectordb.as_retriever()
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa

def process_answer(file, instruction):
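    """Gradio callback: index the uploaded PDF, then run the question
    through the retrieval QA chain and return the generated answer."""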
    # Ingest the data from the uploaded PDF
    data_ingestion(file.name)
    # Process the question
    qa = qa_llm()
    generated_text = qa(instruction)
    answer = generated_text["result"]
    return answer

# Define Gradio interface
iface = gr.Interface(
    fn=process_answer,
    inputs=[gr.File(label="PDF file"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer")
)

# Launch the interface
iface.launch()
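# launch() serves the UI locally (http://127.0.0.1:7860 by default);
# pass share=True, i.e. iface.launch(share=True), for a temporary public URL.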