File size: 2,919 Bytes
c33d1d0
018fb30
842c848
 
018fb30
f7493dd
395e58c
e5633a7
c33d1d0
b08204e
1f5e9cb
18cb8f3
3f68bf3
 
 
142d17f
b08204e
 
 
6976271
b08204e
 
 
 
 
 
 
 
 
 
 
 
8a91ca0
 
 
 
b08204e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4dfc79
b08204e
 
 
70bd277
64ff975
70bd277
 
b08204e
 
ea07eae
b08204e
142d17f
68b31c9
 
 
 
b08204e
68b31c9
018fb30
037c950
 
68b31c9
 
018fb30
 
037c950
842c848
018fb30
 
 
c33d1d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import gradio as gr
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

# Load environment variables
#load_dotenv()


# Print the current working directory. The PDF path used later in this script
# is relative, so it is resolved against this directory.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

def get_pdf_text(pdf_docs):
    """
    Extract text from one or more PDF documents.

    Parameters
    ----------
    pdf_docs : object or list or tuple
        Either a single PDF source that is handed to ``PdfReader`` unchanged
        (for example a file path), or a list/tuple of such sources.

    Returns
    -------
    str
        Text of every page of every document, concatenated in order with no
        separator inserted between pages or documents. An empty list or
        tuple yields an empty string.

    """
    # Only list/tuple is treated as "many documents"; anything else keeps the
    # original single-document behaviour and goes straight to PdfReader.
    if isinstance(pdf_docs, (list, tuple)):
        documents = pdf_docs
    else:
        documents = [pdf_docs]

    page_texts = []
    for document in documents:
        pdf_reader = PdfReader(document)
        # Defensive: treat a falsy extraction result as empty text so a page
        # without extractable text cannot break the concatenation.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    return "".join(page_texts)


def get_text_chunks(text, *, chunk_size=1500, chunk_overlap=300):
    """
    Split the input text into overlapping chunks on newline boundaries.

    Parameters
    ----------
    text : str
        The input text to be split.
    chunk_size : int, optional
        Target maximum chunk length, measured with ``len`` (characters).
        Defaults to 1500.
    chunk_overlap : int, optional
        Number of characters shared between consecutive chunks.
        Defaults to 300.

    Returns
    -------
    list
        List of text chunks. Empty when ``text`` is empty or ``None``.

    """
    # Nothing to split: skip building the splitter altogether.
    if not text:
        return []

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)


def get_vectorstore(
    text_chunks,
    *,
    model_name="BAAI/bge-base-en-v1.5",
    device="cpu",
    debug_query="What is ALiBi?",
):
    """
    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded. Must not be empty.
    model_name : str, optional
        Name of the embedding model passed to ``HuggingFaceBgeEmbeddings``.
        Defaults to ``"BAAI/bge-base-en-v1.5"``.
    device : str, optional
        Device forwarded to the model through ``model_kwargs``.
        Defaults to ``"cpu"``.
    debug_query : str or None, optional
        Query run against the new store and printed between ``-----``
        markers as a quick sanity check. Pass ``None`` to skip the check.
        Defaults to ``"What is ALiBi?"``.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.

    Raises
    ------
    ValueError
        If ``text_chunks`` is empty.

    """
    # Fail early and clearly, before the embedding model is loaded: an index
    # cannot be built from zero texts (presumably this would otherwise
    # surface as an opaque low-level error inside FAISS.from_texts).
    if not text_chunks:
        raise ValueError("text_chunks is empty; no text available to index")

    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": device},
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    if debug_query is not None:
        print("-----")
        print(vectorstore.similarity_search(debug_query))
        print("-----")
    return vectorstore
    
# Path to the PDF to index, relative to the current working directory.
# Change it to point at your own file.
pdf_path = r"new_papers/ALiBi.pdf"
pdf_text = get_pdf_text(pdf_path)

# Build the index once at import time: extracted text -> chunks -> FAISS store.
# `api_db` is the store that pdf_retrieval() queries.
text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)

 

# Define the PDF retrieval function
def pdf_retrieval(query):
    """
    Look up the indexed PDF content most similar to a query.

    Parameters
    ----------
    query : str
        Query text forwarded to the module-level ``api_db`` vector store.

    Returns
    -------
    object
        Whatever ``api_db.similarity_search`` returns for ``query``
        (presumably a list of matching documents).

    """
    matches = api_db.similarity_search(query)
    # Echo the raw result to stdout as well as returning it.
    print(matches)
    return matches

# Gradio front end for the retriever: one text box for the query, one text box
# for the result produced by pdf_retrieval.
api_tool = gr.Interface(
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    # live mode: the interface re-runs as the input changes instead of
    # waiting for an explicit submit.
    live=True,
)

# Start serving the interface.
api_tool.launch()