import os
import gradio as gr
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
# Load environment variables
#load_dotenv()
# Log the working directory so the relative PDF path below
# (new_papers/ALiBi.pdf) can be debugged when the script is run
# from an unexpected location.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)
def get_pdf_text(pdf_docs):
    """
    Extract text from one or more PDF documents.

    Parameters
    ----------
    pdf_docs : str or list
        A single PDF file path (or file-like object), or a list of them.
        The original implementation documented a list but only handled a
        single document; both now work.

    Returns
    -------
    str
        Concatenated text extracted from every page of every document.
    """
    # Normalize the single-document case so one loop covers both inputs.
    if not isinstance(pdf_docs, (list, tuple)):
        pdf_docs = [pdf_docs]
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages without a text layer;
            # guard so concatenation never raises TypeError.
            text += page.extract_text() or ""
    return text
def get_text_chunks(text):
    """
    Split raw text into overlapping chunks suitable for embedding.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        Chunks of at most 1500 characters, split on newlines, with a
        300-character overlap between consecutive chunks.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1500,
        chunk_overlap=300,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    """
    Generate a FAISS vector store from text chunks using HuggingFace BGE embeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.
    """
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Smoke-test the freshly built index. The original called
    # vectorstore.as_retriever.similarity(...), which raises AttributeError:
    # as_retriever is a method (it was never called) and retrievers expose
    # get_relevant_documents, not .similarity. Query the store directly.
    print("-----")
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore
# Build the vector index at import time: read the paper, chunk it, embed it.
# Path is relative to the working directory printed above.
pdf_path = r"new_papers/ALiBi.pdf"
api_db = get_vectorstore(get_text_chunks(get_pdf_text(pdf_path)))
# Define the PDF retrieval function
def pdf_retrieval(query):
    """
    Retrieve the documents most similar to *query* from the module-level
    vector store.

    Parameters
    ----------
    query : str
        Natural-language question to search for.

    Returns
    -------
    list
        Matching documents, most similar first.
    """
    results = api_db.similarity_search(query)
    print(results)  # echoed to stdout for debugging
    return results
# Wire the retriever into a minimal Gradio UI and serve it.
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description=(
        "This tool indexes PDF documents and retrieves relevant answers "
        "based on a given query (HuggingFaceBgeEmbeddings)."
    ),
)
api_tool.launch()