SemanticSearch / app.py
medelharchaoui's picture
Update app.py
9111075
import streamlit as st
import tempfile
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
# Checkpoint name shared by the token splitter and the embedding wrapper below.
model_name = "dangvantuan/sentence-camembert-large"

# Keyword arguments forwarded to HuggingFaceEmbeddings: the model is placed on
# the CPU and encoding is asked to return tensors.
model_kwargs = dict(device="cpu")
encode_kwargs = dict(convert_to_tensor=True)

# Token-based chunker: 380 tokens per chunk, 100 tokens of overlap between
# neighbouring chunks, counted with the tokenizer of `model_name`.
splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=100,
    tokens_per_chunk=380,
    model_name=model_name,
)

# Embedding function handed to the FAISS vector store when indexing/searching.
embeddings_fun = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)
def read_pdf(file):
    """Load an uploaded PDF into LangChain documents.

    The upload only exists in memory, while ``PyPDFLoader`` is constructed
    from a filesystem path, so the bytes are first spilled to a temporary
    file. That file is removed again once loading has finished (or failed).

    Args:
        file: Object exposing ``getvalue()`` that returns the raw PDF bytes
            (in this script, the value returned by ``st.file_uploader``).

    Returns:
        The result of ``PyPDFLoader(path).load()`` for the uploaded file.
    """
    # Local imports keep this function self-contained; they are only needed
    # for the temp-file cleanup.
    import contextlib
    import os

    # delete=False so the path stays valid after the handle is closed; the
    # file is removed explicitly below instead.
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp.write(file.getvalue())

    # The handle is closed at this point, so every byte has been flushed to
    # disk before the loader opens the path.
    try:
        return PyPDFLoader(temp.name).load()
    finally:
        # Best-effort cleanup: a failed delete must not hide the loaded
        # documents or the loader's own exception.
        with contextlib.suppress(OSError):
            os.unlink(temp.name)
# --- Streamlit page: upload a PDF, ask a question, show the best chunk ------
st.title('PDF Text Extractor')
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
query = st.text_input("Entrer une question")
st.text('La reponse à votre question:')

if uploaded_file is None:
    st.write("file not uploaded correctly")
elif not query or not query.strip():
    # No question typed yet: skip the costly embedding/indexing work instead
    # of searching for an empty string.
    st.write("Enter a question to search the document.")
else:
    raw_documents = read_pdf(uploaded_file)
    documents = splitter.split_documents(raw_documents)
    if not documents:
        # Nothing to index (e.g. a PDF without extractable text).
        # NOTE(review): FAISS.from_documents is expected to fail on an empty
        # list - confirm against the installed langchain version.
        st.write("No text could be extracted from this PDF.")
    else:
        # NOTE(review): the index is rebuilt every time this script runs;
        # consider caching it per uploaded file if that becomes too slow.
        faiss_db = FAISS.from_documents(documents, embeddings_fun)
        docs = faiss_db.similarity_search(query)
        if docs:
            # Show only the first returned chunk.
            st.write(docs[0].page_content)
        else:
            # Guard against indexing an empty result list.
            st.write("No matching passage found.")