"""Streamlit app: upload a PDF, ask a question, show the best-matching passage.

The uploaded PDF is split into token-bounded chunks, embedded with a French
sentence-embedding model, indexed in FAISS, and the chunk most similar to the
user's question is displayed.
"""

import os
import tempfile

import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.vectorstores import FAISS

# Embedding model shared by the splitter (token counting) and the embedder.
model_name = "dangvantuan/sentence-camembert-large"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'convert_to_tensor': True}

splitter = SentenceTransformersTokenTextSplitter(
    model_name=model_name,
    tokens_per_chunk=380,
    chunk_overlap=100,
)

embeddings_fun = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


def read_pdf(file):
    """Load an uploaded PDF into a list of LangChain documents.

    Args:
        file: An uploaded-file object exposing ``getvalue()`` that returns the
            raw PDF bytes (as returned by ``st.file_uploader``).

    Returns:
        The documents produced by ``PyPDFLoader(...).load()``.
    """
    # PyPDFLoader needs a filesystem path, so spill the upload to a temp file.
    # Leaving the ``with`` block closes the handle, which flushes the bytes to
    # disk and lets the loader reopen the path (required on Windows).
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp.write(file.getvalue())
    try:
        return PyPDFLoader(temp.name).load()
    finally:
        # delete=False means nobody else cleans this up; remove it ourselves
        # so repeated uploads do not accumulate files on disk.
        os.remove(temp.name)


st.title('PDF Text Extractor')

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
query = st.text_input("Entrer une question")

st.text('La reponse à votre question:')

if uploaded_file is not None:
    if not query.strip():
        # Nothing to search for yet; skip the expensive embedding/indexing.
        st.warning("Please enter a question.")
    else:
        raw_documents = read_pdf(uploaded_file)
        documents = splitter.split_documents(raw_documents)
        if not documents:
            # E.g. a scanned PDF with no text layer: there is nothing to index.
            st.warning("No text could be extracted from this PDF.")
        else:
            faiss_db = FAISS.from_documents(documents, embeddings_fun)
            docs = faiss_db.similarity_search(query)
            if docs:
                st.write(docs[0].page_content)
            else:
                st.warning("No matching passage found.")
else:
    st.write("file not uploaded correctly")