File size: 1,606 Bytes
46d9645
fd18c44
0f14a07
26ba5f9
 
 
 
 
 
 
9111075
 
 
 
 
 
 
 
 
 
 
 
e198633
46d9645
 
fd18c44
 
 
 
 
46d9645
 
 
 
973762f
 
 
 
 
46d9645
e198633
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import tempfile

import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.vectorstores import FAISS


# One checkpoint name is shared by the token-based splitter and the embedding
# wrapper, so both are configured from the same model identifier.
model_name = "dangvantuan/sentence-camembert-large"

# Extra keyword arguments forwarded to HuggingFaceEmbeddings below.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(convert_to_tensor=True)

# Splits documents into chunks of 380 tokens with a 100-token overlap.
splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=100,
    tokens_per_chunk=380,
    model_name=model_name,
)

# Embedding object handed to the FAISS vector store when indexing the chunks.
embeddings_fun = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)


def read_pdf(file):
    """Load an uploaded PDF through ``PyPDFLoader``.

    ``PyPDFLoader`` is constructed from a filesystem path, so the uploaded
    bytes are first written to a temporary file on disk. The temporary file
    is closed before the loader reads it and is removed afterwards.

    Args:
        file: Uploaded file object exposing ``getvalue()``, which returns the
            PDF content as bytes.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for the uploaded PDF.
    """
    # delete=False: the file must outlive its handle so the loader can reopen
    # it by path; it is removed explicitly in the ``finally`` clause below.
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Leaving the ``with`` block closes the handle, which flushes buffered
        # bytes to disk before the loader opens the path.
        with temp:
            temp.write(file.getvalue())
        return PyPDFLoader(temp.name).load()
    finally:
        # Always clean up so repeated uploads do not accumulate temp files.
        os.remove(temp.name)

# Streamlit page flow: upload a PDF, type a question, and display the text of
# the chunk that is most similar to the question.
st.title('PDF Text Extractor')

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

query = st.text_input("Entrer une question")

st.text('La reponse à votre question:')

if uploaded_file is None:
    st.write("file not uploaded correctly")
elif not query or not query.strip():
    # The text input is empty until the user types something; skip the
    # indexing and search instead of querying with an empty question.
    st.write("Veuillez entrer une question")
else:
    raw_documents = read_pdf(uploaded_file)
    documents = splitter.split_documents(raw_documents)

    if not documents:
        # Building a FAISS index from an empty list fails, so report the
        # empty extraction instead of crashing the page.
        st.write("Aucun texte n'a pu être extrait de ce PDF")
    else:
        faiss_db = FAISS.from_documents(documents, embeddings_fun)
        docs = faiss_db.similarity_search(query)

        # Guard the index access so an empty result cannot raise IndexError.
        if docs:
            st.write(docs[0].page_content)
        else:
            st.write("Aucun passage pertinent trouvé")