medelharchaoui committed on
Commit
26ba5f9
1 Parent(s): e274ccf

update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -7
app.py CHANGED
@@ -1,16 +1,39 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfFileReader
 
 
 
 
 
 
3
 
def read_pdf(file):
    """Return the extracted text of every page of *file*, joined in page order.

    *file* is handed straight to ``PdfFileReader``; the text of each page is
    obtained with ``extractText()`` and the pieces are concatenated without
    any separator.
    """
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(index).extractText()
        for index in range(reader.getNumPages())
    )
    return "".join(page_texts)
10
 
# Page heading.
st.title('PDF Text Extractor')

# Upload widget restricted to PDF files; the rest of the page is only
# rendered once a file has actually been provided.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
file_was_provided = uploaded_file is not None
if file_was_provided:
    # Extract the text of the whole document and display it as-is.
    text = read_pdf(uploaded_file)
    st.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from langchain.document_loaders import PyPDFLoader
3
+
4
+ from langchain.text_splitter import SentenceTransformersTokenTextSplitter
5
+ from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+
8
+
9
 
def read_pdf(file):
    """Load a PDF with ``PyPDFLoader`` and return the resulting documents.

    ``PyPDFLoader`` expects a filesystem path, whereas the uploader widget in
    this app hands over an in-memory file-like object. In-memory uploads are
    therefore written to a temporary ``.pdf`` file first, which is removed
    again once loading has finished.

    Args:
        file: Either a path to a PDF (``str`` / ``os.PathLike``) or a binary
            file-like object exposing ``getvalue()`` or ``read()``.

    Returns:
        Whatever ``PyPDFLoader.load()`` returns for the document.

    Note:
        For uploaded files, any source path recorded by the loader refers to
        the temporary file, not to the original upload name.
    """
    import os
    import tempfile

    # A real path can be passed to the loader directly.
    if isinstance(file, (str, os.PathLike)):
        return PyPDFLoader(os.fspath(file)).load()

    # Prefer getvalue(): it returns the full buffer regardless of the current
    # read position, which read() would not if the stream was already consumed.
    data = file.getvalue() if hasattr(file, "getvalue") else file.read()

    # delete=False + explicit removal so the file is closed before the loader
    # reopens it (an open NamedTemporaryFile cannot be reopened on Windows).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(data)
        tmp_path = tmp.name
    try:
        return PyPDFLoader(tmp_path).load()
    finally:
        os.remove(tmp_path)
 
 
14
 
# Page heading.
st.title('PDF Text Extractor')

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    # Bind the loaded documents to the name the splitter below actually uses
    # (previously the result was stored in `text`, leaving `raw_documents`
    # undefined and raising NameError on the first upload).
    raw_documents = read_pdf(uploaded_file)

    # Split the documents into token-bounded chunks using the same model that
    # is used for the embeddings, with 50 tokens of overlap between chunks.
    splitter = SentenceTransformersTokenTextSplitter(
        model_name='dangvantuan/sentence-camembert-large',
        chunk_overlap=50,
    )
    documents = splitter.split_documents(raw_documents)

    if not documents:
        # Nothing to index (e.g. a PDF without extractable text); building a
        # vector store from an empty list would fail.
        st.warning('No text could be extracted from this PDF.')
    else:
        embeddings_fun = HuggingFaceEmbeddings(
            model_name='dangvantuan/sentence-camembert-large'
        )
        faiss_db = FAISS.from_documents(documents, embeddings_fun)

        query = st.text_input("Entrer une question")
        # The text input is empty until the user types something; searching
        # for "" would present an arbitrary chunk as the answer.
        if query:
            docs = faiss_db.similarity_search(query)
            if docs:
                st.text('La reponse à votre question:')
                # Show the chunk ranked most similar to the question.
                st.write(docs[0].page_content)