rushi29 committed on
Commit
8dba09e
1 Parent(s): de6e57d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -38
app.py CHANGED
@@ -7,6 +7,10 @@ nltk.download('punkt')
7
  import bs4 as bs
8
  import urllib.request
9
  import re
 
 
 
 
10
  # Create embeddings model, backed by sentence-transformers & transformers
11
  embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
12
 
@@ -20,25 +24,6 @@ st.markdown('_Welecome to Question Answering System 🧠 🤖_')
20
 
21
  a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])
22
 
23
- def my_function_pdf():
24
- textract = Textractor(sentences=True)
25
-
26
- data_lines = []
27
- for i in (locations_max):
28
- lines = textract(i)
29
- data_lines.append(lines)
30
- total_lines = []
31
- for i in data_lines:
32
- total_lines += i
33
- seq = embeddings.similarity(quer, total_lines)
34
- three_most = seq[0:3]
35
- indexes = []
36
- for i in three_most:
37
- indexes.append(i[0])
38
- for j in indexes:
39
- st.write(total_lines[j])
40
-
41
-
42
  ## webscrap function
43
  def my_web():
44
  from txtai.pipeline import Textractor
@@ -69,28 +54,63 @@ def my_web():
69
 
70
 
71
 
 
72
  if a == 'PDF' :
73
- uploaded_files = st.file_uploader("Choose a CSV file", accept_multiple_files=True)
74
- locations_max = []
75
- for uploaded_file in uploaded_files:
76
- # st.write(uploaded_file.name)
77
- locations_max.append(uploaded_file.name)
78
 
79
-
80
- # for query
81
  quer = st.text_input('ask me anything!', placeholder = 'ex - what is AI?')
82
- st.write('Your query is - ', quer)
83
-
84
-
85
- # for textraction
86
- if st.button('Confirm!'):
87
- st.write('Confirmed')
88
- my_function_pdf()
89
- else:
90
- st.write('')
91
-
92
-
93
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ## web
95
  else:
96
  number = st.number_input('Insert a number of Links -',value =1, step =1)
 
7
  import bs4 as bs
8
  import urllib.request
9
  import re
10
+ import docx2txt
11
+ from PyPDF2 import PdfReader
12
+ from nltk import tokenize
13
+
14
  # Create embeddings model, backed by sentence-transformers & transformers
15
  embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
16
 
 
24
 
25
  a = st.sidebar.radio("SELECT -", ['PDF', 'Website'])
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  ## webscrap function
28
  def my_web():
29
  from txtai.pipeline import Textractor
 
54
 
55
 
56
 
57
+
58
  if a == 'PDF' :
59
+ uploaded_files = st.file_uploader("Upload files - ", accept_multiple_files=True ,
60
+ type = ['pdf', 'docx' , 'txt'] )
 
 
 
61
 
62
+ # for query
63
+
64
  quer = st.text_input('ask me anything!', placeholder = 'ex - what is AI?')
65
+ st.write('Your query is - ', quer)
66
+
67
+ if st.button("Process"):
68
+
69
+ for uploaded_file in uploaded_files:
70
+ if uploaded_file is not None:
71
+ file_details = {"Filename":uploaded_file.name,"FileType":uploaded_file.type,"FileSize":uploaded_file.size}
72
+ #st.write(file_details)
73
+
74
+ if uploaded_file.type == "text/plain":
75
+ raw_text = str(uploaded_file.read(),"utf-8")
76
+ st.write(raw_text)
77
+
78
+ elif uploaded_file.type == "application/pdf" :
79
+ reader = PdfReader(uploaded_file)
80
+ text = ""
81
+ for page in reader.pages:
82
+ text += page.extract_text() + "\n"
83
+ #st.write(text)
84
+
85
+ data_lines = tokenize.sent_tokenize(text)
86
+ #st.write(data_lines)
87
+
88
+ seq = embeddings.similarity(quer, data_lines)
89
+ three_most = seq[0:3]
90
+ indexes = []
91
+ for i in three_most:
92
+ indexes.append(i[0])
93
+ for j in indexes:
94
+ st.write(data_lines[j])
95
+
96
+
97
+ #total_lines = []
98
+ #for i in data_lines:
99
+ #total_lines += i
100
+
101
+ #st.write(data_lines)
102
+
103
+ #try:
104
+ #with pdfplumber.open(uploaded_file) as pdf:
105
+ #pages = pdf.pages[0]
106
+ #st.write(pages.extract_text())
107
+ #except:
108
+ #st.write("None")
109
+
110
+ elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" :
111
+ raw_text = docx2txt.process(uploaded_file)
112
+ st.write(raw_text)
113
+
114
  ## web
115
  else:
116
  number = st.number_input('Insert a number of Links -',value =1, step =1)