Gladiator committed on
Commit
12c89e3
1 Parent(s): 3ebd8ef

added support for .txt, .docx, .pdf files

Browse files
Files changed (2) hide show
  1. test.py +26 -0
  2. utils.py +34 -4
test.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import docx2txt
2
+ import streamlit as st
3
+ from io import StringIO
4
+ from PyPDF2 import PdfFileReader
5
+
6
+
7
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF source handed straight to ``PdfFileReader``
            (presumably an uploaded file-like object -- confirm with caller).

    Returns:
        The text of all pages, concatenated in page order with no separator.
    """
    reader = PdfFileReader(file)
    return "".join(
        reader.getPage(page_index).extractText()
        for page_index in range(reader.numPages)
    )
16
+
17
+
18
if __name__ == "__main__":
    # Small manual test page: upload a .docx file and display its MIME type
    # followed by the text extracted from it.
    st.header("Testing file uploads")

    uploaded_file = st.file_uploader("Upload a file here")

    # The uploader yields None until the user has actually selected a file,
    # so only touch the upload once it exists; otherwise the first render
    # fails with AttributeError on ``uploaded_file.type``.
    if uploaded_file is not None:
        st.write(uploaded_file.type)
        docx_text = docx2txt.process(uploaded_file)

        st.write(docx_text)
utils.py CHANGED
@@ -1,5 +1,9 @@
1
  import re
2
  import requests
 
 
 
 
3
  from bs4 import BeautifulSoup
4
  from nltk.tokenize import sent_tokenize
5
 
@@ -98,10 +102,36 @@ def preprocess_text_for_abstractive_summarization(tokenizer, text):
98
  return chunks
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
101
  def read_text_from_file(file):
102
 
103
- # txt_file = open(file, "r")
104
- file_text = file.read()
105
- # txt_file.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- return file_text
 
1
  import re
2
  import requests
3
+ import docx2txt
4
+ from io import StringIO
5
+ from PyPDF2 import PdfFileReader
6
+
7
  from bs4 import BeautifulSoup
8
  from nltk.tokenize import sent_tokenize
9
 
 
102
  return chunks
103
 
104
 
105
def read_pdf(file):
    """Return the concatenated text of all pages in a PDF.

    Args:
        file: The PDF source passed unchanged to ``PdfFileReader``
            (presumably an uploaded file-like object -- confirm with caller).

    Returns:
        A single string holding each page's extracted text, in page order,
        joined without any separator.
    """
    pdf = PdfFileReader(file)
    page_total = pdf.numPages
    page_texts = (pdf.getPage(number).extractText() for number in range(page_total))
    return "".join(page_texts)
114
+
115
+
116
  def read_text_from_file(file):
117
 
118
+ # read text file
119
+ if file.type == "text/plain":
120
+ # To convert to a string based IO:
121
+ stringio = StringIO(file.getvalue().decode("utf-8"))
122
+
123
+ # To read file as string:
124
+ file_content = stringio.read()
125
+
126
+ # read pdf file
127
+ elif file.type == "application/pdf":
128
+ file_content = read_pdf(file)
129
+
130
+ # read docx file
131
+ elif (
132
+ file.type
133
+ == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
134
+ ):
135
+ file_content = docx2txt(file)
136
 
137
+ return file_content