king007 committed on
Commit
c6e29e8
1 Parent(s): adadda8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -11
app.py CHANGED
@@ -17,6 +17,10 @@ def preprocess(text):
17
  text = re.sub('\s+', ' ', text)
18
  return text
19
 
 
 
 
 
20
 
21
  def pdf_to_text(path, start_page=1, end_page=None):
22
  doc = fitz.open(path)
@@ -26,21 +30,33 @@ def pdf_to_text(path, start_page=1, end_page=None):
26
  end_page = total_pages
27
 
28
  text_list = []
29
-
 
 
 
 
30
  for i in range(start_page-1, end_page):
31
  text = doc.load_page(i).get_text("text")
32
  text = preprocess(text)
33
  text_list.append(text)
34
-
 
35
  doc.close()
36
- return text_list
 
 
 
37
 
38
 
39
  def text_to_chunks(texts, word_length=150, start_page=1):
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
-
 
 
 
 
44
  for idx, words in enumerate(text_toks):
45
  for i in range(0, len(words), word_length):
46
  chunk = words[i:i+word_length]
@@ -51,6 +67,10 @@ def text_to_chunks(texts, word_length=150, start_page=1):
51
  chunk = ' '.join(chunk).strip()
52
  chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
53
  chunks.append(chunk)
 
 
 
 
54
  return chunks
55
 
56
 
@@ -93,10 +113,11 @@ class SemanticSearch:
93
 
94
  def load_recommender(path, start_page=1):
95
  global recommender
96
- texts = pdf_to_text(path, start_page=start_page)
97
- chunks = text_to_chunks(texts, start_page=start_page)
 
98
  recommender.fit(chunks)
99
- return 'Corpus Loaded.'
100
 
101
 
102
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
@@ -142,19 +163,22 @@ def question_answer(url, file, question,openAI_key):
142
 
143
  if url.strip() != '' and file != None:
144
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
145
-
 
146
  if url.strip() != '':
147
  glob_url = url
148
  download_pdf(glob_url, 'corpus.pdf')
149
- load_recommender('corpus.pdf')
150
 
151
  else:
152
  old_file_name = file.name
153
  file_name = file.name
154
  file_name = file_name[:-12] + file_name[-4:]
155
  os.rename(old_file_name, file_name)
156
- load_recommender(file_name)
157
-
 
 
158
  if question.strip() == '':
159
  return '[ERROR]: Question field is empty'
160
 
 
17
  text = re.sub('\s+', ' ', text)
18
  return text
19
 
20
def word_count0(str):
    """Return the number of whitespace-separated words in ``str``.

    The text is split with ``split()`` and no explicit separator, so any run
    of whitespace acts as a single delimiter and an empty or all-whitespace
    string counts as zero words.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so the
    # signature stays exactly what existing callers see.
    return len(str.split())
24
 
25
  def pdf_to_text(path, start_page=1, end_page=None):
26
  doc = fitz.open(path)
 
30
  end_page = total_pages
31
 
32
  text_list = []
33
+ #
34
+ text_len = 0
35
+ #
36
+ pdf_parse_status = 1
37
+ #
38
  for i in range(start_page-1, end_page):
39
  text = doc.load_page(i).get_text("text")
40
  text = preprocess(text)
41
  text_list.append(text)
42
+ #
43
+ text_len = text_len + word_count0(text)
44
  doc.close()
45
+ if(text_len>10):
46
+ pdf_parse_status = 0
47
+ return [], pdf_parse_status
48
+ return text_list, pdf_parse_status
49
 
50
 
51
  def text_to_chunks(texts, word_length=150, start_page=1):
52
  text_toks = [t.split(' ') for t in texts]
53
  page_nums = []
54
  chunks = []
55
+ #
56
+ text_len = 0
57
+ #
58
+ pdf_parse_status = 1
59
+ #
60
  for idx, words in enumerate(text_toks):
61
  for i in range(0, len(words), word_length):
62
  chunk = words[i:i+word_length]
 
67
  chunk = ' '.join(chunk).strip()
68
  chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
69
  chunks.append(chunk)
70
+ text_len = text_len + word_count0(chunk)
71
+ if(text_len>10):
72
+ pdf_parse_status = 0
73
+ return [], pdf_parse_status
74
  return chunks
75
 
76
 
 
113
 
114
  def load_recommender(path, start_page=1):
115
  global recommender
116
+ pdf_parse_status = 1
117
+ texts, pdf_parse_status = pdf_to_text(path, start_page=start_page)
118
+ chunks, pdf_parse_status = text_to_chunks(texts, start_page=start_page)
119
  recommender.fit(chunks)
120
+ return 'Corpus Loaded.', pdf_parse_status
121
 
122
 
123
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
 
163
 
164
  if url.strip() != '' and file != None:
165
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
166
+ #
167
+ pdf_parse_status = 1
168
  if url.strip() != '':
169
  glob_url = url
170
  download_pdf(glob_url, 'corpus.pdf')
171
+ load_resp, pdf_parse_status = load_recommender('corpus.pdf')
172
 
173
  else:
174
  old_file_name = file.name
175
  file_name = file.name
176
  file_name = file_name[:-12] + file_name[-4:]
177
  os.rename(old_file_name, file_name)
178
+ load_resp, pdf_parse_status = load_recommender(file_name)
179
+ #
180
+ if pdf_parse_status == 0:
181
+ return 'CODE:1004, MSG:PDF FILE TOO LARGE'
182
  if question.strip() == '':
183
  return '[ERROR]: Question field is empty'
184