wjjessen committed on
Commit
a95a714
1 Parent(s): b8f16a6

update code

Files changed (1)
  1. app.py +16 -5
app.py CHANGED
@@ -38,9 +38,10 @@ def file_preprocessing(file, skipfirst, skiplast):
     else:
         pages = pages
     print("")
-    print("# pages after loop ##########")
+    print("# pages after skip(s) ##########")
     print("")
     print(pages)
+    print("")
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000, # number of characters
         chunk_overlap=100,
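Aside (not part of the commit): a minimal sketch of how the splitter configured above behaves, assuming langchain's RecursiveCharacterTextSplitter API; the sample string and the split_text call are illustrative only, not from app.py.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# same settings as in file_preprocessing: chunks of at most 1000 characters,
# with 100 characters of overlap between neighboring chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
sample = "lorem ipsum " * 300                   # roughly 3,600 characters of dummy text
chunks = splitter.split_text(sample)
print(len(chunks), [len(c) for c in chunks])    # every chunk should be <= 1000 characters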
@@ -49,16 +50,26 @@ def file_preprocessing(file, skipfirst, skiplast):
     )
     # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
     texts = text_splitter.split_documents(pages)
+    print("Number of tokens:" + str(len(texts)))
+    print("")
+    print("First three tokens:")
+    print(texts[0])
+    print("")
+    print(texts[1])
+    print("")
+    print(texts[2])
+    print("")
     final_texts = ""
     for text in texts:
         final_texts = final_texts + text.page_content
     return final_texts


+# function to count words in the input
 def preproc_count(filepath, skipfirst, skiplast):
     input_text = file_preprocessing(filepath, skipfirst, skiplast)
     text_length = len(input_text)
-    print("Preproc input word count: %s" %(text_length))
+    print("Input word count: " f"{text_length:,}")
     return input_text, text_length

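Aside (not part of the commit): the new debug prints label the split results "tokens" and the reported length a "word count"; split_documents actually returns langchain Document chunks, and len() on a string counts characters. A hedged sketch, using an illustrative Document rather than a real PDF page:

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

pages = [Document(page_content="some PDF page text " * 100)]   # stand-in for the loaded pages
texts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(pages)
print(len(texts))                  # number of Document chunks, not model tokens
final_texts = "".join(t.page_content for t in texts)
print(len(final_texts))            # character count (what len() returns for a string)
print(len(final_texts.split()))    # a whitespace-based word count, for comparison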
@@ -80,9 +91,10 @@ def llm_pipeline(tokenizer, base_model, input_text, model_source):
     return summary


+# function to count words in the summary
 def postproc_count(summary):
     text_length = len(summary)
-    print("Postproc summary word count: %s" %(text_length))
+    print("Summary word count: " f"{text_length:,}")
     return text_length


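Aside (not part of the commit): the rewritten prints rely on adjacent string literals being concatenated and on the "," format specifier; a tiny standalone example:

n = 12345
print("Summary word count: " f"{n:,}")    # adjacent literals join into one string -> Summary word count: 12,345
print("Summary word count: %s" % (n,))    # the older %-style formatting being replaced (no thousands separator)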
@@ -143,7 +155,6 @@ def main():
         truncation=True,
         legacy=False,
         model_max_length=1000,
-        #cache_dir="model_cache"
     )
     if model_source == "Download model":
         base_model = AutoModelForSeq2SeqLM.from_pretrained(
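Aside (not part of the commit): the dropped comment refers to transformers' cache_dir option, which controls where downloaded weights are stored; a hedged sketch with a placeholder checkpoint id, not necessarily the one app.py loads:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "google/flan-t5-base"   # placeholder seq2seq model id
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir="model_cache")
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir="model_cache")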
@@ -185,13 +196,13 @@ def main():
        postproc_text_length = postproc_count(summary)
        end = time.time()
        duration = end - start
+       print("Duration: " f"{duration:.0f}" + " seconds")
        st.info(
            "PDF Summary  |  Number of words: "
            f"{postproc_text_length:,}"
            + "  |  Summarization time: "
            f"{duration:.0f}" + " seconds"
        )
-       #st.code("\n".join(tw.wrap(summary, width=80)), language='md')
        st.success(summary)


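Aside (not part of the commit): the commented-out line deleted in this hunk wrapped the summary to 80 columns and rendered it with st.code; the commit keeps st.success as the only display. A sketch assuming textwrap is imported as tw and summary holds the generated text:

import textwrap as tw
import streamlit as st

summary = "An example summary string produced by the model."
st.code("\n".join(tw.wrap(summary, width=80)), language="md")  # the dropped display style
st.success(summary)                                            # the display the commit keeps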
 