arithescientist committed on
Commit
cf3e244
·
1 Parent(s): 17e34a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -77
app.py CHANGED
@@ -29,91 +29,77 @@ def pdf(file):
29
  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
30
  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
31
  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
32
- print('Using model {}\n'.format(model_name))
33
-
34
- list_of_files = file
35
-
36
-
37
- print("\nProcessing {} files...\n".format(len(list_of_files)))
38
- total_pages = 0
39
-
40
- for filename in list_of_files:
41
- print(filename)
42
- file = os.path.splitext(os.path.basename(filename))[0]
43
- pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
44
- total_pages += len(pages)
45
- print("\nProcessing the next {} pages...\n".format(len(pages)))
46
-
47
- # Then save all pages as images and convert them to text except the last page
48
- # TODO: create this as a function
49
- content = ""
50
- dir_name = 'images/' + file + '/'
51
- os.makedirs(dir_name, exist_ok=True)
52
- # If folder doesn't exist, then create it.
53
- for i in range(len(pages)-1):
54
- pages[i].save(dir_name + str(i) + '.jpg')
55
- # OCR the image using Google's tesseract
56
- content += pt.image_to_string(pages[i])
57
-
58
- summary_text = ""
59
- for i, paragraph in enumerate(content.split("\n\n")):
60
-
61
- paragraph = paragraph.replace('\n',' ')
62
- paragraph = paragraph.replace('\t','')
63
- paragraph = ' '.join(paragraph.split())
64
- # count words in the paragraph and exclude if less than 4 words
65
- tokens = word_tokenize(paragraph)
66
- # only do real words
67
- tokens = [word for word in tokens if word.isalpha()]
68
- # print("\nTokens: {}\n".format(len(tokens)))
69
- # only do sentences with more than 1 words excl. alpha crap
70
- if len(tokens) <= 1:
71
- continue
72
- # Perhaps also ignore paragraphs with no sentence?
73
- sentences = sent_tokenize(paragraph)
74
-
75
- paragraph = ' '.join(tokens)
76
-
77
- print("\nParagraph:")
78
- print(paragraph+"\n")
79
- # T5 needs to have 'summarize' in order to work:
80
- # text = "summarize:" + paragraph
81
- text = paragraph
82
-
83
- summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
84
  # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
85
  summary_text += str(summary) + "\n\n"
86
  print("Summary:")
87
  print(summary)
88
 
89
- content2 = content.replace('\n',' ')
90
- content2 = content2.replace('\t','')
91
- summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
92
-
93
 
94
 
95
- # write all to file for inspection and storage
96
- all_text = "The Summary-- " + str(summary) + "\n\n\n" \
97
- + "The Larger Summary-- " + str(summary_text)
98
-
99
 
100
- all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
101
- all_text2 = all_text2.replace('?','.')
102
- all_text2 = all_text2.replace('\n',' ')
103
- all_text2 = all_text2.replace('..','.')
104
- all_text2 = all_text2.replace(',.',',')
105
- all_text2 = all_text2.replace('-- ','\n\n\n')
106
 
107
- pdf = FPDF()
108
 
109
- # Add a page
110
- pdf.add_page()
111
 
112
- pdf.set_font("Times", size = 12)
113
 
114
- # open the text file in read mode
115
- f = all_text2
116
- return f
117
 
118
 
119
 
@@ -121,10 +107,8 @@ def pdf(file):
121
 
122
  iface = gr.Interface(
123
  pdf,
124
- gr.inputs.Image(shape=(224, 224)),
125
- gr.outputs.Label(f),
126
- capture_session=True,
127
- interpretation="default",
128
  )
129
 
130
  if __name__ == "__main__":
 
29
  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
30
  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
31
  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
32
+
33
+ pages= pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
34
+
35
+ content = ""
36
+ dir_name = 'images/' + file + '/'
37
+ os.makedirs(dir_name, exist_ok=True)
38
+ # If folder doesn't exist, then create it.
39
+ for i in range(len(pages)-1):
40
+ pages[i].save(dir_name + str(i) + '.jpg')
41
+ # OCR the image using Google's tesseract
42
+ content += pt.image_to_string(pages[i])
43
+
44
+ summary_text = ""
45
+ for i, paragraph in enumerate(content.split("\n\n")):
46
+
47
+ paragraph = paragraph.replace('\n',' ')
48
+ paragraph = paragraph.replace('\t','')
49
+ paragraph = ' '.join(paragraph.split())
50
+ # count words in the paragraph and exclude if less than 4 words
51
+ tokens = word_tokenize(paragraph)
52
+ # only do real words
53
+ tokens = [word for word in tokens if word.isalpha()]
54
+ # print("\nTokens: {}\n".format(len(tokens)))
55
+ # only do sentences with more than 1 words excl. alpha crap
56
+ if len(tokens) <= 1:
57
+ continue
58
+ # Perhaps also ignore paragraphs with no sentence?
59
+ sentences = sent_tokenize(paragraph)
60
+
61
+ paragraph = ' '.join(tokens)
62
+
63
+ print("\nParagraph:")
64
+ print(paragraph+"\n")
65
+ # T5 needs to have 'summarize' in order to work:
66
+ # text = "summarize:" + paragraph
67
+ text = paragraph
68
+
69
+ summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
71
  summary_text += str(summary) + "\n\n"
72
  print("Summary:")
73
  print(summary)
74
 
75
+ content2 = content.replace('\n',' ')
76
+ content2 = content2.replace('\t','')
77
+ summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
78
+
79
 
80
 
81
+ # write all to file for inspection and storage
82
+ all_text = "The Summary-- " + str(summary) + "\n\n\n" \
83
+ + "The Larger Summary-- " + str(summary_text)
84
+
85
 
86
+ all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
87
+ all_text2 = all_text2.replace('?','.')
88
+ all_text2 = all_text2.replace('\n',' ')
89
+ all_text2 = all_text2.replace('..','.')
90
+ all_text2 = all_text2.replace(',.',',')
91
+ all_text2 = all_text2.replace('-- ','\n\n\n')
92
 
93
+ pdf = FPDF()
94
 
95
+ # Add a page
96
+ pdf.add_page()
97
 
98
+ pdf.set_font("Times", size = 12)
99
 
100
+ # open the text file in read mode
101
+ f = all_text2
102
+ return f
103
 
104
 
105
 
 
107
 
108
  iface = gr.Interface(
109
  pdf,
110
+ "file",
111
+ "text"
 
 
112
  )
113
 
114
  if __name__ == "__main__":