arithescientist committed on
Commit
cf3e244
·
1 Parent(s): 17e34a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -77
app.py CHANGED
@@ -29,91 +29,77 @@ def pdf(file):
29
  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
30
  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
31
  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
32
- print('Using model {}\n'.format(model_name))
33
-
34
- list_of_files = file
35
-
36
-
37
- print("\nProcessing {} files...\n".format(len(list_of_files)))
38
- total_pages = 0
39
-
40
- for filename in list_of_files:
41
- print(filename)
42
- file = os.path.splitext(os.path.basename(filename))[0]
43
- pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
44
- total_pages += len(pages)
45
- print("\nProcessing the next {} pages...\n".format(len(pages)))
46
-
47
- # Then save all pages as images and convert them to text except the last page
48
- # TODO: create this as a function
49
- content = ""
50
- dir_name = 'images/' + file + '/'
51
- os.makedirs(dir_name, exist_ok=True)
52
- # If folder doesn't exist, then create it.
53
- for i in range(len(pages)-1):
54
- pages[i].save(dir_name + str(i) + '.jpg')
55
- # OCR the image using Google's tesseract
56
- content += pt.image_to_string(pages[i])
57
-
58
- summary_text = ""
59
- for i, paragraph in enumerate(content.split("\n\n")):
60
-
61
- paragraph = paragraph.replace('\n',' ')
62
- paragraph = paragraph.replace('\t','')
63
- paragraph = ' '.join(paragraph.split())
64
- # count words in the paragraph and exclude if less than 4 words
65
- tokens = word_tokenize(paragraph)
66
- # only do real words
67
- tokens = [word for word in tokens if word.isalpha()]
68
- # print("\nTokens: {}\n".format(len(tokens)))
69
- # only do sentences with more than 1 words excl. alpha crap
70
- if len(tokens) <= 1:
71
- continue
72
- # Perhaps also ignore paragraphs with no sentence?
73
- sentences = sent_tokenize(paragraph)
74
-
75
- paragraph = ' '.join(tokens)
76
-
77
- print("\nParagraph:")
78
- print(paragraph+"\n")
79
- # T5 needs to have 'summarize' in order to work:
80
- # text = "summarize:" + paragraph
81
- text = paragraph
82
-
83
- summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
84
  # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
85
  summary_text += str(summary) + "\n\n"
86
  print("Summary:")
87
  print(summary)
88
 
89
- content2 = content.replace('\n',' ')
90
- content2 = content2.replace('\t','')
91
- summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
92
-
93
 
94
 
95
- # write all to file for inspection and storage
96
- all_text = "The Summary-- " + str(summary) + "\n\n\n" \
97
- + "The Larger Summary-- " + str(summary_text)
98
-
99
 
100
- all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
101
- all_text2 = all_text2.replace('?','.')
102
- all_text2 = all_text2.replace('\n',' ')
103
- all_text2 = all_text2.replace('..','.')
104
- all_text2 = all_text2.replace(',.',',')
105
- all_text2 = all_text2.replace('-- ','\n\n\n')
106
 
107
- pdf = FPDF()
108
 
109
- # Add a page
110
- pdf.add_page()
111
 
112
- pdf.set_font("Times", size = 12)
113
 
114
- # open the text file in read mode
115
- f = all_text2
116
- return f
117
 
118
 
119
 
@@ -121,10 +107,8 @@ def pdf(file):
121
 
122
  iface = gr.Interface(
123
  pdf,
124
- gr.inputs.Image(shape=(224, 224)),
125
- gr.outputs.Label(f),
126
- capture_session=True,
127
- interpretation="default",
128
  )
129
 
130
  if __name__ == "__main__":
 
29
  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
30
  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
31
  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
32
+
33
+ pages= pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
34
+
35
+ content = ""
36
+ dir_name = 'images/' + file + '/'
37
+ os.makedirs(dir_name, exist_ok=True)
38
+ # If folder doesn't exist, then create it.
39
+ for i in range(len(pages)-1):
40
+ pages[i].save(dir_name + str(i) + '.jpg')
41
+ # OCR the image using Google's tesseract
42
+ content += pt.image_to_string(pages[i])
43
+
44
+ summary_text = ""
45
+ for i, paragraph in enumerate(content.split("\n\n")):
46
+
47
+ paragraph = paragraph.replace('\n',' ')
48
+ paragraph = paragraph.replace('\t','')
49
+ paragraph = ' '.join(paragraph.split())
50
+ # count words in the paragraph and exclude if less than 4 words
51
+ tokens = word_tokenize(paragraph)
52
+ # only do real words
53
+ tokens = [word for word in tokens if word.isalpha()]
54
+ # print("\nTokens: {}\n".format(len(tokens)))
55
+ # only do sentences with more than 1 words excl. alpha crap
56
+ if len(tokens) <= 1:
57
+ continue
58
+ # Perhaps also ignore paragraphs with no sentence?
59
+ sentences = sent_tokenize(paragraph)
60
+
61
+ paragraph = ' '.join(tokens)
62
+
63
+ print("\nParagraph:")
64
+ print(paragraph+"\n")
65
+ # T5 needs to have 'summarize' in order to work:
66
+ # text = "summarize:" + paragraph
67
+ text = paragraph
68
+
69
+ summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
71
  summary_text += str(summary) + "\n\n"
72
  print("Summary:")
73
  print(summary)
74
 
75
+ content2 = content.replace('\n',' ')
76
+ content2 = content2.replace('\t','')
77
+ summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
78
+
79
 
80
 
81
+ # write all to file for inspection and storage
82
+ all_text = "The Summary-- " + str(summary) + "\n\n\n" \
83
+ + "The Larger Summary-- " + str(summary_text)
84
+
85
 
86
+ all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
87
+ all_text2 = all_text2.replace('?','.')
88
+ all_text2 = all_text2.replace('\n',' ')
89
+ all_text2 = all_text2.replace('..','.')
90
+ all_text2 = all_text2.replace(',.',',')
91
+ all_text2 = all_text2.replace('-- ','\n\n\n')
92
 
93
+ pdf = FPDF()
94
 
95
+ # Add a page
96
+ pdf.add_page()
97
 
98
+ pdf.set_font("Times", size = 12)
99
 
100
+ # open the text file in read mode
101
+ f = all_text2
102
+ return f
103
 
104
 
105
 
 
107
 
108
  iface = gr.Interface(
109
  pdf,
110
+ "file",
111
+ "text"
 
 
112
  )
113
 
114
  if __name__ == "__main__":