kkastr committed
Commit 7a8513b
Parent: ad7d47f

Updated to the new version: uses the load API, has improved looks, and is fast now.

Files changed (2)
  1. app.py +35 -29
  2. requirements.txt +1 -0
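
The substance of the change: the local transformers summarization pipeline is replaced with calls to models hosted on the Hugging Face inference API, loaded through Gradio. Below is a minimal sketch of that pattern, using the same model path and HF_TOKEN environment variable that appear in the app.py diff; the sample input string is made up, and it assumes a Gradio release where gr.Interface.load still accepts an api_key argument.

# Sketch only: load a hosted model once, then call it like a plain function.
import os

import gradio as gr

hf_token = os.environ["HF_TOKEN"]  # access token for the hosted inference API

# gr.Interface.load returns a callable interface backed by the remote model,
# so no model weights are downloaded and nothing heavy runs locally.
sum_api = gr.Interface.load("models/sshleifer/distilbart-cnn-12-6", api_key=hf_token)

# Passing raw text returns the summary string (made-up example input).
print(sum_api("A long block of concatenated Reddit comments to condense ..."))

Each chunk in summarizer() now triggers a remote call instead of running distilbart locally, which is what the "fast now" in the commit message refers to.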
app.py CHANGED
@@ -4,6 +4,7 @@ import sys
 import nltk
 import praw
 import matplotlib
+from tqdm import tqdm
 import gradio as gr
 import pandas as pd
 import praw.exceptions
@@ -44,19 +45,13 @@ def preprocessData(df):
     df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
     df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))

-    smax = df.score.max()
-
-    threshold = round(0.05 * smax)
-
-    df = df[df.score >= threshold]
-
-    # empirically, having more than 200 comments doesn't change much but slows down the summarizer.
-    if len(df.text) >= 200:
-        df = df[:200]
+    # The df is sorted by comment score
+    # Empirically, having more than ~100 comments doesn't change much but slows down the summarizer.
+    # Slowdown is not present with load api but still seems good to limit low score comments.
+    if len(df.text) >= 128:
+        df = df[:128]

     # chunking to handle giving the model too large of an input which crashes
-
-    # chunked = list(index_chunk(df.text))
     chunked = sentence_chunk(df.text)

     return chunked
@@ -123,56 +118,67 @@ def summarizer(url: str) -> str:

     # pushshift.io submission comments api doesn't work so have to use praw
     df = getComments(url=url)
+
+    submission_title = '# ' + df.submission_title.unique()[0]
+
     chunked_df = preprocessData(df)
-    submission_title = df.submission_title.unique()[0]

     text = ' '.join(chunked_df)
-    # transparent bg: background_color=None, mode='RGBA'
+    # transparent bg: background_color=None, mode='RGBA')
     wc_opts = dict(collocations=False, width=1920, height=1080)
     wcloud = WordCloud(**wc_opts).generate(text)

-    fig = plt.figure(figsize=(16, 10))
-    fig.patch.set_alpha(0.0)
-    plt.imshow(wcloud)
+    plt.imshow(wcloud, aspect='auto')
     plt.axis("off")
-    plt.tight_layout()
+    plt.gca().set_position([0, 0, 1, 1])
+    plt.autoscale(tight=True)
+    fig = plt.gcf()
+    fig.patch.set_alpha(0.0)
+    fig.set_size_inches((12, 7))

     lst_summaries = []

-    nlp = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")
-
-    for grp in chunked_df:
+    for grp in tqdm(chunked_df):
         # treating a group of comments as one block of text
-        result = nlp(grp, max_length=500)[0]["summary_text"]
+        result = sum_api(grp)
         lst_summaries.append(result)

-    joined_summaries = ' '.join(lst_summaries).replace(" .", ".")
+    long_output = ' '.join(lst_summaries).replace(" .", ".")

-    total_summary = nlp(joined_summaries, max_length=500)[0]["summary_text"].replace(" .", ".")
+    short_output = sum_api(long_output).replace(" .", ".")

-    short_output = submission_title + '\n' + '\n' + total_summary
+    sentiment = clf_api(short_output)

-    long_output = submission_title + '\n' + '\n' + joined_summaries
-
-    return short_output, long_output, fig
+    return submission_title, short_output, long_output, sentiment, fig


 if __name__ == "__main__":

+    sum_model = "models/sshleifer/distilbart-cnn-12-6"
+    clf_model = "models/finiteautomata/bertweet-base-sentiment-analysis"
+
+    hf_token = os.environ["HF_TOKEN"]
+
+    sum_api = gr.Interface.load(sum_model, api_key=hf_token)
+    clf_api = gr.Interface.load(clf_model, api_key=hf_token)
+
     with gr.Blocks(css=".gradio-container {max-width: 900px !important; width: 100%}") as demo:
         submission_url = gr.Textbox(label='Post URL')

         sub_btn = gr.Button("Summarize")

+        title = gr.Markdown("")
+
         with gr.Row():
             short_summary = gr.Textbox(label='Short Summary')
-            thread_cloud = gr.Plot(label='Word Cloud')
+            summary_sentiment = gr.Label(label='Sentiment')

+        thread_cloud = gr.Plot(label='Word Cloud')
         long_summary = gr.Textbox(label='Long Summary')

         sub_btn.click(fn=summarizer,
                       inputs=[submission_url],
-                      outputs=[short_summary, long_summary, thread_cloud])
+                      outputs=[title, short_summary, long_summary, summary_sentiment, thread_cloud])

    try:
        demo.launch()
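
One detail worth calling out from the Blocks wiring above: Gradio matches the values returned by the click handler to the outputs list by position, so summarizer() returning five values requires exactly five output components in the same order. A stripped-down sketch of that mapping follows; the stub function and its dummy return values are placeholders, not the app's real logic.

import gradio as gr

def stub_summarizer(url):
    # Placeholder with the same five-value shape as the real summarizer():
    # (markdown title, short summary, long summary, label dict, matplotlib figure or None)
    return "# Example thread title", "short summary", "longer joined summary", {"POS": 0.9}, None

with gr.Blocks() as demo:
    submission_url = gr.Textbox(label='Post URL')
    sub_btn = gr.Button("Summarize")

    title = gr.Markdown("")
    with gr.Row():
        short_summary = gr.Textbox(label='Short Summary')
        summary_sentiment = gr.Label(label='Sentiment')
    thread_cloud = gr.Plot(label='Word Cloud')
    long_summary = gr.Textbox(label='Long Summary')

    # Each returned value fills the component at the same index in outputs.
    sub_btn.click(fn=stub_summarizer,
                  inputs=[submission_url],
                  outputs=[title, short_summary, long_summary, summary_sentiment, thread_cloud])

demo.launch()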
requirements.txt CHANGED
@@ -3,6 +3,7 @@ matplotlib==3.7.1
 nltk==3.8.1
 pandas==1.5.3
 praw==7.7.0
+tqdm==4.65.0
 transformers==4.26.1
 wordcloud==1.8.2.2
 torch==1.13.1