kkastr committed on
Commit 320952b
1 Parent(s): 95e1833

developed a good system for dealing with the large number of comments. The resulting summaries appear to be pretty good
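
In outline, the new system is a two-pass summarization: drop low-scoring comments, split what remains into chunks so no single model call receives an overly long input, summarize each chunk, and then summarize the concatenated chunk summaries into one thread summary. A minimal sketch of that pattern (the function name, chunk size, and max_length below are illustrative choices, not the committed values):

from transformers import pipeline

def summarize_thread(comments, chunk_size=20, max_length=200):
    # comments: a list of comment strings, already filtered and sorted upstream
    nlp = pipeline("summarization")
    # first pass: split into fixed-size chunks and summarize each one
    chunks = [comments[i:i + chunk_size] for i in range(0, len(comments), chunk_size)]
    partial = [nlp(" ".join(c), max_length=max_length)[0]["summary_text"] for c in chunks]
    # second pass: summarize the concatenated chunk summaries into one thread summary
    return nlp(" ".join(partial), max_length=max_length)[0]["summary_text"]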

Files changed (3)
  1. gradio-demo.py +2 -1
  2. scraper.py +1 -0
  3. thread_analyzer.py +29 -33
gradio-demo.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 def greet(name):
     return "Hello " + name + "!"
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text", css=".primary-button {background-color: cyan")
+
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 demo.launch()
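
The removed keyword argument passed a malformed css string to gr.Interface (the rule is missing its closing }), and this commit simply drops the styling. If custom CSS is wanted again later, gr.Interface accepts a css string; a hedged sketch reusing the same selector (whether .primary-button matches anything depends on the Gradio version in use):

import gradio as gr

def greet(name):
    return "Hello " + name + "!"

# a well-formed version of the rule the removed line tried to pass
demo = gr.Interface(fn=greet, inputs="text", outputs="text",
                    css=".primary-button {background-color: cyan}")
demo.launch()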
scraper.py CHANGED
@@ -47,5 +47,6 @@ def getComments(url):
 
     return df
 
+
 if __name__ == "__main__":
     pass
thread_analyzer.py CHANGED
@@ -3,55 +3,51 @@ from transformers import pipeline
 from scraper import getComments
 
 
-def main():
-
-    # durl = "https://www.reddit.com/r/ask/comments/111591k/who_do_you_think_will_start_ww3/"
-
-    # here you would probably check if the post id already exists in some DB so that you don't have to refetch comments.
-    # if pushshift.io submission comments api starts working again, could probably make this all realtime.
-
-    # df = getComments(url=durl)
-    nlp = pipeline('summarization')
-    df = pd.read_csv("111591k_comments.csv")
-    gb = df.groupby("parent_id")
+def chunk(a):
+    n = round(0.2 * len(a))
+    k, m = divmod(len(a), n)
+    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
-    for key, grp in gb:
 
-        summary = nlp(grp.text.str.cat(), max_length=500)[0]["summary_text"]
+def main():
 
-        print(summary)
-        break
+    durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"
 
-if __name__ == "__main__":
-    # ldf = pd.read_csv('news_comments.csv')
+    # here you would probably check if the post id already exists in some DB
+    # so that you don't have to refetch comments.
+    # if pushshift.io submission comments api starts working again,
+    # could probably make this all realtime.
 
-    # pid = ldf.post_id.unique()[0]
+    # df = getComments(url=durl)
+    #
+    df = pd.read_csv("111lv6d_comments.csv")
 
-    # df = ldf[ldf.post_id == pid]
+    smax = df.score.max()
 
-    # txt = df.body[0:2].str.cat()
+    threshold = round(0.1 * smax)
 
-    # nlp = pipeline('summarization')
-    # df.body[4] = ''
+    df = df[df.score >= threshold]
 
-    # text = ' '.join(df.body[0:32])
-    # summary1 = nlp(text, max_length=500)[0]["summary_text"]
+    if len(df.text) >= 200:
+        df = df[:200]
 
-    # text = ' '.join(df.body[33:60])
-    # summary2 = nlp(text, max_length=500)[0]["summary_text"]
+    # this is to deal with giving the model too large of an input which makes things very slow
+    chunked = list(chunk(df.text))
 
-    # text = ' '.join(df.body[61:90])
-    # summary3 = nlp(text, max_length=500)[0]["summary_text"]
+    nlp = pipeline('summarization')
 
+    lst_summaries = []
 
-    # summary = summary1 + ' ' + summary2 + ' ' + summary3
+    for grp in chunked:
+        result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
+        lst_summaries.append(result)
 
-    # nsum = nlp(summary, max_length=500)[0]["summary_text"]
+    ntext = ' '.join(lst_summaries)
 
-    # print ("Original Text")
+    thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
 
-    # print(summary)
+    print(thread_summary)
 
-    # print("Summarised")
 
+if __name__ == "__main__":
     main()
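
One edge case in the new chunk() helper: the number of chunks is round(0.2 * len(a)), so if the score filter leaves only one or two comments, n becomes 0 and divmod(len(a), n) raises ZeroDivisionError. An illustrative guard (not part of this commit) that keeps at least one chunk:

def chunk(a):
    # aim for chunks of roughly five comments each, but never fewer than one chunk
    n = max(1, round(0.2 * len(a)))
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))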