kkastr committed on
Commit
9f1606d
1 Parent(s): 320952b

implemented gradio for main function. works well

Files changed (3)
  1. gradio-demo.py +0 -10
  2. scraper.py +3 -1
  3. thread_analyzer.py +12 -14
gradio-demo.py DELETED
@@ -1,10 +0,0 @@
-import gradio as gr
-
-
-def greet(name):
-    return "Hello " + name + "!"
-
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-
-demo.launch()
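Aside: the deleted file was the stock Gradio hello-world, and its pattern is exactly what this commit applies to main() in thread_analyzer.py: wrap a str -> str function in gr.Interface and call launch(). For reference, a minimal standalone sketch of that pattern (the function body here is illustrative):

import gradio as gr

def greet(name: str) -> str:
    return f"Hello {name}!"

# "text" in/out gives one textbox for the argument and one for the return value.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()  # serves a local web UI, by default at http://127.0.0.1:7860
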
scraper.py CHANGED
@@ -43,7 +43,9 @@ def getComments(url):
 
     df = pd.DataFrame(data=rows, columns=cols)
 
-    df.to_csv(f'{submission.id}_comments.csv', index=False)
+    # save for testing to avoid sending tons of requests to reddit
+
+    # df.to_csv(f'{submission.id}_comments.csv', index=False)
 
     return df
 
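Aside: the commented-out to_csv line sketches a simple comment cache: write the scraped thread to {submission.id}_comments.csv once, then reload the file instead of re-hitting reddit (the old thread_analyzer.py did exactly this with pd.read_csv("111lv6d_comments.csv")). A hedged sketch of a cache-aware wrapper; cached_comments and its id-extracting regex are illustrative, not part of this repo:

import re
from pathlib import Path

import pandas as pd

from scraper import getComments


def cached_comments(url: str) -> pd.DataFrame:
    # Assumes the submission id is the path segment after /comments/,
    # matching the f'{submission.id}_comments.csv' naming in scraper.py.
    match = re.search(r"/comments/(\w+)", url)
    cache = Path(f"{match.group(1)}_comments.csv") if match else None
    if cache is not None and cache.exists():
        return pd.read_csv(cache)  # reuse the saved scrape
    df = getComments(url=url)
    if cache is not None:
        df.to_csv(cache, index=False)  # save for next run
    return df
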
thread_analyzer.py CHANGED
@@ -1,26 +1,20 @@
 import pandas as pd
+import gradio as gr
 from transformers import pipeline
 from scraper import getComments
 
 
 def chunk(a):
-    n = round(0.2 * len(a))
+    n = round(0.3 * len(a))
     k, m = divmod(len(a), n)
     return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
 
-def main():
+def main(url: str) -> str:
 
-    durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"
-
-    # here you would probably check if the post id already exists in some DB
-    # so that you don't have to refetch comments.
-    # if pushshift.io submission comments api starts working again,
-    # could probably make this all realtime.
-
-    # df = getComments(url=durl)
-    #
-    df = pd.read_csv("111lv6d_comments.csv")
+    # pushshift.io submission comments api doesn't work so have to use praw
+
+    df = getComments(url=url)
 
     smax = df.score.max()
 
@@ -31,7 +25,7 @@ def main():
     if len(df.text) >= 200:
         df = df[:200]
 
-    # this is to deal with giving the model too large of an input which makes things very slow
+    # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))
 
     nlp = pipeline('summarization')
@@ -39,6 +33,7 @@ def main():
     lst_summaries = []
 
     for grp in chunked:
+        # treating a group of comments as one block of text
         result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
         lst_summaries.append(result)
 
@@ -46,8 +41,11 @@ def main():
 
     thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
 
-    print(thread_summary)
+    return thread_summary
 
 
 if __name__ == "__main__":
-    main()
+
+    demo = gr.Interface(fn=main, inputs="text", outputs="text")
+
+    demo.launch()
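Aside: chunk() is the standard divmod split. With n = round(0.3 * len(a)) target slices, k, m = divmod(len(a), n) hands the first m slices k + 1 items and the remaining slices k, so piece lengths differ by at most one. A quick standalone check of the arithmetic (a plain list stands in for the pandas Series the script actually passes):

def chunk(a):
    n = round(0.3 * len(a))
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

pieces = list(chunk(list(range(10))))  # len 10 -> n = 3, k = 3, m = 1
print(pieces)  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

One caveat: for inputs of length 0 or 1, n rounds to 0 and divmod raises ZeroDivisionError; that only bites on near-empty threads, but a max(1, ...) guard would be cheap.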