kkastr committed
Commit 9f1606d
Parent(s): 320952b
implemented gradio for main function. works good

Files changed:
- gradio-demo.py +0 -10
- scraper.py +3 -1
- thread_analyzer.py +12 -14
gradio-demo.py    DELETED
@@ -1,10 +0,0 @@
-import gradio as gr
-
-
-def greet(name):
-    return "Hello " + name + "!"
-
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-
-demo.launch()
scraper.py    CHANGED
@@ -43,7 +43,9 @@ def getComments(url):
 
     df = pd.DataFrame(data=rows, columns=cols)
 
-
+    # save for testing to avoid sending tons of requests to reddit
+
+    # df.to_csv(f'{submission.id}_comments.csv', index=False)
 
     return df
 
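The commented-out df.to_csv line gestures at a simple cache: write the scraped comments to a per-thread CSV once, then reload that file while testing instead of re-requesting the thread from reddit on every run. A minimal sketch of that pattern, assuming only that getComments(url) returns a DataFrame; get_comments_cached and cache_path are illustrative names, not part of the repo:

# Sketch of the caching idea behind the commented-out to_csv call (illustrative names).
import os

import pandas as pd

from scraper import getComments


def get_comments_cached(url: str, cache_path: str) -> pd.DataFrame:
    # Reuse the saved CSV if it exists so repeated test runs don't hit reddit again.
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)
    df = getComments(url=url)
    df.to_csv(cache_path, index=False)  # same idea as the commented-out save above
    return df

Usage would be get_comments_cached(url, "111lv6d_comments.csv"), matching the cached file the old thread_analyzer.py read from disk.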
thread_analyzer.py    CHANGED
@@ -1,26 +1,20 @@
 import pandas as pd
+import gradio as gr
 from transformers import pipeline
 from scraper import getComments
 
 
 def chunk(a):
-    n = round(0.
+    n = round(0.3 * len(a))
     k, m = divmod(len(a), n)
     return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
 
-def main():
+def main(url: str) -> str:
 
-
+    # pushshift.io submission comments api doesn't work so have to use praw
 
-
-    # so that you don't have to refetch comments.
-    # if pushshift.io submission comments api starts working again,
-    # could probably make this all realtime.
-
-    # df = getComments(url=durl)
-    #
-    df = pd.read_csv("111lv6d_comments.csv")
+    df = getComments(url=url)
 
     smax = df.score.max()
 
@@ -31,7 +25,7 @@ def main():
     if len(df.text) >= 200:
         df = df[:200]
 
-    #
+    # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))
 
     nlp = pipeline('summarization')
@@ -39,6 +33,7 @@ def main():
     lst_summaries = []
 
     for grp in chunked:
+        # treating a group of comments as one block of text
         result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
         lst_summaries.append(result)
 
@@ -46,8 +41,11 @@ def main():
 
     thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
 
-
+    return thread_summary
 
 
 if __name__ == "__main__":
-
+
+    demo = gr.Interface(fn=main, inputs="text", outputs="text")
+
+    demo.launch()
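For reference, chunk() is the standard divmod recipe for splitting a sequence into n nearly equal groups; after this commit n is set to roughly 30% of the input length, and the first m groups each get one extra element. A small worked example of the same splitting logic on a plain list (the list is a stand-in for df.text; the values are illustrative):

# Same splitting logic as chunk() in thread_analyzer.py, shown on a plain list.
def chunk(a):
    n = round(0.3 * len(a))   # number of groups: ~30% of the input length
    k, m = divmod(len(a), n)  # k = base group size, m = groups that get one extra item
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


comments = list(range(10))    # 10 items -> n = 3 groups
print([list(g) for g in chunk(comments)])
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

One caveat: for inputs of zero or one item, n rounds to 0 and divmod raises ZeroDivisionError; the 200-comment cap in main() is only an upper bound, so very short threads would need something like a max(1, ...) guard.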
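Taken together, the new thread_analyzer.py is a two-pass (map-then-reduce) summarizer fronted by a one-function Gradio app: scrape the thread, cap it at 200 comments, summarize each chunk of comments, then summarize the concatenated chunk summaries. The condensed sketch below follows that shape rather than copying the file; summarize_thread is a renamed stand-in for main, the score-related steps around smax and the construction of ntext fall outside the shown hunks and are approximated with a plain join, and the max(1, ...) guard is an addition, not part of the commit:

# Condensed sketch of the new flow; not a verbatim copy of thread_analyzer.py.
import gradio as gr
from transformers import pipeline

from scraper import getComments


def summarize_thread(url: str) -> str:
    df = getComments(url=url)  # praw-backed scrape; pushshift comments api is down
    if len(df.text) >= 200:
        df = df[:200]          # cap input size, as in the commit

    nlp = pipeline('summarization')

    # First pass: summarize each group of comments as one block of text.
    n = max(1, round(0.3 * len(df.text)))  # guard added here; the commit uses round() directly
    k, m = divmod(len(df.text), n)
    groups = (df.text[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
    summaries = [nlp(grp.str.cat(), max_length=500)[0]["summary_text"] for grp in groups]

    # Second pass: summarize the concatenated per-group summaries.
    ntext = " ".join(summaries)  # approximation of how the real file builds ntext
    return nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")


if __name__ == "__main__":
    # Single text box in, single text box out, as wired in the commit.
    demo = gr.Interface(fn=summarize_thread, inputs="text", outputs="text")
    demo.launch()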