kkastr committed
Commit: 320952b
Parent(s): 95e1833
Developed a good system for dealing with the large number of comments. The resulting summaries appear to be pretty good.
Files changed:
- gradio-demo.py +2 -1
- scraper.py +1 -0
- thread_analyzer.py +29 -33
gradio-demo.py
CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 def greet(name):
     return "Hello " + name + "!"

-
+
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")

 demo.launch()
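For context on the added line: gr.Interface binds a plain Python function to auto-generated input and output widgets ("text" maps each side to a textbox), and demo.launch() then serves the app locally, by default at http://127.0.0.1:7860. A minimal sketch of the same wiring with a shareable link, a hypothetical variation rather than part of this commit:

import gradio as gr

def greet(name):
    return "Hello " + name + "!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")

# share=True also creates a temporary public URL that proxies to the local app
demo.launch(share=True)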
scraper.py
CHANGED
@@ -47,5 +47,6 @@ def getComments(url):

     return df

+
 if __name__ == "__main__":
     pass
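Only a blank line changed here, but scraper.py is the data source for the analyzer below: getComments(url) returns the comments DataFrame that thread_analyzer.py expects to find as a CSV with score and text columns. A hypothetical direct use, mirroring the commented-out call in thread_analyzer.py (the to_csv step is an assumption about how that CSV was produced):

from scraper import getComments

durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"
df = getComments(url=durl)         # returns a DataFrame, per the return df above
df.to_csv("111lv6d_comments.csv")  # assumed step; this is the file thread_analyzer.py reads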
thread_analyzer.py
CHANGED
@@ -3,55 +3,51 @@ from transformers import pipeline
 from scraper import getComments


-def
-
-
-
-# here you would probably check if the post id already exists in some DB so that you don't have to refetch comments.
-# if pushshift.io submission comments api starts working again, could probably make this all realtime.
-
-# df = getComments(url=durl)
-nlp = pipeline('summarization')
-df = pd.read_csv("111591k_comments.csv")
-gb = df.groupby("parent_id")
+def chunk(a):
+    n = round(0.2 * len(a))
+    k, m = divmod(len(a), n)
+    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

-for key, grp in gb:

-
+def main():

-
-    break
+    durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"

-if
-#
+    # here you would probably check if the post id already exists in some DB
+    # so that you don't have to refetch comments.
+    # if pushshift.io submission comments api starts working again,
+    # could probably make this all realtime.

-#
+    # df = getComments(url=durl)
+    #
+    df = pd.read_csv("111lv6d_comments.csv")

-
+    smax = df.score.max()

-
+    threshold = round(0.1 * smax)

-
-# df.body[4] = ''
+    df = df[df.score >= threshold]

-
-
+    if len(df.text) >= 200:
+        df = df[:200]

-#
-
+    # this is to deal with giving the model too large of an input which makes things very slow
+    chunked = list(chunk(df.text))

-
-# summary3 = nlp(text, max_length=500)[0]["summary_text"]
+    nlp = pipeline('summarization')

+    lst_summaries = []

-
+    for grp in chunked:
+        result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
+        lst_summaries.append(result)

-
+    ntext = ' '.join(lst_summaries)

-
+    thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")

-
+    print(thread_summary)

-# print("Summarised")

+if __name__ == "__main__":
     main()
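The new chunk(a) helper aims for n = round(0.2 * len(a)) chunks (roughly one per five items) and uses divmod to spread the remainder, so the first m slices come out one element longer than the rest. A worked example, illustrative only:

# len(a) = 23 -> n = round(0.2 * 23) = 5 chunks
# k, m = divmod(23, 5) -> k = 4, m = 3
# the first m = 3 slices get k + 1 = 5 elements, the remaining ones get k = 4
sizes = [len(c) for c in chunk(list(range(23)))]
print(sizes)  # [5, 5, 5, 4, 4]

One caveat: for inputs shorter than three items, round(0.2 * len(a)) is 0 and divmod(len(a), 0) raises ZeroDivisionError, so very small threads would need a guard.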
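The rewritten main() is effectively a two-pass map-reduce summarization: keep comments scoring at least 10% of the top score, cap the frame at 200 rows, summarize each ~20% chunk, then summarize the concatenation of the chunk summaries. A condensed sketch of the same shape on toy data (placeholder strings, not from the commit):

from transformers import pipeline

nlp = pipeline('summarization')  # downloads a default summarization model on first use

comments = ["first comment ...", "second comment ...", "third comment ..."]  # placeholders
chunks = [comments[:2], comments[2:]]  # stand-in for chunk(df.text)

partials = [nlp(' '.join(c), max_length=500)[0]["summary_text"] for c in chunks]
final = nlp(' '.join(partials), max_length=500)[0]["summary_text"]
print(final.replace(" .", "."))  # same whitespace cleanup as the commit

One detail worth flagging: grp.str.cat() joins the Series with no separator, so the end of one comment runs straight into the start of the next; Series.str.cat(sep=' ') would preserve word boundaries.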