kkastr committed
Commit: 320952b
Parent(s): 95e1833
Developed a good system for dealing with the large number of comments. The resulting summaries appear to be pretty good.
Files changed:
- gradio-demo.py +2 -1
- scraper.py +1 -0
- thread_analyzer.py +29 -33
gradio-demo.py
CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 def greet(name):
     return "Hello " + name + "!"

-
+
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")

 demo.launch()
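For context on the added line: gr.Interface binds a plain Python function to auto-generated input and output widgets ("text" maps each side to a textbox), and demo.launch() then serves the app locally, by default at http://127.0.0.1:7860. A minimal sketch of the same wiring with a shareable link, a hypothetical variation rather than part of this commit:

import gradio as gr

def greet(name):
    return "Hello " + name + "!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")

# share=True also creates a temporary public URL that proxies to the local app
demo.launch(share=True)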
scraper.py
CHANGED
@@ -47,5 +47,6 @@ def getComments(url):

     return df

+
 if __name__ == "__main__":
     pass
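Only a blank line changed here, but scraper.py is the data source for the analyzer below: getComments(url) returns the comments DataFrame that thread_analyzer.py expects to find as a CSV with score and text columns. A hypothetical direct use, mirroring the commented-out call in thread_analyzer.py (the to_csv step is an assumption about how that CSV was produced):

from scraper import getComments

durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"
df = getComments(url=durl)         # returns a DataFrame, per the return df above
df.to_csv("111lv6d_comments.csv")  # assumed step; this is the file thread_analyzer.py reads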
thread_analyzer.py
CHANGED
@@ -3,55 +3,51 @@ from transformers import pipeline
 from scraper import getComments


-def
-
-
-
-# here you would probably check if the post id already exists in some DB so that you don't have to refetch comments.
-# if pushshift.io submission comments api starts working again, could probably make this all realtime.
-
-# df = getComments(url=durl)
-nlp = pipeline('summarization')
-df = pd.read_csv("111591k_comments.csv")
-gb = df.groupby("parent_id")
+def chunk(a):
+    n = round(0.2 * len(a))
+    k, m = divmod(len(a), n)
+    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

-for key, grp in gb:

-
+def main():

-
-    break
+    durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"

-if
-#
+    # here you would probably check if the post id already exists in some DB
+    # so that you don't have to refetch comments.
+    # if pushshift.io submission comments api starts working again,
+    # could probably make this all realtime.

-#
+    # df = getComments(url=durl)
+    #
+    df = pd.read_csv("111lv6d_comments.csv")

-
+    smax = df.score.max()

-
+    threshold = round(0.1 * smax)

-
-# df.body[4] = ''
+    df = df[df.score >= threshold]

-
-
+    if len(df.text) >= 200:
+        df = df[:200]

-#
-
+    # this is to deal with giving the model too large of an input which makes things very slow
+    chunked = list(chunk(df.text))

-
-# summary3 = nlp(text, max_length=500)[0]["summary_text"]
+    nlp = pipeline('summarization')

+    lst_summaries = []

-
+    for grp in chunked:
+        result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
+        lst_summaries.append(result)

-
+    ntext = ' '.join(lst_summaries)

-
+    thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")

-
+    print(thread_summary)

-# print("Summarised")

+if __name__ == "__main__":
     main()
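The new chunk(a) helper aims for n = round(0.2 * len(a)) chunks (roughly one per five items) and uses divmod to spread the remainder, so the first m slices come out one element longer than the rest. A worked example, illustrative only:

# len(a) = 23 -> n = round(0.2 * 23) = 5 chunks
# k, m = divmod(23, 5) -> k = 4, m = 3
# the first m = 3 slices get k + 1 = 5 elements, the remaining ones get k = 4
sizes = [len(c) for c in chunk(list(range(23)))]
print(sizes)  # [5, 5, 5, 4, 4]

One caveat: for inputs shorter than three items, round(0.2 * len(a)) is 0 and divmod(len(a), 0) raises ZeroDivisionError, so very small threads would need a guard.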
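The rewritten main() is effectively a two-pass map-reduce summarization: keep comments scoring at least 10% of the top score, cap the frame at 200 rows, summarize each ~20% chunk, then summarize the concatenation of the chunk summaries. A condensed sketch of the same shape on toy data (placeholder strings, not from the commit):

from transformers import pipeline

nlp = pipeline('summarization')  # downloads a default summarization model on first use

comments = ["first comment ...", "second comment ...", "third comment ..."]  # placeholders
chunks = [comments[:2], comments[2:]]  # stand-in for chunk(df.text)

partials = [nlp(' '.join(c), max_length=500)[0]["summary_text"] for c in chunks]
final = nlp(' '.join(partials), max_length=500)[0]["summary_text"]
print(final.replace(" .", "."))  # same whitespace cleanup as the commit

One detail worth flagging: grp.str.cat() joins the Series with no separator, so the end of one comment runs straight into the start of the next; Series.str.cat(sep=' ') would preserve word boundaries.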