kkastr committed on
Commit
9f1606d
1 Parent(s): 320952b

implemented gradio for main function. works well

Files changed (3)
  1. gradio-demo.py +0 -10
  2. scraper.py +3 -1
  3. thread_analyzer.py +12 -14
gradio-demo.py DELETED
@@ -1,10 +0,0 @@
-import gradio as gr
-
-
-def greet(name):
-    return "Hello " + name + "!"
-
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-
-demo.launch()
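Aside: the deleted file was the stock Gradio hello-world, and its pattern is exactly what this commit applies to main() in thread_analyzer.py: wrap a str -> str function in gr.Interface and call launch(). For reference, a minimal standalone sketch of that pattern (the function body here is illustrative):

import gradio as gr

def greet(name: str) -> str:
    return f"Hello {name}!"

# "text" in/out gives one textbox for the argument and one for the return value.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()  # serves a local web UI, by default at http://127.0.0.1:7860
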
scraper.py CHANGED
@@ -43,7 +43,9 @@ def getComments(url):
 
     df = pd.DataFrame(data=rows, columns=cols)
 
-    df.to_csv(f'{submission.id}_comments.csv', index=False)
+    # save for testing to avoid sending tons of requests to reddit
+
+    # df.to_csv(f'{submission.id}_comments.csv', index=False)
 
     return df
 
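Aside: the commented-out to_csv line sketches a simple comment cache: write the scraped thread to {submission.id}_comments.csv once, then reload the file instead of re-hitting reddit (the old thread_analyzer.py did exactly this with pd.read_csv("111lv6d_comments.csv")). A hedged sketch of a cache-aware wrapper; cached_comments and its id-extracting regex are illustrative, not part of this repo:

import re
from pathlib import Path

import pandas as pd

from scraper import getComments


def cached_comments(url: str) -> pd.DataFrame:
    # Assumes the submission id is the path segment after /comments/,
    # matching the f'{submission.id}_comments.csv' naming in scraper.py.
    match = re.search(r"/comments/(\w+)", url)
    cache = Path(f"{match.group(1)}_comments.csv") if match else None
    if cache is not None and cache.exists():
        return pd.read_csv(cache)  # reuse the saved scrape
    df = getComments(url=url)
    if cache is not None:
        df.to_csv(cache, index=False)  # save for next run
    return df
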
thread_analyzer.py CHANGED
@@ -1,26 +1,20 @@
 import pandas as pd
+import gradio as gr
 from transformers import pipeline
 from scraper import getComments
 
 
 def chunk(a):
-    n = round(0.2 * len(a))
+    n = round(0.3 * len(a))
     k, m = divmod(len(a), n)
     return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
 
-def main():
+def main(url: str) -> str:
 
-    durl = "https://www.reddit.com/r/news/comments/111lv6d/there_were_more_toxic_chemicals_on_train_that/"
-
-    # here you would probably check if the post id already exists in some DB
-    # so that you don't have to refetch comments.
-    # if pushshift.io submission comments api starts working again,
-    # could probably make this all realtime.
-
-    # df = getComments(url=durl)
-    #
-    df = pd.read_csv("111lv6d_comments.csv")
+    # pushshift.io submission comments api doesn't work so have to use praw
+
+    df = getComments(url=url)
 
     smax = df.score.max()
 
@@ -31,7 +25,7 @@ def main():
     if len(df.text) >= 200:
         df = df[:200]
 
-    # this is to deal with giving the model too large of an input which makes things very slow
+    # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))
 
     nlp = pipeline('summarization')
@@ -39,6 +33,7 @@ def main():
     lst_summaries = []
 
     for grp in chunked:
+        # treating a group of comments as one block of text
         result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
         lst_summaries.append(result)
 
@@ -46,8 +41,11 @@ def main():
 
     thread_summary = nlp(ntext, max_length=500)[0]["summary_text"].replace(" .", ".")
 
-    print(thread_summary)
+    return thread_summary
 
 
 if __name__ == "__main__":
-    main()
+
+    demo = gr.Interface(fn=main, inputs="text", outputs="text")
+
+    demo.launch()
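Aside: chunk() is the standard divmod split. With n = round(0.3 * len(a)) target slices, k, m = divmod(len(a), n) hands the first m slices k + 1 items and the remaining slices k, so piece lengths differ by at most one. A quick standalone check of the arithmetic (a plain list stands in for the pandas Series the script actually passes):

def chunk(a):
    n = round(0.3 * len(a))
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

pieces = list(chunk(list(range(10))))  # len 10 -> n = 3, k = 3, m = 1
print(pieces)  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

One caveat: for inputs of length 0 or 1, n rounds to 0 and divmod raises ZeroDivisionError; that only bites on near-empty threads, but a max(1, ...) guard would be cheap.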