kkastr
committed on
Commit
•
cbcecfb
1
Parent(s):
eec218d
condensed the scraper into the main app file. changed the API keys config to use TOML. cleaned up the AWS files since we will no longer be deploying there
Browse files- .gitignore +1 -0
- Dockerfile +0 -13
- README.md +1 -0
- app.py +93 -17
- cdk.json +0 -3
- cdk.py +0 -34
- requirements.txt +2 -5
- scraper.py +0 -63
.gitignore
CHANGED
@@ -3,6 +3,7 @@ model/
|
|
3 |
archive/
|
4 |
cdk.out/
|
5 |
|
|
|
6 |
api_keys.py
|
7 |
*.csv
|
8 |
|
|
|
3 |
archive/
|
4 |
cdk.out/
|
5 |
|
6 |
+
api_params.toml
|
7 |
api_keys.py
|
8 |
*.csv
|
9 |
|
Dockerfile
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
# Dockerfile
|
2 |
-
FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
|
3 |
-
|
4 |
-
COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
|
5 |
-
WORKDIR /var/task
|
6 |
-
|
7 |
-
COPY requirements.txt ./requirements.txt
|
8 |
-
RUN python -m pip install -r requirements.txt
|
9 |
-
|
10 |
-
COPY app.py ./
|
11 |
-
COPY scraper.py ./
|
12 |
-
COPY model/ ./model/
|
13 |
-
CMD ["python3", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Reddit Thread Summarizer (Gradio)
|
app.py
CHANGED
@@ -1,8 +1,12 @@
|
|
1 |
-
import
|
2 |
-
import gradio as gr
|
3 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from transformers import pipeline
|
5 |
-
from scraper import getComments
|
6 |
|
7 |
|
8 |
def chunk(a):
|
@@ -11,17 +15,9 @@ def chunk(a):
|
|
11 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
12 |
|
13 |
|
14 |
-
def
|
15 |
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
|
16 |
df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
|
17 |
-
return df
|
18 |
-
|
19 |
-
|
20 |
-
def summarizer(url: str, summary_length: str = "Short") -> str:
|
21 |
-
|
22 |
-
# pushshift.io submission comments api doesn't work so have to use praw
|
23 |
-
|
24 |
-
df = preprocessText(getComments(url=url))
|
25 |
|
26 |
smax = df.score.max()
|
27 |
|
@@ -36,11 +32,82 @@ def summarizer(url: str, summary_length: str = "Short") -> str:
|
|
36 |
# chunking to handle giving the model too large of an input which crashes
|
37 |
chunked = list(chunk(df.text))
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
lst_summaries = []
|
42 |
|
43 |
-
for grp in
|
44 |
# treating a group of comments as one block of text
|
45 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
46 |
lst_summaries.append(result)
|
@@ -49,12 +116,18 @@ def summarizer(url: str, summary_length: str = "Short") -> str:
|
|
49 |
|
50 |
if summary_length == "Short":
|
51 |
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
52 |
-
return
|
53 |
else:
|
54 |
-
return
|
55 |
|
56 |
|
57 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
60 |
submission_url = gr.Textbox(label='Post URL')
|
@@ -67,4 +140,7 @@ if __name__ == "__main__":
|
|
67 |
|
68 |
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
1 |
+
import os
|
|
|
2 |
import re
|
3 |
+
import sys
|
4 |
+
import toml
|
5 |
+
import praw
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
import praw.exceptions
|
9 |
from transformers import pipeline
|
|
|
10 |
|
11 |
|
12 |
def chunk(a):
|
|
|
15 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
16 |
|
17 |
|
18 |
+
def preprocessData(df):
|
19 |
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
|
20 |
df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
smax = df.score.max()
|
23 |
|
|
|
32 |
# chunking to handle giving the model too large of an input which crashes
|
33 |
chunked = list(chunk(df.text))
|
34 |
|
35 |
+
return chunked
|
36 |
+
|
37 |
+
|
38 |
+
def getComments(url, debug=False):
|
39 |
+
|
40 |
+
api_keys = toml.load('./api_params.toml')
|
41 |
+
|
42 |
+
reddit = praw.Reddit(
|
43 |
+
client_id=api_keys['client_id'] ,
|
44 |
+
client_secret=api_keys['client_secret'] ,
|
45 |
+
user_agent=api_keys['user_agent']
|
46 |
+
)
|
47 |
+
|
48 |
+
try:
|
49 |
+
submission = reddit.submission(url=url)
|
50 |
+
if debug and os.path.isfile(f'./{submission.id}_comments.csv'):
|
51 |
+
df = pd.read_csv(f"./{submission.id}_comments.csv")
|
52 |
+
return df
|
53 |
+
else:
|
54 |
+
pass
|
55 |
+
except praw.exceptions.InvalidURL:
|
56 |
+
print("The URL is invalid. Make sure that you have included the submission id")
|
57 |
+
|
58 |
+
submission.comments.replace_more(limit=0)
|
59 |
+
|
60 |
+
cols = [
|
61 |
+
"text",
|
62 |
+
"score",
|
63 |
+
"id",
|
64 |
+
"parent_id",
|
65 |
+
"submission_title",
|
66 |
+
"submission_score",
|
67 |
+
"submission_id"
|
68 |
+
]
|
69 |
+
rows = []
|
70 |
+
|
71 |
+
for comment in submission.comments.list():
|
72 |
+
|
73 |
+
if comment.stickied:
|
74 |
+
continue
|
75 |
+
|
76 |
+
data = [
|
77 |
+
comment.body,
|
78 |
+
comment.score,
|
79 |
+
comment.id,
|
80 |
+
comment.parent_id,
|
81 |
+
submission.title,
|
82 |
+
submission.score,
|
83 |
+
submission.id,
|
84 |
+
]
|
85 |
+
|
86 |
+
rows.append(data)
|
87 |
+
|
88 |
+
df = pd.DataFrame(data=rows, columns=cols)
|
89 |
+
|
90 |
+
if debug:
|
91 |
+
# save for debugging to avoid sending tons of requests to reddit
|
92 |
+
|
93 |
+
df.to_csv(f'{submission.id}_comments.csv', index=False)
|
94 |
+
|
95 |
+
return df
|
96 |
+
|
97 |
+
|
98 |
+
def summarizer(url: str, summary_length: str = "Short") -> str:
|
99 |
+
|
100 |
+
# pushshift.io submission comments api doesn't work so have to use praw
|
101 |
+
df = getComments(url=url)
|
102 |
+
chunked_df = preprocessData(df)
|
103 |
+
|
104 |
+
submission_title = df.submission_title.unique()[0]
|
105 |
+
|
106 |
+
nlp = pipeline('summarization', model="model/")
|
107 |
|
108 |
lst_summaries = []
|
109 |
|
110 |
+
for grp in chunked_df:
|
111 |
# treating a group of comments as one block of text
|
112 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
113 |
lst_summaries.append(result)
|
|
|
116 |
|
117 |
if summary_length == "Short":
|
118 |
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
119 |
+
return submission_title + '\n' + '\n' + thread_summary
|
120 |
else:
|
121 |
+
return submission_title + '\n' + '\n' + stext
|
122 |
|
123 |
|
124 |
if __name__ == "__main__":
|
125 |
+
if not os.path.isfile('./api_params.toml'):
|
126 |
+
print("""
|
127 |
+
Could not find api params config file in directory.
|
128 |
+
Please create api_params.toml by following the instructions in the README.
|
129 |
+
""")
|
130 |
+
sys.exit(1)
|
131 |
|
132 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
133 |
submission_url = gr.Textbox(label='Post URL')
|
|
|
140 |
|
141 |
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
|
142 |
|
143 |
+
try:
|
144 |
+
demo.launch()
|
145 |
+
except KeyboardInterrupt:
|
146 |
+
gr.close_all()
|
cdk.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"app": "python3 cdk.py"
|
3 |
-
}
|
|
|
|
|
|
|
|
cdk.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from pathlib import Path
|
3 |
-
from constructs import Construct
|
4 |
-
from aws_cdk import App, Stack, Environment, Duration, CfnOutput
|
5 |
-
from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
|
6 |
-
from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
|
7 |
-
|
8 |
-
my_environment = Environment(
|
9 |
-
account=os.environ["CDK_DEFAULT_ACCOUNT"],
|
10 |
-
region=os.environ["CDK_DEFAULT_REGION"])
|
11 |
-
|
12 |
-
|
13 |
-
class GradioLambda(Stack):
|
14 |
-
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
|
15 |
-
super().__init__(scope, construct_id, **kwargs)
|
16 |
-
|
17 |
-
# create function
|
18 |
-
lambda_fn = DockerImageFunction(
|
19 |
-
self,
|
20 |
-
"GradioApp",
|
21 |
-
code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),
|
22 |
-
architecture=Architecture.X86_64,
|
23 |
-
memory_size=3008,
|
24 |
-
timeout=Duration.minutes(2),
|
25 |
-
)
|
26 |
-
# add HTTPS url
|
27 |
-
fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
|
28 |
-
CfnOutput(self, "functionUrl", value=fn_url.url)
|
29 |
-
|
30 |
-
|
31 |
-
app = App()
|
32 |
-
rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)
|
33 |
-
|
34 |
-
app.synth()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
-
boto3==1.21.32
|
2 |
-
constructs==10.1.263
|
3 |
gradio==3.19.1
|
4 |
pandas==1.4.2
|
5 |
praw==7.6.0
|
6 |
transformers==4.26.1
|
7 |
-
|
8 |
-
|
9 |
-
torch==1.13.0+cpu
|
|
|
|
|
|
|
1 |
gradio==3.19.1
|
2 |
pandas==1.4.2
|
3 |
praw==7.6.0
|
4 |
transformers==4.26.1
|
5 |
+
torch==1.13.0
|
6 |
+
toml
|
|
scraper.py
DELETED
@@ -1,63 +0,0 @@
|
|
1 |
-
import praw
|
2 |
-
import praw.exceptions as redditexception
|
3 |
-
import pandas as pd
|
4 |
-
import boto3
|
5 |
-
|
6 |
-
|
7 |
-
def getComments(url):
|
8 |
-
|
9 |
-
ssm = boto3.client('ssm')
|
10 |
-
cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
|
11 |
-
csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
|
12 |
-
user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
|
13 |
-
|
14 |
-
cols = [
|
15 |
-
"text",
|
16 |
-
"score",
|
17 |
-
"id",
|
18 |
-
"parent_id",
|
19 |
-
"submission_title",
|
20 |
-
"submission_score",
|
21 |
-
"submission_id"
|
22 |
-
]
|
23 |
-
|
24 |
-
reddit = praw.Reddit(
|
25 |
-
client_id=cid , client_secret=csecret, user_agent=user_agent
|
26 |
-
)
|
27 |
-
|
28 |
-
try:
|
29 |
-
submission = reddit.submission(url=url)
|
30 |
-
except redditexception.InvalidURL:
|
31 |
-
print("The URL is invalid. Make sure that you have included the submission id")
|
32 |
-
|
33 |
-
submission.comments.replace_more(limit=0)
|
34 |
-
rows = []
|
35 |
-
|
36 |
-
for comment in submission.comments.list():
|
37 |
-
|
38 |
-
if comment.stickied:
|
39 |
-
continue
|
40 |
-
|
41 |
-
data = [
|
42 |
-
comment.body,
|
43 |
-
comment.score,
|
44 |
-
comment.id,
|
45 |
-
comment.parent_id,
|
46 |
-
submission.title,
|
47 |
-
submission.score,
|
48 |
-
submission.id,
|
49 |
-
]
|
50 |
-
|
51 |
-
rows.append(data)
|
52 |
-
|
53 |
-
df = pd.DataFrame(data=rows, columns=cols)
|
54 |
-
|
55 |
-
# save for testing to avoid sending tons of requests to reddit
|
56 |
-
|
57 |
-
# df.to_csv(f'{submission.id}_comments.csv', index=False)
|
58 |
-
|
59 |
-
return df
|
60 |
-
|
61 |
-
|
62 |
-
if __name__ == "__main__":
|
63 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|