Spaces:

kkastr
/

summit

Runtime error

kkastr committed on Mar 1, 2023

Commit

1d197a9

•

1 Parent(s): 3b3dbc9

Rename files. Add preliminary setup for deployment on AWS (pending permission fix).

Files changed (8) hide show

.gitignore CHANGED Viewed

@@ -1,6 +1,8 @@
 data/
 model/
 archive/
 api_keys.py
 *.csv

 data/
 model/
 archive/
+# AWS CDK synthesis output directory
+cdk.out
 api_keys.py
 *.csv

Dockerfile ADDED Viewed

+# Dockerfile: container image that runs the Gradio app (app.py) with the
+# AWS Lambda adapter copied in as an extension.
+FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
+# Copy the lambda-adapter binary out of the aws-lambda-adapter image into /opt/extensions.
+COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
+WORKDIR /var/task
+COPY requirements.txt  ./requirements.txt
+RUN python -m pip install -r requirements.txt
+COPY app.py  ./
+COPY scraper.py  ./
+# model/ is git-ignored, so it must already exist in the build context
+# (download_model.py saves the tokenizer and model to ./model).
+COPY model/  ./model/
+CMD ["python3", "app.py"]

thread_summarizer.py → app.py RENAMED Viewed

@@ -17,7 +17,7 @@ def preprocessText(df):
     return df
-def main(url: str, summary_length: str = "Short") -> str:
     # pushshift.io submission comments api doesn't work so have to use praw
@@ -29,14 +29,14 @@ def main(url: str, summary_length: str = "Short") -> str:
     df = df[df.score >= threshold]
-    # empirically, having more than 200 comments doesn't change much, but slows down the code.
     if len(df.text) >= 200:
         df = df[:200]
     # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))
-    nlp = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")
     lst_summaries = []
@@ -65,6 +65,6 @@ if __name__ == "__main__":
         summary = gr.Textbox(label='Comment Summary')
-        sub_btn.click(fn=main, inputs=[submission_url, length_choice], outputs=summary)
-    demo.launch()

     return df
+def summarizer(url: str, summary_length: str = "Short") -> str:
     # pushshift.io submission comments api doesn't work so have to use praw
     df = df[df.score >= threshold]
+    # empirically, having more than 200 comments doesn't change much but slows down the summarizer.
     if len(df.text) >= 200:
         df = df[:200]
     # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))
+    nlp = pipeline('summarization', model="./model/")
     lst_summaries = []
         summary = gr.Textbox(label='Comment Summary')
+        sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
+    demo.launch(server_port=8080, enable_queue=False)

cdk.json ADDED Viewed

+{
+    "app": "python3 cdk.py"
+}

cdk.py ADDED Viewed

+import os
+from pathlib import Path
+from constructs import Construct
+from aws_cdk import App, Stack, Environment, Duration, CfnOutput
+from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
+from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
+my_environment = Environment(  # deployment target (account/region) for the stack, read from environment variables
+    account=os.environ["CDK_DEFAULT_ACCOUNT"],  # subscript lookup: raises KeyError at import time if the variable is unset
+    region=os.environ["CDK_DEFAULT_REGION"])
+class GradioLambda(Stack):  # CDK stack: one Docker-image Lambda function plus a function URL exposed as a stack output
+    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:  # extra kwargs (e.g. env) are forwarded unchanged to Stack
+        super().__init__(scope, construct_id, **kwargs)
+        # Create the Lambda function from a Docker image asset built with the file named "Dockerfile".
+        lambda_fn = DockerImageFunction(
+            self,
+            "GradioApp",
+            code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),  # asset directory is the process's current working directory, so run this from the directory holding the Dockerfile
+            architecture=Architecture.X86_64,
+            memory_size=3008,  # presumably MB per the CDK API — TODO confirm
+            timeout=Duration.minutes(2),
+        )
+        # Add a function URL. NOTE(review): auth_type NONE looks like it leaves the URL unauthenticated — confirm public access is intended.
+        fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
+        CfnOutput(self, "functionUrl", value=fn_url.url)  # publish the URL as the "functionUrl" stack output
+app = App()  # root CDK application object
+rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)  # NOTE(review): the name says "rust" but this is the Gradio stack — looks like a template leftover; the variable is not used afterwards in this file
+app.synth()  # module-level side effect: synthesis runs whenever this script is executed

download_model.py ADDED Viewed

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")  # load the tokenizer for the same checkpoint name the app previously passed to pipeline()
+model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")  # load the matching seq2seq model
+tokenizer.save_pretrained("./model")  # tokenizer and model are both written to ./model, the directory the Dockerfile copies and app.py loads via model="./model/"
+model.save_pretrained("./model")

requirements.txt ADDED Viewed

+boto3==1.21.32
+constructs==10.1.263
+gradio==3.19.1
+pandas==1.4.2
+praw==7.6.0
+transformers==4.26.1
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==1.13.0+cpu

scraper.py CHANGED Viewed

@@ -1,10 +1,16 @@
 import praw
 import pandas as pd
-from api_keys import client_id, client_secret, user_agent, username
 def getComments(url):
     cols = [
         "text",
         "score",
@@ -16,10 +22,14 @@ def getComments(url):
     ]
     reddit = praw.Reddit(
-        client_id=client_id, client_secret=client_secret, user_agent=user_agent, username=username
     )
-    submission = reddit.submission(url=url)
     submission.comments.replace_more(limit=0)
     rows = []

 import praw
+import praw.exceptions as redditexception
 import pandas as pd
+import boto3
 def getComments(url):
+    ssm = boto3.client('ssm')
+    cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
+    csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
+    user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
     cols = [
         "text",
         "score",
     ]
     reddit = praw.Reddit(
+        client_id=cid , client_secret=csecret, user_agent=user_agent
     )
+    try:
+        submission = reddit.submission(url=url)
+    except redditexception.InvalidURL:
+        print("The URL is invalid. Make sure that you have included the submission id")
     submission.comments.replace_more(limit=0)
     rows = []