kkastr committed on
Commit
1d197a9
1 Parent(s): 3b3dbc9

Rename files; add preliminary setup for deployment on AWS (pending permission fix)

Browse files
Files changed (8) hide show
  1. .gitignore +2 -0
  2. Dockerfile +13 -0
  3. thread_summarizer.py → app.py +5 -5
  4. cdk.json +3 -0
  5. cdk.py +34 -0
  6. download_model.py +8 -0
  7. requirements.txt +9 -0
  8. scraper.py +13 -3
.gitignore CHANGED
@@ -1,6 +1,8 @@
1
  data/
2
  model/
3
  archive/
 
 
4
  api_keys.py
5
  *.csv
6
 
 
1
  data/
2
  model/
3
  archive/
4
+ cdk.out
5
+
6
  api_keys.py
7
  *.csv
8
 
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+ FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
3
+
4
+ COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
5
+ WORKDIR /var/task
6
+
7
+ COPY requirements.txt ./requirements.txt
8
+ RUN python -m pip install -r requirements.txt
9
+
10
+ COPY app.py ./
11
+ COPY scraper.py ./
12
+ COPY model/ ./model/
13
+ CMD ["python3", "app.py"]
thread_summarizer.py → app.py RENAMED
@@ -17,7 +17,7 @@ def preprocessText(df):
17
  return df
18
 
19
 
20
- def main(url: str, summary_length: str = "Short") -> str:
21
 
22
  # pushshift.io submission comments api doesn't work so have to use praw
23
 
@@ -29,14 +29,14 @@ def main(url: str, summary_length: str = "Short") -> str:
29
 
30
  df = df[df.score >= threshold]
31
 
32
- # empirically, having more than 200 comments doesn't change much, but slows down the code.
33
  if len(df.text) >= 200:
34
  df = df[:200]
35
 
36
  # chunking to handle giving the model too large of an input which crashes
37
  chunked = list(chunk(df.text))
38
 
39
- nlp = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")
40
 
41
  lst_summaries = []
42
 
@@ -65,6 +65,6 @@ if __name__ == "__main__":
65
 
66
  summary = gr.Textbox(label='Comment Summary')
67
 
68
- sub_btn.click(fn=main, inputs=[submission_url, length_choice], outputs=summary)
69
 
70
- demo.launch()
 
17
  return df
18
 
19
 
20
+ def summarizer(url: str, summary_length: str = "Short") -> str:
21
 
22
  # pushshift.io submission comments api doesn't work so have to use praw
23
 
 
29
 
30
  df = df[df.score >= threshold]
31
 
32
+ # empirically, having more than 200 comments doesn't change much but slows down the summarizer.
33
  if len(df.text) >= 200:
34
  df = df[:200]
35
 
36
  # chunking to handle giving the model too large of an input which crashes
37
  chunked = list(chunk(df.text))
38
 
39
+ nlp = pipeline('summarization', model="./model/")
40
 
41
  lst_summaries = []
42
 
 
65
 
66
  summary = gr.Textbox(label='Comment Summary')
67
 
68
+ sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
69
 
70
+ demo.launch(server_port=8080, enable_queue=False)
cdk.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "app": "python3 cdk.py"
3
+ }
cdk.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from constructs import Construct
4
+ from aws_cdk import App, Stack, Environment, Duration, CfnOutput
5
+ from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
6
+ from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
7
+
8
+ my_environment = Environment(
9
+ account=os.environ["CDK_DEFAULT_ACCOUNT"],
10
+ region=os.environ["CDK_DEFAULT_REGION"])
11
+
12
+
13
+ class GradioLambda(Stack):
14
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
15
+ super().__init__(scope, construct_id, **kwargs)
16
+
17
+ # create function
18
+ lambda_fn = DockerImageFunction(
19
+ self,
20
+ "GradioApp",
21
+ code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),
22
+ architecture=Architecture.X86_64,
23
+ memory_size=3008,
24
+ timeout=Duration.minutes(2),
25
+ )
26
+ # add HTTPS url
27
+ fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
28
+ CfnOutput(self, "functionUrl", value=fn_url.url)
29
+
30
+
31
+ app = App()
32
+ rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)
33
+
34
+ app.synth()
download_model.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+
3
+ tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
4
+
5
+ model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
6
+
7
+ tokenizer.save_pretrained("./model")
8
+ model.save_pretrained("./model")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ boto3==1.21.32
2
+ constructs==10.1.263
3
+ gradio==3.19.1
4
+ pandas==1.4.2
5
+ praw==7.6.0
6
+ transformers==4.26.1
7
+
8
+ --extra-index-url https://download.pytorch.org/whl/cpu
9
+ torch==1.13.0+cpu
scraper.py CHANGED
@@ -1,10 +1,16 @@
1
  import praw
 
2
  import pandas as pd
3
- from api_keys import client_id, client_secret, user_agent, username
4
 
5
 
6
  def getComments(url):
7
 
 
 
 
 
 
8
  cols = [
9
  "text",
10
  "score",
@@ -16,10 +22,14 @@ def getComments(url):
16
  ]
17
 
18
  reddit = praw.Reddit(
19
- client_id=client_id, client_secret=client_secret, user_agent=user_agent, username=username
20
  )
21
 
22
- submission = reddit.submission(url=url)
 
 
 
 
23
  submission.comments.replace_more(limit=0)
24
  rows = []
25
 
 
1
  import praw
2
+ import praw.exceptions as redditexception
3
  import pandas as pd
4
+ import boto3
5
 
6
 
7
  def getComments(url):
8
 
9
+ ssm = boto3.client('ssm')
10
+ cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
11
+ csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
12
+ user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
13
+
14
  cols = [
15
  "text",
16
  "score",
 
22
  ]
23
 
24
  reddit = praw.Reddit(
25
+ client_id=cid , client_secret=csecret, user_agent=user_agent
26
  )
27
 
28
+ try:
29
+ submission = reddit.submission(url=url)
30
+ except redditexception.InvalidURL:
31
+ print("The URL is invalid. Make sure that you have included the submission id")
32
+
33
  submission.comments.replace_more(limit=0)
34
  rows = []
35