Commit cbcecfb by kkastr
Parent: eec218d

Condensed the scraper into the main app file, changed the API keys config to use TOML, and cleaned up the AWS files since the app will no longer be deployed there.

Files changed (8):
  1. .gitignore +1 -0
  2. Dockerfile +0 -13
  3. README.md +1 -0
  4. app.py +93 -17
  5. cdk.json +0 -3
  6. cdk.py +0 -34
  7. requirements.txt +2 -5
  8. scraper.py +0 -63
.gitignore CHANGED
@@ -3,6 +3,7 @@ model/
 archive/
 cdk.out/

+api_params.toml
 api_keys.py
 *.csv

Dockerfile DELETED
@@ -1,13 +0,0 @@
-# Dockerfile
-FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
-
-COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
-WORKDIR /var/task
-
-COPY requirements.txt ./requirements.txt
-RUN python -m pip install -r requirements.txt
-
-COPY app.py ./
-COPY scraper.py ./
-COPY model/ ./model/
-CMD ["python3", "app.py"]
README.md ADDED
@@ -0,0 +1 @@
+# Reddit Thread Summarizer (Gradio)
app.py CHANGED
@@ -1,8 +1,12 @@
-import pandas as pd
-import gradio as gr
+import os
 import re
+import sys
+import toml
+import praw
+import gradio as gr
+import pandas as pd
+import praw.exceptions
 from transformers import pipeline


 def chunk(a):
@@ -11,17 +15,9 @@ def chunk(a):
     return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


-def preprocessText(df):
+def preprocessData(df):
     df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
     df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
-    return df
-
-
-def summarizer(url: str, summary_length: str = "Short") -> str:
-
-    # pushshift.io submission comments api doesn't work so have to use praw
-
-    df = preprocessText(getComments(url=url))

     smax = df.score.max()

@@ -36,11 +32,82 @@ def summarizer(url: str, summary_length: str = "Short") -> str:
     # chunking to handle giving the model too large of an input which crashes
     chunked = list(chunk(df.text))

-    nlp = pipeline('summarization', model="./model/")
+    return chunked
+
+
+def getComments(url, debug=False):
+
+    api_keys = toml.load('./api_params.toml')
+
+    reddit = praw.Reddit(
+        client_id=api_keys['client_id'],
+        client_secret=api_keys['client_secret'],
+        user_agent=api_keys['user_agent']
+    )
+
+    try:
+        submission = reddit.submission(url=url)
+        if debug and os.path.isfile(f'./{submission.id}_comments.csv'):
+            df = pd.read_csv(f"./{submission.id}_comments.csv")
+            return df
+        else:
+            pass
+    except praw.exceptions.InvalidURL:
+        print("The URL is invalid. Make sure that you have included the submission id")
+
+    submission.comments.replace_more(limit=0)
+
+    cols = [
+        "text",
+        "score",
+        "id",
+        "parent_id",
+        "submission_title",
+        "submission_score",
+        "submission_id"
+    ]
+    rows = []
+
+    for comment in submission.comments.list():
+
+        if comment.stickied:
+            continue
+
+        data = [
+            comment.body,
+            comment.score,
+            comment.id,
+            comment.parent_id,
+            submission.title,
+            submission.score,
+            submission.id,
+        ]
+
+        rows.append(data)
+
+    df = pd.DataFrame(data=rows, columns=cols)
+
+    if debug:
+        # save for debugging to avoid sending tons of requests to reddit
+
+        df.to_csv(f'{submission.id}_comments.csv', index=False)
+
+    return df
+
+
+def summarizer(url: str, summary_length: str = "Short") -> str:
+
+    # pushshift.io submission comments api doesn't work so have to use praw
+    df = getComments(url=url)
+    chunked_df = preprocessData(df)
+
+    submission_title = df.submission_title.unique()[0]
+
+    nlp = pipeline('summarization', model="model/")

     lst_summaries = []

-    for grp in chunked:
+    for grp in chunked_df:
         # treating a group of comments as one block of text
         result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
         lst_summaries.append(result)
@@ -49,12 +116,18 @@ def summarizer(url: str, summary_length: str = "Short") -> str:

     if summary_length == "Short":
         thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
-        return df.submission_title.unique()[0] + '\n' + '\n' + thread_summary
+        return submission_title + '\n' + '\n' + thread_summary
     else:
-        return df.submission_title.unique()[0] + '\n' + '\n' + stext
+        return submission_title + '\n' + '\n' + stext


 if __name__ == "__main__":
+    if not os.path.isfile('./api_params.toml'):
+        print("""
+        Could not find api params config file in directory.
+        Please create api_params.toml by following the instructions in the README.
+        """)
+        sys.exit(1)

     with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
         submission_url = gr.Textbox(label='Post URL')
@@ -67,4 +140,7 @@ if __name__ == "__main__":

     sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)

-    demo.launch(server_port=8080, enable_queue=False)
+    try:
+        demo.launch()
+    except KeyboardInterrupt:
+        gr.close_all()
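For reference, getComments() above now reads Reddit credentials from api_params.toml instead of AWS SSM. A minimal sketch of creating that file, assuming only the three keys the code accesses (client_id, client_secret, user_agent); the values below are placeholders, not real credentials:

```python
# Hypothetical one-off script for generating api_params.toml.
# The keys match what app.py reads via toml.load('./api_params.toml');
# replace the placeholder values with your own Reddit app credentials.
import toml

params = {
    "client_id": "YOUR_CLIENT_ID",            # placeholder
    "client_secret": "YOUR_CLIENT_SECRET",    # placeholder
    "user_agent": "reddit-thread-summarizer by u/your_username",  # placeholder
}

with open("api_params.toml", "w") as f:
    toml.dump(params, f)
```

Since api_params.toml was added to .gitignore in this commit, the generated file stays local and the credentials are never committed.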
cdk.json DELETED
@@ -1,3 +0,0 @@
-{
-    "app": "python3 cdk.py"
-}
cdk.py DELETED
@@ -1,34 +0,0 @@
-import os
-from pathlib import Path
-from constructs import Construct
-from aws_cdk import App, Stack, Environment, Duration, CfnOutput
-from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
-from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
-
-my_environment = Environment(
-    account=os.environ["CDK_DEFAULT_ACCOUNT"],
-    region=os.environ["CDK_DEFAULT_REGION"])
-
-
-class GradioLambda(Stack):
-    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
-        super().__init__(scope, construct_id, **kwargs)
-
-        # create function
-        lambda_fn = DockerImageFunction(
-            self,
-            "GradioApp",
-            code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),
-            architecture=Architecture.X86_64,
-            memory_size=3008,
-            timeout=Duration.minutes(2),
-        )
-        # add HTTPS url
-        fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
-        CfnOutput(self, "functionUrl", value=fn_url.url)
-
-
-app = App()
-rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)
-
-app.synth()
requirements.txt CHANGED
@@ -1,9 +1,6 @@
-boto3==1.21.32
-constructs==10.1.263
 gradio==3.19.1
 pandas==1.4.2
 praw==7.6.0
 transformers==4.26.1
-
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==1.13.0+cpu
+torch==1.13.0
+toml
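With the CPU-only extra index removed, torch==1.13.0 now resolves from the default PyPI index. A quick sanity check after `pip install -r requirements.txt`, assuming the local model/ directory that app.py loads the summarizer from is already present (this script is illustrative, not part of the repo):

```python
# Hypothetical pre-flight check: confirm the pinned dependencies import
# and the local summarization model directory exists before launching app.py.
import importlib
import os

for module in ("gradio", "pandas", "praw", "transformers", "torch", "toml"):
    importlib.import_module(module)  # raises ImportError if a dependency is missing

assert os.path.isdir("model/"), "app.py expects the summarization model in ./model/"
print("All requirements import cleanly and model/ is present.")
```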
 
scraper.py DELETED
@@ -1,63 +0,0 @@
-import praw
-import praw.exceptions as redditexception
-import pandas as pd
-import boto3
-
-
-def getComments(url):
-
-    ssm = boto3.client('ssm')
-    cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
-    csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
-    user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
-
-    cols = [
-        "text",
-        "score",
-        "id",
-        "parent_id",
-        "submission_title",
-        "submission_score",
-        "submission_id"
-    ]
-
-    reddit = praw.Reddit(
-        client_id=cid, client_secret=csecret, user_agent=user_agent
-    )
-
-    try:
-        submission = reddit.submission(url=url)
-    except redditexception.InvalidURL:
-        print("The URL is invalid. Make sure that you have included the submission id")
-
-    submission.comments.replace_more(limit=0)
-    rows = []
-
-    for comment in submission.comments.list():
-
-        if comment.stickied:
-            continue
-
-        data = [
-            comment.body,
-            comment.score,
-            comment.id,
-            comment.parent_id,
-            submission.title,
-            submission.score,
-            submission.id,
-        ]
-
-        rows.append(data)
-
-    df = pd.DataFrame(data=rows, columns=cols)
-
-    # save for testing to avoid sending tons of requests to reddit
-
-    # df.to_csv(f'{submission.id}_comments.csv', index=False)
-
-    return df
-
-
-if __name__ == "__main__":
-    pass