kkastr
committed on
Commit
•
cbcecfb
1
Parent(s):
eec218d
condensed the scraper into the main app file. changed the API keys config to use TOML. cleaned up the AWS files since we will no longer be deploying there
Browse files- .gitignore +1 -0
- Dockerfile +0 -13
- README.md +1 -0
- app.py +93 -17
- cdk.json +0 -3
- cdk.py +0 -34
- requirements.txt +2 -5
- scraper.py +0 -63
.gitignore
CHANGED
@@ -3,6 +3,7 @@ model/
|
|
3 |
archive/
|
4 |
cdk.out/
|
5 |
|
|
|
6 |
api_keys.py
|
7 |
*.csv
|
8 |
|
|
|
3 |
archive/
|
4 |
cdk.out/
|
5 |
|
6 |
+
api_params.toml
|
7 |
api_keys.py
|
8 |
*.csv
|
9 |
|
Dockerfile
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
# Dockerfile
|
2 |
-
FROM public.ecr.aws/docker/library/python:3.9.16-slim-buster
|
3 |
-
|
4 |
-
COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.6.0 /lambda-adapter /opt/extensions/lambda-adapter
|
5 |
-
WORKDIR /var/task
|
6 |
-
|
7 |
-
COPY requirements.txt ./requirements.txt
|
8 |
-
RUN python -m pip install -r requirements.txt
|
9 |
-
|
10 |
-
COPY app.py ./
|
11 |
-
COPY scraper.py ./
|
12 |
-
COPY model/ ./model/
|
13 |
-
CMD ["python3", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Reddit Thread Summarizer (Gradio)
|
app.py
CHANGED
@@ -1,8 +1,12 @@
|
|
1 |
-
import
|
2 |
-
import gradio as gr
|
3 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from transformers import pipeline
|
5 |
-
from scraper import getComments
|
6 |
|
7 |
|
8 |
def chunk(a):
|
@@ -11,17 +15,9 @@ def chunk(a):
|
|
11 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
12 |
|
13 |
|
14 |
-
def
|
15 |
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
|
16 |
df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
|
17 |
-
return df
|
18 |
-
|
19 |
-
|
20 |
-
def summarizer(url: str, summary_length: str = "Short") -> str:
|
21 |
-
|
22 |
-
# pushshift.io submission comments api doesn't work so have to use praw
|
23 |
-
|
24 |
-
df = preprocessText(getComments(url=url))
|
25 |
|
26 |
smax = df.score.max()
|
27 |
|
@@ -36,11 +32,82 @@ def summarizer(url: str, summary_length: str = "Short") -> str:
|
|
36 |
# chunking to handle giving the model too large of an input which crashes
|
37 |
chunked = list(chunk(df.text))
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
lst_summaries = []
|
42 |
|
43 |
-
for grp in
|
44 |
# treating a group of comments as one block of text
|
45 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
46 |
lst_summaries.append(result)
|
@@ -49,12 +116,18 @@ def summarizer(url: str, summary_length: str = "Short") -> str:
|
|
49 |
|
50 |
if summary_length == "Short":
|
51 |
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
52 |
-
return
|
53 |
else:
|
54 |
-
return
|
55 |
|
56 |
|
57 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
60 |
submission_url = gr.Textbox(label='Post URL')
|
@@ -67,4 +140,7 @@ if __name__ == "__main__":
|
|
67 |
|
68 |
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
1 |
+
import os
|
|
|
2 |
import re
|
3 |
+
import sys
|
4 |
+
import toml
|
5 |
+
import praw
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
import praw.exceptions
|
9 |
from transformers import pipeline
|
|
|
10 |
|
11 |
|
12 |
def chunk(a):
|
|
|
15 |
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
|
16 |
|
17 |
|
18 |
+
def preprocessData(df):
|
19 |
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+", "", x, flags=re.M))
|
20 |
df["text"] = df["text"].apply(lambda x: re.sub(r"^>.+", "", x, flags=re.M))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
smax = df.score.max()
|
23 |
|
|
|
32 |
# chunking to handle giving the model too large of an input which crashes
|
33 |
chunked = list(chunk(df.text))
|
34 |
|
35 |
+
return chunked
|
36 |
+
|
37 |
+
|
38 |
+
def getComments(url, debug=False):
|
39 |
+
|
40 |
+
api_keys = toml.load('./api_params.toml')
|
41 |
+
|
42 |
+
reddit = praw.Reddit(
|
43 |
+
client_id=api_keys['client_id'] ,
|
44 |
+
client_secret=api_keys['client_secret'] ,
|
45 |
+
user_agent=api_keys['user_agent']
|
46 |
+
)
|
47 |
+
|
48 |
+
try:
|
49 |
+
submission = reddit.submission(url=url)
|
50 |
+
if debug and os.path.isfile(f'./{submission.id}_comments.csv'):
|
51 |
+
df = pd.read_csv(f"./{submission.id}_comments.csv")
|
52 |
+
return df
|
53 |
+
else:
|
54 |
+
pass
|
55 |
+
except praw.exceptions.InvalidURL:
|
56 |
+
print("The URL is invalid. Make sure that you have included the submission id")
|
57 |
+
|
58 |
+
submission.comments.replace_more(limit=0)
|
59 |
+
|
60 |
+
cols = [
|
61 |
+
"text",
|
62 |
+
"score",
|
63 |
+
"id",
|
64 |
+
"parent_id",
|
65 |
+
"submission_title",
|
66 |
+
"submission_score",
|
67 |
+
"submission_id"
|
68 |
+
]
|
69 |
+
rows = []
|
70 |
+
|
71 |
+
for comment in submission.comments.list():
|
72 |
+
|
73 |
+
if comment.stickied:
|
74 |
+
continue
|
75 |
+
|
76 |
+
data = [
|
77 |
+
comment.body,
|
78 |
+
comment.score,
|
79 |
+
comment.id,
|
80 |
+
comment.parent_id,
|
81 |
+
submission.title,
|
82 |
+
submission.score,
|
83 |
+
submission.id,
|
84 |
+
]
|
85 |
+
|
86 |
+
rows.append(data)
|
87 |
+
|
88 |
+
df = pd.DataFrame(data=rows, columns=cols)
|
89 |
+
|
90 |
+
if debug:
|
91 |
+
# save for debugging to avoid sending tons of requests to reddit
|
92 |
+
|
93 |
+
df.to_csv(f'{submission.id}_comments.csv', index=False)
|
94 |
+
|
95 |
+
return df
|
96 |
+
|
97 |
+
|
98 |
+
def summarizer(url: str, summary_length: str = "Short") -> str:
|
99 |
+
|
100 |
+
# pushshift.io submission comments api doesn't work so have to use praw
|
101 |
+
df = getComments(url=url)
|
102 |
+
chunked_df = preprocessData(df)
|
103 |
+
|
104 |
+
submission_title = df.submission_title.unique()[0]
|
105 |
+
|
106 |
+
nlp = pipeline('summarization', model="model/")
|
107 |
|
108 |
lst_summaries = []
|
109 |
|
110 |
+
for grp in chunked_df:
|
111 |
# treating a group of comments as one block of text
|
112 |
result = nlp(grp.str.cat(), max_length=500)[0]["summary_text"]
|
113 |
lst_summaries.append(result)
|
|
|
116 |
|
117 |
if summary_length == "Short":
|
118 |
thread_summary = nlp(stext, max_length=500)[0]["summary_text"].replace(" .", ".")
|
119 |
+
return submission_title + '\n' + '\n' + thread_summary
|
120 |
else:
|
121 |
+
return submission_title + '\n' + '\n' + stext
|
122 |
|
123 |
|
124 |
if __name__ == "__main__":
|
125 |
+
if not os.path.isfile('./api_params.toml'):
|
126 |
+
print("""
|
127 |
+
Could not find api params config file in directory.
|
128 |
+
Please create api_params.toml by following the instructions in the README.
|
129 |
+
""")
|
130 |
+
sys.exit(1)
|
131 |
|
132 |
with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
|
133 |
submission_url = gr.Textbox(label='Post URL')
|
|
|
140 |
|
141 |
sub_btn.click(fn=summarizer, inputs=[submission_url, length_choice], outputs=summary)
|
142 |
|
143 |
+
try:
|
144 |
+
demo.launch()
|
145 |
+
except KeyboardInterrupt:
|
146 |
+
gr.close_all()
|
cdk.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"app": "python3 cdk.py"
|
3 |
-
}
|
|
|
|
|
|
|
|
cdk.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from pathlib import Path
|
3 |
-
from constructs import Construct
|
4 |
-
from aws_cdk import App, Stack, Environment, Duration, CfnOutput
|
5 |
-
from aws_cdk.aws_lambda import DockerImageFunction, DockerImageCode
|
6 |
-
from aws_cdk.aws_lambda import Architecture, FunctionUrlAuthType
|
7 |
-
|
8 |
-
my_environment = Environment(
|
9 |
-
account=os.environ["CDK_DEFAULT_ACCOUNT"],
|
10 |
-
region=os.environ["CDK_DEFAULT_REGION"])
|
11 |
-
|
12 |
-
|
13 |
-
class GradioLambda(Stack):
|
14 |
-
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
|
15 |
-
super().__init__(scope, construct_id, **kwargs)
|
16 |
-
|
17 |
-
# create function
|
18 |
-
lambda_fn = DockerImageFunction(
|
19 |
-
self,
|
20 |
-
"GradioApp",
|
21 |
-
code=DockerImageCode.from_image_asset(str(Path.cwd()), file="Dockerfile"),
|
22 |
-
architecture=Architecture.X86_64,
|
23 |
-
memory_size=3008,
|
24 |
-
timeout=Duration.minutes(2),
|
25 |
-
)
|
26 |
-
# add HTTPS url
|
27 |
-
fn_url = lambda_fn.add_function_url(auth_type=FunctionUrlAuthType.NONE)
|
28 |
-
CfnOutput(self, "functionUrl", value=fn_url.url)
|
29 |
-
|
30 |
-
|
31 |
-
app = App()
|
32 |
-
rust_lambda = GradioLambda(app, "GradioLambda", env=my_environment)
|
33 |
-
|
34 |
-
app.synth()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
-
boto3==1.21.32
|
2 |
-
constructs==10.1.263
|
3 |
gradio==3.19.1
|
4 |
pandas==1.4.2
|
5 |
praw==7.6.0
|
6 |
transformers==4.26.1
|
7 |
-
|
8 |
-
|
9 |
-
torch==1.13.0+cpu
|
|
|
|
|
|
|
1 |
gradio==3.19.1
|
2 |
pandas==1.4.2
|
3 |
praw==7.6.0
|
4 |
transformers==4.26.1
|
5 |
+
torch==1.13.0
|
6 |
+
toml
|
|
scraper.py
DELETED
@@ -1,63 +0,0 @@
|
|
1 |
-
import praw
|
2 |
-
import praw.exceptions as redditexception
|
3 |
-
import pandas as pd
|
4 |
-
import boto3
|
5 |
-
|
6 |
-
|
7 |
-
def getComments(url):
|
8 |
-
|
9 |
-
ssm = boto3.client('ssm')
|
10 |
-
cid = ssm.get_parameter(Name='client_id', WithDecryption=True)['Parameter']['Value']
|
11 |
-
csecret = ssm.get_parameter(Name='client_secret', WithDecryption=True)['Parameter']['Value']
|
12 |
-
user_agent = ssm.get_parameter(Name='user_agent', WithDecryption=True)['Parameter']['Value']
|
13 |
-
|
14 |
-
cols = [
|
15 |
-
"text",
|
16 |
-
"score",
|
17 |
-
"id",
|
18 |
-
"parent_id",
|
19 |
-
"submission_title",
|
20 |
-
"submission_score",
|
21 |
-
"submission_id"
|
22 |
-
]
|
23 |
-
|
24 |
-
reddit = praw.Reddit(
|
25 |
-
client_id=cid , client_secret=csecret, user_agent=user_agent
|
26 |
-
)
|
27 |
-
|
28 |
-
try:
|
29 |
-
submission = reddit.submission(url=url)
|
30 |
-
except redditexception.InvalidURL:
|
31 |
-
print("The URL is invalid. Make sure that you have included the submission id")
|
32 |
-
|
33 |
-
submission.comments.replace_more(limit=0)
|
34 |
-
rows = []
|
35 |
-
|
36 |
-
for comment in submission.comments.list():
|
37 |
-
|
38 |
-
if comment.stickied:
|
39 |
-
continue
|
40 |
-
|
41 |
-
data = [
|
42 |
-
comment.body,
|
43 |
-
comment.score,
|
44 |
-
comment.id,
|
45 |
-
comment.parent_id,
|
46 |
-
submission.title,
|
47 |
-
submission.score,
|
48 |
-
submission.id,
|
49 |
-
]
|
50 |
-
|
51 |
-
rows.append(data)
|
52 |
-
|
53 |
-
df = pd.DataFrame(data=rows, columns=cols)
|
54 |
-
|
55 |
-
# save for testing to avoid sending tons of requests to reddit
|
56 |
-
|
57 |
-
# df.to_csv(f'{submission.id}_comments.csv', index=False)
|
58 |
-
|
59 |
-
return df
|
60 |
-
|
61 |
-
|
62 |
-
if __name__ == "__main__":
|
63 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|