PAN subreddit 200 threads added
Browse files- app.py +1 -1
- csv/paloaltonetworks_messages.csv +0 -0
- csv/paloaltonetworks_messages_copy.csv +0 -0
- data_models.py +6 -0
- reddit_collect.py +52 -0
- slack_summary.py +10 -14
app.py
CHANGED
@@ -267,7 +267,7 @@ def main_demo():
|
|
267 |
demo = gr.Blocks()
|
268 |
|
269 |
with demo:
|
270 |
-
data_sets = ["platform-engg_messages", "apps-ui_messages", "ux-reviews_messages"]
|
271 |
load_data(data_sets)
|
272 |
with gr.Tab("Thread"):
|
273 |
data_sets_dd = gr.Dropdown(data_sets,
|
|
|
267 |
demo = gr.Blocks()
|
268 |
|
269 |
with demo:
|
270 |
+
data_sets = ["platform-engg_messages", "apps-ui_messages", "ux-reviews_messages", "paloaltonetworks_messages"]
|
271 |
load_data(data_sets)
|
272 |
with gr.Tab("Thread"):
|
273 |
data_sets_dd = gr.Dropdown(data_sets,
|
csv/paloaltonetworks_messages.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
csv/paloaltonetworks_messages_copy.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data_models.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
class GenericMessage:
    """A platform-agnostic chat message (Slack, Reddit, ...).

    Attributes:
        user: Identifier or display name of the message author.
        timestamp: Creation time in the source platform's native format
            (Slack ts string or Reddit epoch float) — not normalized here.
        content: The message text.
    """

    def __init__(self, user, timestamp, content):
        self.user = user
        self.timestamp = timestamp
        self.content = content

    def __repr__(self):
        # Debug-friendly representation. Note: callers serialize instances via
        # msg.__dict__, which is unaffected by adding this method.
        return (f"GenericMessage(user={self.user!r}, "
                f"timestamp={self.timestamp!r}, content={self.content!r})")
|
reddit_collect.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Collect hot threads from a subreddit and dump them to CSV.

Each row is (thread_ts, messages_json), where messages_json is a JSON list of
GenericMessage dicts: the submission itself first, followed by every comment
(with "load more" placeholders expanded). Retries up to `max_retries` times
on HTTP 429 rate-limit responses; any other request failure is re-raised.
"""
import csv
import json
import os
import time

import praw
from prawcore import RequestException

from data_models import GenericMessage

# SECURITY NOTE(review): API credentials were hard-coded here. They are now
# read from the environment; the literal fallbacks preserve the original
# behavior but these secrets are exposed in version control and should be
# rotated and the fallbacks removed.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', 'LV2nS-xiWYIEn6YpwOhWpg'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', 'PhC4AYKkL0OUR8miVIuZF45Iz_saiA'),
    user_agent='PythonScript:com.example.passive_monitoring:v0.0.1 (by /u/vvsatya)',
)

subreddit_name = 'paloaltonetworks'
# Access subreddit (keep the plain name separate — the original shadowed it
# with the Subreddit object; str(Subreddit) is the display name, so output
# file names and thread ids are unchanged).
subreddit = reddit.subreddit(subreddit_name)

retry_count = 0
max_retries = 5
retry_delay = 5  # in seconds

while retry_count < max_retries:
    try:
        thread_messages_file = f'csv/{subreddit_name}_messages.csv'
        with open(thread_messages_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['thread_ts', 'messages_json'])
            for index, submission in enumerate(subreddit.hot(limit=1000), 1):
                messages = []
                # BUG FIX: submission.author is None for deleted accounts;
                # the original crashed with AttributeError here, while the
                # comment loop below already guarded against this.
                author = submission.author.id if submission.author else 'unknown'
                messages.append(GenericMessage(
                    author,
                    submission.created,
                    f"Title: {submission.title}\n Body: {submission.selftext}"))
                # Collect comments, expanding "load more" placeholders.
                submission.comments.replace_more(limit=None)
                for comment in submission.comments.list():
                    comment_author = comment.author.id if comment.author else 'unknown'
                    messages.append(GenericMessage(comment_author, comment.created, comment.body))
                message_dicts = [msg.__dict__ for msg in messages]
                writer.writerow([f'{subreddit_name}-{index}', json.dumps(message_dicts)])
                if index % 10 == 0:
                    print("Fetched threads : ", index)
        # BUG FIX: the original had no break here. retry_count is only
        # incremented on a 429, so after a fully successful pass the while
        # loop restarted, re-fetching everything and rewriting the CSV
        # indefinitely.
        break
    except RequestException as e:
        if hasattr(e, 'response') and e.response is not None:
            if e.response.status_code == 429:
                print(f"Rate limit exceeded. Retrying in {retry_delay} seconds.")
                time.sleep(retry_delay)
                retry_count += 1
                continue
        # Not a rate-limit problem — surface it instead of swallowing.
        raise
|
slack_summary.py
CHANGED
@@ -14,6 +14,7 @@ from ratelimit import limits, sleep_and_retry
|
|
14 |
|
15 |
import csv
|
16 |
from channel_id_mapper import ChannelIdMapper
|
|
|
17 |
from metadata_extracter import MetadataExtractor, ThreadMetadata
|
18 |
|
19 |
SKIP_AI = False
|
@@ -43,13 +44,8 @@ def load_mapping_from_json(filepath):
|
|
43 |
|
44 |
userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
def __init__(self, user, timestamp, content):
|
49 |
-
self.user = userIdMapping.get(user, user)
|
50 |
-
self.timestamp = timestamp
|
51 |
-
self.content = content
|
52 |
-
|
53 |
|
54 |
class SlackThread:
|
55 |
def __init__(self, min_ts: str, max_ts: str, user: str, unique_users: set[str], summary: str, message_count: int,
|
@@ -149,8 +145,8 @@ class SlackChannelReader:
|
|
149 |
for row in csv_reader:
|
150 |
thread_ts = str(row['thread_ts']).strip()
|
151 |
thread_messages_dict = json.loads(str(row['messages_json']))
|
152 |
-
thread_messages: list[
|
153 |
-
|
154 |
messages_count += len(thread_messages)
|
155 |
print(f"Summarizing thread {thread_ts} with {len(thread_messages)} messages")
|
156 |
slack_thread = self._get_thread_summary(thread_messages)
|
@@ -197,8 +193,8 @@ class SlackChannelReader:
|
|
197 |
)
|
198 |
return response
|
199 |
|
200 |
-
def read_thread_messages(self, channel_id, thread_ts) -> List[
|
201 |
-
all_messages: list[
|
202 |
next_cursor = None
|
203 |
has_more = True
|
204 |
while has_more:
|
@@ -209,9 +205,9 @@ class SlackChannelReader:
|
|
209 |
for message in messages:
|
210 |
try:
|
211 |
if 'user' in message:
|
212 |
-
all_messages.append(
|
213 |
elif 'subtype' in message:
|
214 |
-
all_messages.append(
|
215 |
else:
|
216 |
print(f"Unknown message type: {message}")
|
217 |
except KeyError:
|
@@ -232,7 +228,7 @@ class SlackChannelReader:
|
|
232 |
return result
|
233 |
|
234 |
@staticmethod
|
235 |
-
def _get_thread_summary(thread_messages: List[
|
236 |
|
237 |
if len(thread_messages) == 1:
|
238 |
return None
|
|
|
14 |
|
15 |
import csv
|
16 |
from channel_id_mapper import ChannelIdMapper
|
17 |
+
from data_models import GenericMessage
|
18 |
from metadata_extracter import MetadataExtractor, ThreadMetadata
|
19 |
|
20 |
SKIP_AI = False
|
|
|
44 |
|
45 |
userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')
|
46 |
|
47 |
+
def map_user(user_id):
    """Translate a Slack user id into a display name.

    Falls back to returning the id unchanged when no mapping exists.
    """
    if user_id in userIdMapping:
        return userIdMapping[user_id]
    return user_id
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
class SlackThread:
|
51 |
def __init__(self, min_ts: str, max_ts: str, user: str, unique_users: set[str], summary: str, message_count: int,
|
|
|
145 |
for row in csv_reader:
|
146 |
thread_ts = str(row['thread_ts']).strip()
|
147 |
thread_messages_dict = json.loads(str(row['messages_json']))
|
148 |
+
thread_messages: list[GenericMessage] = [GenericMessage(map_user(d['user']), d['timestamp'], d['content']) for d
|
149 |
+
in thread_messages_dict]
|
150 |
messages_count += len(thread_messages)
|
151 |
print(f"Summarizing thread {thread_ts} with {len(thread_messages)} messages")
|
152 |
slack_thread = self._get_thread_summary(thread_messages)
|
|
|
193 |
)
|
194 |
return response
|
195 |
|
196 |
+
def read_thread_messages(self, channel_id, thread_ts) -> List[GenericMessage]:
|
197 |
+
all_messages: list[GenericMessage] = []
|
198 |
next_cursor = None
|
199 |
has_more = True
|
200 |
while has_more:
|
|
|
205 |
for message in messages:
|
206 |
try:
|
207 |
if 'user' in message:
|
208 |
+
all_messages.append(GenericMessage(map_user(message['user']), message['ts'], message['text']))
|
209 |
elif 'subtype' in message:
|
210 |
+
all_messages.append(GenericMessage(message['subtype'], message['ts'], message['text']))
|
211 |
else:
|
212 |
print(f"Unknown message type: {message}")
|
213 |
except KeyError:
|
|
|
228 |
return result
|
229 |
|
230 |
@staticmethod
|
231 |
+
def _get_thread_summary(thread_messages: List[GenericMessage]) -> Optional[SlackThread]:
|
232 |
|
233 |
if len(thread_messages) == 1:
|
234 |
return None
|