svummidi committed on
Commit
218f81b
1 Parent(s): ae41958

PAN subreddit 200 threads added

Browse files
app.py CHANGED
@@ -267,7 +267,7 @@ def main_demo():
267
  demo = gr.Blocks()
268
 
269
  with demo:
270
- data_sets = ["platform-engg_messages", "apps-ui_messages", "ux-reviews_messages"]
271
  load_data(data_sets)
272
  with gr.Tab("Thread"):
273
  data_sets_dd = gr.Dropdown(data_sets,
 
267
  demo = gr.Blocks()
268
 
269
  with demo:
270
+ data_sets = ["platform-engg_messages", "apps-ui_messages", "ux-reviews_messages", "paloaltonetworks_messages"]
271
  load_data(data_sets)
272
  with gr.Tab("Thread"):
273
  data_sets_dd = gr.Dropdown(data_sets,
csv/paloaltonetworks_messages.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/paloaltonetworks_messages_copy.csv ADDED
The diff for this file is too large to render. See raw diff
 
data_models.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
class GenericMessage:
    """A single message in a thread, independent of the source platform.

    Instances are plain attribute holders. Callers serialize them through
    ``__dict__``, so the attributes are assigned in the order
    ``user``, ``timestamp``, ``content`` and no extra instance attributes
    are added.
    """

    def __init__(self, user: str, timestamp, content: str):
        """Store the author, time and text of one message.

        Args:
            user: Identifier (or display name) of the message author.
            timestamp: Time of the message, stored exactly as received.
                Callers in this change pass both Slack ``ts`` values and
                Reddit ``created`` values, so no single type is enforced.
            content: Text body of the message.
        """
        self.user = user
        self.timestamp = timestamp
        self.content = content

    def __repr__(self) -> str:
        """Return a constructor-style representation for debugging."""
        return (
            f"{type(self).__name__}(user={self.user!r}, "
            f"timestamp={self.timestamp!r}, content={self.content!r})"
        )
reddit_collect.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Dump the hot threads of one subreddit to ``csv/<subreddit>_messages.csv``.

Each CSV row holds a synthetic thread id and a JSON array of the thread's
messages (the submission first, then every comment), using the
``thread_ts`` / ``messages_json`` columns.
"""
import csv
import json
import os
import time

import praw
from prawcore import RequestException, ResponseException

from data_models import GenericMessage

# Credentials must never live in source control: read them from the
# environment instead. NOTE(review): the id/secret that used to be inlined
# here were committed publicly and should be rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='PythonScript:com.example.passive_monitoring:v0.0.1 (by /u/vvsatya)',
)

subreddit_name = 'paloaltonetworks'
# Access subreddit
subreddit = reddit.subreddit(subreddit_name)

retry_count = 0
max_retries = 5
retry_delay = 5  # in seconds


def _author_id(item):
    """Return the author's id, or 'unknown' when the item has no author."""
    return item.author.id if item.author else 'unknown'


def _collect_thread(submission):
    """Return the submission and all of its comments as GenericMessage objects."""
    messages = [
        GenericMessage(
            _author_id(submission),
            submission.created,
            f"Title: {submission.title}\n Body: {submission.selftext}",
        )
    ]
    # Resolve every "load more comments" placeholder so the list is complete.
    submission.comments.replace_more(limit=None)
    messages.extend(
        GenericMessage(_author_id(comment), comment.created, comment.body)
        for comment in submission.comments.list()
    )
    return messages


def _dump_threads(target, output_path):
    """Write one CSV row per hot thread of ``target`` to ``output_path``."""
    # Opening with 'w' truncates the file, so a retried run starts over
    # from the first thread instead of appending duplicates.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['thread_ts', 'messages_json'])
        for index, submission in enumerate(target.hot(limit=1000), 1):
            message_dicts = [msg.__dict__ for msg in _collect_thread(submission)]
            writer.writerow([f'{subreddit_name}-{index}', json.dumps(message_dicts)])
            if index % 10 == 0:
                print("Fetched threads : ", index)


while retry_count < max_retries:
    try:
        _dump_threads(subreddit, f'csv/{subreddit_name}_messages.csv')
    except (RequestException, ResponseException) as e:
        # Only ResponseException carries an HTTP response; RequestException
        # (transport failure) has none and is re-raised below.
        response = getattr(e, 'response', None)
        if response is not None and response.status_code == 429:
            print(f"Rate limit exceeded. Retrying in {retry_delay} seconds.")
            time.sleep(retry_delay)
            retry_count += 1
            continue
        raise
    else:
        # Finished cleanly: stop, otherwise the loop would crawl again forever.
        break
else:
    # Every attempt was rate limited; fail loudly instead of exiting silently.
    raise SystemExit(f"Giving up after {max_retries} rate-limited attempts.")
slack_summary.py CHANGED
@@ -14,6 +14,7 @@ from ratelimit import limits, sleep_and_retry
14
 
15
  import csv
16
  from channel_id_mapper import ChannelIdMapper
 
17
  from metadata_extracter import MetadataExtractor, ThreadMetadata
18
 
19
  SKIP_AI = False
@@ -43,13 +44,8 @@ def load_mapping_from_json(filepath):
43
 
44
  userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')
45
 
46
-
47
- class SlackMessage:
48
- def __init__(self, user, timestamp, content):
49
- self.user = userIdMapping.get(user, user)
50
- self.timestamp = timestamp
51
- self.content = content
52
-
53
 
54
  class SlackThread:
55
  def __init__(self, min_ts: str, max_ts: str, user: str, unique_users: set[str], summary: str, message_count: int,
@@ -149,8 +145,8 @@ class SlackChannelReader:
149
  for row in csv_reader:
150
  thread_ts = str(row['thread_ts']).strip()
151
  thread_messages_dict = json.loads(str(row['messages_json']))
152
- thread_messages: list[SlackMessage] = [SlackMessage(d['user'], d['timestamp'], d['content']) for d
153
- in thread_messages_dict]
154
  messages_count += len(thread_messages)
155
  print(f"Summarizing thread {thread_ts} with {len(thread_messages)} messages")
156
  slack_thread = self._get_thread_summary(thread_messages)
@@ -197,8 +193,8 @@ class SlackChannelReader:
197
  )
198
  return response
199
 
200
- def read_thread_messages(self, channel_id, thread_ts) -> List[SlackMessage]:
201
- all_messages: list[SlackMessage] = []
202
  next_cursor = None
203
  has_more = True
204
  while has_more:
@@ -209,9 +205,9 @@ class SlackChannelReader:
209
  for message in messages:
210
  try:
211
  if 'user' in message:
212
- all_messages.append(SlackMessage(message['user'], message['ts'], message['text']))
213
  elif 'subtype' in message:
214
- all_messages.append(SlackMessage(message['subtype'], message['ts'], message['text']))
215
  else:
216
  print(f"Unknown message type: {message}")
217
  except KeyError:
@@ -232,7 +228,7 @@ class SlackChannelReader:
232
  return result
233
 
234
  @staticmethod
235
- def _get_thread_summary(thread_messages: List[SlackMessage]) -> Optional[SlackThread]:
236
 
237
  if len(thread_messages) == 1:
238
  return None
 
14
 
15
  import csv
16
  from channel_id_mapper import ChannelIdMapper
17
+ from data_models import GenericMessage
18
  from metadata_extracter import MetadataExtractor, ThreadMetadata
19
 
20
  SKIP_AI = False
 
44
 
45
  userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')
46
 
47
def map_user(user_id):
    """Translate a user id through ``userIdMapping``.

    Returns the mapped value when ``user_id`` is a key of the mapping and
    the unchanged ``user_id`` otherwise.
    """
    try:
        return userIdMapping[user_id]
    except KeyError:
        # No entry for this id: fall back to the id itself.
        return user_id
 
 
 
 
 
49
 
50
  class SlackThread:
51
  def __init__(self, min_ts: str, max_ts: str, user: str, unique_users: set[str], summary: str, message_count: int,
 
145
  for row in csv_reader:
146
  thread_ts = str(row['thread_ts']).strip()
147
  thread_messages_dict = json.loads(str(row['messages_json']))
148
+ thread_messages: list[GenericMessage] = [GenericMessage(map_user(d['user']), d['timestamp'], d['content']) for d
149
+ in thread_messages_dict]
150
  messages_count += len(thread_messages)
151
  print(f"Summarizing thread {thread_ts} with {len(thread_messages)} messages")
152
  slack_thread = self._get_thread_summary(thread_messages)
 
193
  )
194
  return response
195
 
196
+ def read_thread_messages(self, channel_id, thread_ts) -> List[GenericMessage]:
197
+ all_messages: list[GenericMessage] = []
198
  next_cursor = None
199
  has_more = True
200
  while has_more:
 
205
  for message in messages:
206
  try:
207
  if 'user' in message:
208
+ all_messages.append(GenericMessage(map_user(message['user']), message['ts'], message['text']))
209
  elif 'subtype' in message:
210
+ all_messages.append(GenericMessage(message['subtype'], message['ts'], message['text']))
211
  else:
212
  print(f"Unknown message type: {message}")
213
  except KeyError:
 
228
  return result
229
 
230
  @staticmethod
231
+ def _get_thread_summary(thread_messages: List[GenericMessage]) -> Optional[SlackThread]:
232
 
233
  if len(thread_messages) == 1:
234
  return None