boompack committed on
Commit
53d44b2
·
verified ·
1 Parent(s): 556706a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -52
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from transformers import pipeline
2
  from dataclasses import dataclass, field
3
- from typing import List, Optional, Dict
4
  import re
5
  from datetime import datetime
6
  import logging
@@ -8,6 +8,7 @@ import html
8
  from uuid import uuid4
9
  import torch
10
  import gradio as gr
 
11
 
12
  # Настройка логирования
13
  logging.basicConfig(
@@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)
18
 
19
  @dataclass
20
  class Comment:
 
21
  id: str = field(default_factory=lambda: str(uuid4()))
22
  username: str = ""
23
  time: str = ""
@@ -31,6 +33,8 @@ class Comment:
31
  hashtags: List[str] = field(default_factory=list)
32
  is_deleted: bool = False
33
  sentiment: Optional[str] = None
 
 
34
 
35
  def __post_init__(self):
36
  if len(self.content) > 2200:
@@ -38,33 +42,50 @@ class Comment:
38
  self.content = self.content[:2200] + "..."
39
 
40
  class InstagramCommentAnalyzer:
 
 
41
  COMMENT_PATTERN = r'''
42
- (?P<username>[\w.-]+)\s+
43
- (?P<time>\d+\s+нед\.)
44
  (?P<content>.*?)
45
- (?:Отметки\s*"Нравится":\s*(?P<likes>\d+))?
46
- (?:Ответить)?(?:Показать\sперевод)?(?:Нравится)?
47
  '''
48
 
 
 
 
 
 
 
 
49
  def __init__(self, max_depth: int = 10, max_comment_length: int = 2200):
 
50
  self.check_dependencies()
51
  self.max_depth = max_depth
52
  self.max_comment_length = max_comment_length
53
  self.pattern = re.compile(self.COMMENT_PATTERN, re.VERBOSE | re.DOTALL)
54
  self.comments: List[Comment] = []
55
- self.stats: Dict[str, int] = {
 
 
 
 
 
56
  'total_comments': 0,
57
  'deleted_comments': 0,
58
  'empty_comments': 0,
59
  'max_depth_reached': 0,
60
  'truncated_comments': 0,
61
  'processed_mentions': 0,
62
- 'processed_hashtags': 0
 
 
63
  }
64
- self.sentiment_analyzer = self.load_sentiment_model()
65
 
66
  def check_dependencies(self):
67
- required_packages = ['torch', 'transformers', 'numpy']
 
68
  for package in required_packages:
69
  try:
70
  __import__(package)
@@ -73,6 +94,7 @@ class InstagramCommentAnalyzer:
73
  raise
74
 
75
  def load_sentiment_model(self):
 
76
  try:
77
  device = "cuda" if torch.cuda.is_available() else "cpu"
78
  logger.info(f"Using device: {device}")
@@ -85,90 +107,173 @@ class InstagramCommentAnalyzer:
85
  logger.error(f"Model loading failed: {str(e)}")
86
  raise
87
 
88
- def analyze_sentiment(self, text: str) -> str:
89
- try:
90
- result = self.sentiment_analyzer(text)
91
- return result[0]['label']
92
- except Exception as e:
93
- logger.error(f"Sentiment analysis failed: {str(e)}")
94
- return "UNKNOWN"
95
-
96
  def normalize_text(self, text: str) -> str:
 
97
  text = html.unescape(text)
98
  text = ' '.join(text.split())
99
  text = re.sub(r'[\u200b\ufeff\u200c]', '', text)
100
  return text
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def extract_metadata(self, comment: Comment) -> None:
 
103
  try:
 
104
  comment.mentions = re.findall(r'@(\w+)', comment.content)
105
- self.stats['processed_mentions'] += len(comment.mentions)
106
  comment.hashtags = re.findall(r'#(\w+)', comment.content)
 
 
 
 
 
 
107
  self.stats['processed_hashtags'] += len(comment.hashtags)
 
 
 
108
  comment.is_verified = bool(re.search(r'✓|Подтвержденный', comment.username))
109
  except Exception as e:
110
  logger.error(f"Metadata extraction failed: {str(e)}")
111
 
112
- def process_comment(self, text: str, parent_id: Optional[str] = None, level: int = 0) -> Optional[Comment]:
 
 
 
 
 
 
 
 
 
 
 
 
113
  if level > self.max_depth:
114
  logger.warning(f"Maximum depth {self.max_depth} exceeded")
115
  self.stats['max_depth_reached'] += 1
116
  return None
117
 
118
- if not text.strip():
119
- self.stats['empty_comments'] += 1
120
- return None
121
-
122
  try:
 
123
  match = self.pattern.match(text)
 
124
  if not match:
125
- raise ValueError(f"Could not parse comment: {text[:100]}...")
 
 
 
126
 
127
  data = match.groupdict()
128
  comment = Comment(
129
- username=data['username'],
130
- time=data['time'],
131
- content=data['content'].strip(),
132
- likes=int(data['likes'] or 0),
133
  level=level,
134
  parent_id=parent_id
135
  )
136
 
137
- if len(comment.content) > self.max_comment_length:
138
- self.stats['truncated_comments'] += 1
139
- comment.content = comment.content[:self.max_comment_length] + "..."
140
-
141
  comment.sentiment = self.analyze_sentiment(comment.content)
142
  self.extract_metadata(comment)
 
143
  self.stats['total_comments'] += 1
144
  return comment
145
 
146
  except Exception as e:
147
- logger.error(f"Error processing comment: {str(e)}")
148
- self.stats['deleted_comments'] += 1
149
- return Comment(
150
- username="[damaged]",
151
- time="",
152
- content="[Поврежденные данные]",
153
- is_deleted=True
154
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def format_comment(self, comment: Comment, index: int) -> str:
 
157
  try:
158
  if comment.is_deleted:
159
- return f'{index}. "[УДАЛЕНО]" "" "" "Нравится 0"'
 
 
 
 
160
 
161
  return (
162
  f'{index}. "{comment.username}" "{comment.time}" '
163
- f'"{comment.content}" "Нравится {comment.likes}" "Настроение {comment.sentiment}"'
 
 
 
 
164
  )
165
  except Exception as e:
166
  logger.error(f"Error formatting comment: {str(e)}")
167
  return f'{index}. "[ОШИБКА ФОРМАТИРОВАНИЯ]"'
168
 
169
  def process_comments(self, text: str) -> List[str]:
 
170
  try:
171
- self.stats = {key: 0 for key in self.stats}
172
  text = self.normalize_text(text)
173
  raw_comments = text.split('ОтветитьНравится')
174
  formatted_comments = []
@@ -186,8 +291,8 @@ class InstagramCommentAnalyzer:
186
  logger.error(f"Error processing comments: {str(e)}")
187
  return ["[ОШИБКА ОБРАБОТКИ КОММЕНТАРИЕВ]"]
188
 
189
- # Создание интерфейса Gradio
190
  def create_interface():
 
191
  analyzer = InstagramCommentAnalyzer()
192
 
193
  def analyze_text(text: str):
@@ -196,18 +301,36 @@ def create_interface():
196
 
197
  iface = gr.Interface(
198
  fn=analyze_text,
199
- inputs=gr.Textbox(lines=10, placeholder="Вставьте текст комментариев здесь..."),
200
- outputs=gr.Textbox(lines=20, placeholder="Результаты анализа будут отображены здесь..."),
 
 
 
 
 
 
 
 
201
  title="Instagram Comment Analyzer",
202
- description="Введите текст комментариев из Instagram для анализа настроений и извлечения информации.",
 
 
203
  )
204
  return iface
205
 
206
  def main():
207
- # Запуск интерфейса Gradio
208
- interface = create_interface()
209
- interface.launch()
 
 
 
 
 
 
 
 
 
210
 
211
  if __name__ == "__main__":
212
- main()
213
-
 
1
  from transformers import pipeline
2
  from dataclasses import dataclass, field
3
+ from typing import List, Optional, Dict, Any
4
  import re
5
  from datetime import datetime
6
  import logging
 
8
  from uuid import uuid4
9
  import torch
10
  import gradio as gr
11
+ import emoji
12
 
13
  # Настройка логирования
14
  logging.basicConfig(
 
19
 
20
  @dataclass
21
  class Comment:
22
+ """Представляет комментарий Instagram со всеми метаданными"""
23
  id: str = field(default_factory=lambda: str(uuid4()))
24
  username: str = ""
25
  time: str = ""
 
33
  hashtags: List[str] = field(default_factory=list)
34
  is_deleted: bool = False
35
  sentiment: Optional[str] = None
36
+ language: Optional[str] = None
37
+ emojis: List[str] = field(default_factory=list)
38
 
39
  def __post_init__(self):
40
  if len(self.content) > 2200:
 
42
  self.content = self.content[:2200] + "..."
43
 
44
  class InstagramCommentAnalyzer:
45
+ """Анализатор комментариев Instagram с расширенной функциональностью"""
46
+
47
  COMMENT_PATTERN = r'''
48
+ (?P<username>[\w\u0400-\u04FF.-]+)\s*
49
+ (?P<time>(?:\d+\s+(?:нед|мин|ч|д|мес|год|sec|min|h|d|w|mon|y)\.?))\s*
50
  (?P<content>.*?)
51
+ (?:(?:Отметки|Likes)\s*"?Нравится"?:\s*(?P<likes>\d+))?
52
+ (?:Ответить|Reply)?(?:Показать\sперевод|Show\stranslation)?(?:Нравится|Like)?
53
  '''
54
 
55
# Translation table from the time-unit abbreviations accepted by
# COMMENT_PATTERN to English unit names.  'sec' and 'min' are included because
# the pattern accepts them too.
# Longer abbreviations come first so that a consumer scanning the keys in
# order with a substring test reaches 'год' before the shorter 'д' that it
# contains.
TIME_MAPPING = {
    'нед': 'week', 'мин': 'minute', 'мес': 'month', 'год': 'year',
    'sec': 'second', 'min': 'minute', 'mon': 'month',
    'ч': 'hour', 'д': 'day',
    'w': 'week', 'h': 'hour', 'd': 'day', 'y': 'year',
}
61
+
62
  def __init__(self, max_depth: int = 10, max_comment_length: int = 2200):
63
+ """Инициализация анализатора"""
64
  self.check_dependencies()
65
  self.max_depth = max_depth
66
  self.max_comment_length = max_comment_length
67
  self.pattern = re.compile(self.COMMENT_PATTERN, re.VERBOSE | re.DOTALL)
68
  self.comments: List[Comment] = []
69
+ self.stats = self.initialize_stats()
70
+ self.sentiment_analyzer = self.load_sentiment_model()
71
+
72
def initialize_stats(self) -> Dict[str, int]:
    """Return a fresh statistics dictionary with every counter set to zero."""
    counter_names = (
        'total_comments',
        'deleted_comments',
        'empty_comments',
        'max_depth_reached',
        'truncated_comments',
        'processed_mentions',
        'processed_hashtags',
        'processed_emojis',
        'failed_parses',
    )
    return dict.fromkeys(counter_names, 0)
 
85
 
86
  def check_dependencies(self):
87
+ """Проверка зависимостей"""
88
+ required_packages = ['torch', 'transformers', 'emoji']
89
  for package in required_packages:
90
  try:
91
  __import__(package)
 
94
  raise
95
 
96
  def load_sentiment_model(self):
97
+ """Загрузка модели анализа тональности"""
98
  try:
99
  device = "cuda" if torch.cuda.is_available() else "cpu"
100
  logger.info(f"Using device: {device}")
 
107
  logger.error(f"Model loading failed: {str(e)}")
108
  raise
109
 
 
 
 
 
 
 
 
 
110
  def normalize_text(self, text: str) -> str:
111
+ """Улучшенная нормализация текста"""
112
  text = html.unescape(text)
113
  text = ' '.join(text.split())
114
  text = re.sub(r'[\u200b\ufeff\u200c]', '', text)
115
  return text
116
 
117
def extract_emojis(self, text: str) -> List[str]:
    """Return the emoji found in ``text``, in order of appearance.

    The emoji library's own scanner is used instead of testing each code
    point separately, so emoji made of several code points (flags, ZWJ
    sequences, skin-tone variants) are returned whole rather than being
    split into fragments or dropped.

    Args:
        text: Text to scan.

    Returns:
        One string per emoji occurrence; an empty list when none is found.
    """
    return [found['emoji'] for found in emoji.emoji_list(text)]
120
+
121
def normalize_time(self, time_str: str) -> str:
    """Translate the time-unit abbreviation in ``time_str`` via TIME_MAPPING.

    Each run of letters is looked up as a whole word in
    ``self.TIME_MAPPING``; words without an entry are left untouched.
    Matching whole words (rather than substrings) avoids corrupting values
    such as "1 год", where the shorter key 'д' occurs inside another unit.
    Digits, spaces and a trailing period are preserved as they are.

    Args:
        time_str: Relative time stamp, e.g. "2 нед." or "3 h".

    Returns:
        The time stamp with known unit abbreviations replaced.
    """
    units = self.TIME_MAPPING

    def translate(token):
        # Replace the word only when it is exactly a known abbreviation.
        word = token.group(0)
        return units.get(word, word)

    # [^\W\d_]+ matches a run of letters in any script.
    return re.sub(r'[^\W\d_]+', translate, time_str)
127
+
128
def clean_content(self, content: str) -> str:
    """Trim, whitespace-collapse and length-limit a comment body.

    Args:
        content: Raw comment text.

    Returns:
        The text stripped of surrounding whitespace with inner whitespace
        runs reduced to one space.  When the result is longer than
        ``self.max_comment_length`` it is cut to that length, "..." is
        appended, and the ``truncated_comments`` counter is incremented.
    """
    collapsed = re.sub(r'\s+', ' ', content.strip())
    limit = self.max_comment_length
    if len(collapsed) <= limit:
        return collapsed
    self.stats['truncated_comments'] += 1
    return collapsed[:limit] + "..."
136
+
137
  def extract_metadata(self, comment: Comment) -> None:
138
+ """Извлечение метаданных из комментария"""
139
  try:
140
+ # Извлечение упоминаний и хэштегов
141
  comment.mentions = re.findall(r'@(\w+)', comment.content)
 
142
  comment.hashtags = re.findall(r'#(\w+)', comment.content)
143
+
144
+ # Извлечение эмодзи
145
+ comment.emojis = self.extract_emojis(comment.content)
146
+
147
+ # Обновление статистики
148
+ self.stats['processed_mentions'] += len(comment.mentions)
149
  self.stats['processed_hashtags'] += len(comment.hashtags)
150
+ self.stats['processed_emojis'] += len(comment.emojis)
151
+
152
+ # Проверка верификации
153
  comment.is_verified = bool(re.search(r'✓|Подтвержденный', comment.username))
154
  except Exception as e:
155
  logger.error(f"Metadata extraction failed: {str(e)}")
156
 
157
def analyze_sentiment(self, text: str) -> str:
    """Classify the sentiment of ``text``.

    Args:
        text: Text passed to ``self.sentiment_analyzer``.

    Returns:
        The ``'label'`` entry of the first result returned by the analyzer,
        or "UNKNOWN" when the call or the lookup raises (the error is
        logged).
    """
    try:
        predictions = self.sentiment_analyzer(text)
        label = predictions[0]['label']
    except Exception as e:
        logger.error(f"Sentiment analysis failed: {str(e)}")
        return "UNKNOWN"
    else:
        return label
165
def process_comment(self, text: str, parent_id: Optional[str] = None, level: int = 0) -> Optional["Comment"]:
    """Parse one raw comment into a ``Comment``.

    Args:
        text: Raw text of a single comment.
        parent_id: Stored on the resulting comment as ``parent_id``.
        level: Nesting level; stored on the comment and checked against
            ``self.max_depth``.

    Returns:
        The parsed comment; ``None`` when the text is blank, fails
        ``validate_input`` or ``level`` exceeds ``self.max_depth``; or the
        placeholder from ``create_damaged_comment`` when parsing raises.
    """
    # Blank input is an empty comment, not a parse failure: count it in
    # 'empty_comments' and stop before validation logs it as an error or the
    # parser turns it into a damaged placeholder.
    if isinstance(text, str) and not text.strip():
        self.stats['empty_comments'] += 1
        return None

    if not self.validate_input(text):
        return None

    if level > self.max_depth:
        logger.warning(f"Maximum depth {self.max_depth} exceeded")
        self.stats['max_depth_reached'] += 1
        return None

    try:
        text = self.normalize_text(text)
        # Try the main pattern first, then the fallback patterns.
        match = self.pattern.match(text) or self.alternative_parse(text)
        if not match:
            raise ValueError(f"Could not parse comment: {text[:100]}...")

        data = match.groupdict()
        comment = Comment(
            username=data['username'].strip(),
            time=self.normalize_time(data['time']),
            content=self.clean_content(data['content']),
            # groupdict() holds None for an optional group that did not
            # participate in the match, so a dict default would never be
            # used; fall back to '0' explicitly.
            likes=self.parse_likes(data.get('likes') or '0'),
            level=level,
            parent_id=parent_id
        )

        # Sentiment and metadata are derived from the cleaned content.
        comment.sentiment = self.analyze_sentiment(comment.content)
        self.extract_metadata(comment)

        self.stats['total_comments'] += 1
        return comment

    except Exception as e:
        logger.error(f"Error processing comment: {str(e)}", exc_info=True)
        self.stats['failed_parses'] += 1
        return self.create_damaged_comment()
206
+
207
def alternative_parse(self, text: str) -> Optional[re.Match]:
    """Try the fallback patterns on ``text`` and return the first match.

    Args:
        text: Comment text to parse.

    Returns:
        The match object of the first fallback pattern that matches at the
        start of ``text``, or ``None`` when none does.  A pattern whose
        evaluation raises is skipped.
    """
    # Simpler layout.
    simple_layout = r'(?P<username>[\w\u0400-\u04FF.-]+)\s*(?P<content>.*?)(?P<time>\d+\s+\w+\.?)(?P<likes>\d+)?'
    # Layout of the mobile version.
    mobile_layout = r'(?P<username>[\w\u0400-\u04FF.-]+)\s*(?P<content>.*?)(?P<time>\d+\s+\w+)(?:Like)?(?P<likes>\d+)?'

    flags = re.VERBOSE | re.DOTALL
    for candidate in (simple_layout, mobile_layout):
        try:
            found = re.match(candidate, text, flags)
        except Exception:
            continue
        if found:
            return found
    return None
224
+
225
def parse_likes(self, likes_str: Optional[str]) -> int:
    """Convert a like counter to a non-negative integer.

    Args:
        likes_str: Counter text such as "12" or "1 234".  ``None`` (an
            unmatched regex group) and integers are accepted as well.

    Returns:
        The number formed by the digits in the text; 0 for ``None``, for
        text without digits and for unsupported types; an integer argument
        is returned as is, clamped at 0.
    """
    if likes_str is None:
        return 0
    if isinstance(likes_str, int):
        return max(int(likes_str), 0)
    if not isinstance(likes_str, str):
        return 0
    # Keep only the digits, dropping separators and any surrounding text.
    digits = re.sub(r'\D', '', likes_str)
    return int(digits) if digits else 0
231
+
232
def create_damaged_comment(self) -> Comment:
    """Build the placeholder comment used when a comment cannot be processed.

    Returns:
        A ``Comment`` flagged as deleted with fixed placeholder username,
        time and content.
    """
    placeholder_fields = {
        'username': "[damaged]",
        'time': "unknown",
        'content': "[Поврежденные данные]",
        'is_deleted': True,
    }
    return Comment(**placeholder_fields)
240
+
241
def validate_input(self, text: str) -> bool:
    """Check that ``text`` is a usable input string.

    Args:
        text: Candidate input.

    Returns:
        ``True`` for a non-empty ``str`` of at most 50000 characters;
        otherwise ``False``, after logging the reason.
    """
    if not text or not isinstance(text, str):
        problem = "Invalid input: text must be non-empty string"
    elif len(text) > 50000:
        problem = "Input text too large"
    else:
        return True

    logger.error(problem)
    return False
250
 
251
  def format_comment(self, comment: Comment, index: int) -> str:
252
+ """Форматирование комментария для вывода"""
253
  try:
254
  if comment.is_deleted:
255
+ return f'{index}. "[УДАЛЕНО]"'
256
+
257
+ emoji_str = ' '.join(comment.emojis) if comment.emojis else ''
258
+ mentions_str = ', '.join(comment.mentions) if comment.mentions else ''
259
+ hashtags_str = ', '.join(comment.hashtags) if comment.hashtags else ''
260
 
261
  return (
262
  f'{index}. "{comment.username}" "{comment.time}" '
263
+ f'"{comment.content}" "Лайки: {comment.likes}" '
264
+ f'"Настроение: {comment.sentiment}" '
265
+ f'"Эмодзи: {emoji_str}" '
266
+ f'"Упоминания: {mentions_str}" '
267
+ f'"Хэштеги: {hashtags_str}"'
268
  )
269
  except Exception as e:
270
  logger.error(f"Error formatting comment: {str(e)}")
271
  return f'{index}. "[ОШИБКА ФОРМАТИРОВАНИЯ]"'
272
 
273
  def process_comments(self, text: str) -> List[str]:
274
+ """Обработка всех комментариев"""
275
  try:
276
+ self.stats = self.initialize_stats()
277
  text = self.normalize_text(text)
278
  raw_comments = text.split('ОтветитьНравится')
279
  formatted_comments = []
 
291
  logger.error(f"Error processing comments: {str(e)}")
292
  return ["[ОШИБКА ОБРАБОТКИ КОММЕНТАРИЕВ]"]
293
 
 
294
  def create_interface():
295
+ """Создание интерфейса Gradio"""
296
  analyzer = InstagramCommentAnalyzer()
297
 
298
  def analyze_text(text: str):
 
301
 
302
  iface = gr.Interface(
303
  fn=analyze_text,
304
+ inputs=gr.Textbox(
305
+ lines=10,
306
+ placeholder="Вставьте текст комментариев здесь...",
307
+ label="Входной текст"
308
+ ),
309
+ outputs=gr.Textbox(
310
+ lines=20,
311
+ placeholder="Результаты анализа будут отображены здесь...",
312
+ label="Результаты анализа"
313
+ ),
314
  title="Instagram Comment Analyzer",
315
+ description="Анализатор комментариев Instagram с поддержкой эмодзи и мультиязычности",
316
+ theme="default",
317
+ analytics_enabled=False,
318
  )
319
  return iface
320
 
321
  def main():
322
+ """Основная функция запуска приложения"""
323
+ try:
324
+ interface = create_interface()
325
+ interface.launch(
326
+ server_name="0.0.0.0",
327
+ server_port=7860,
328
+ share=False,
329
+ debug=True
330
+ )
331
+ except Exception as e:
332
+ logger.error(f"Application failed to start: {str(e)}")
333
+ raise
334
 
335
  if __name__ == "__main__":
336
+ main()