import os
import sys
import subprocess
from typing import Dict, List, Union
import re

import pandas as pd
import json
from pandas import Timestamp
from llamafactory.extras.packages import is_vllm_available

from weclone.data.clean.strategies import LLMCleaningStrategy
from weclone.data.clean.strategies_online import OlineLLMCleaningStrategy
from weclone.utils.config import load_config
from weclone.utils.log import logger
from weclone.data.models import ChatMessage, CutMessage, skip_type_list, QaPair
from weclone.data.strategies import TimeWindowStrategy, LLMStrategy


class DataProcessor:
    def __init__(self):
        self.config = load_config(arg_type="make_dataset")
        self.csv_folder = "./dataset/csv"
        self.system_prompt = self.config["default_system"]
        # Message type names exactly as they appear in the exported chat data; kept verbatim
        # because they are matched against the CSV "type_name" column.
        self.cut_type_list = [
            "图片",
            "视频",
            "合并转发的聊天记录",
            "语音",
            "(分享)音乐",
            "(分享)卡片式链接",
            "(分享)笔记",
            "(分享)小程序",
            "(分享)收藏夹",
            "(分享)小说(猜)",
            "(分享)视频号名片",
            "(分享)视频号视频",
            "粘贴的文本",  # pasted text: share links that could not be parsed
        ]

        # blocked_words
        config_blocked_words = self.config.get("blocked_words", [])
        file_blocked_words = []
        try:
            with open("./dataset/blocked_words.json", encoding="utf-8") as f:
                file_blocked_words = json.load(f).get("blocked_words", [])
        except (FileNotFoundError, json.JSONDecodeError):
            pass
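        # Expected shape of ./dataset/blocked_words.json, inferred from the load above:
        # {"blocked_words": ["word1", "word2", ...]}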

        self.blocked_words = list(set(config_blocked_words + file_blocked_words))
        # logger.info(f"聊天记录禁用词: {self.blocked_words}")

        if self.config["single_combine_strategy"] == "time_window":
            self.single_combine_strategy = TimeWindowStrategy(
                time_window=self.config["single_combine_time_window"] * 60,
                is_single_chat=True,
            )
        elif self.config["single_combine_strategy"] == "llm":
            self.single_combine_strategy = LLMStrategy(
                is_single_chat=True,
            )

        if self.config["qa_match_strategy"] == "time_window":
            self.qa_match_strategy = TimeWindowStrategy(
                time_window=self.config["qa_match_time_window"] * 60,
                is_single_chat=False,
            )
        elif self.config["qa_match_strategy"] == "llm":
            self.qa_match_strategy = LLMStrategy(is_single_chat=False)

        clean_dataset_config = self.config.get("clean_dataset", {})
        enable_clean = clean_dataset_config.get("enable_clean", False)

        if enable_clean:
            if self.config.get("prompt_with_history", False):
                logger.warning("prompt_with_history is enabled; the clean_dataset feature is not supported with it")
                sys.exit(1)

            if not is_vllm_available() and not self.config.get("online_llm_clear"):
                logger.warning("vLLM is not available; skipping dataset cleaning.")
                clean_dataset_config["enable_clean"] = False
                enable_clean = False

        # NOTE: only the "llm" clean strategy is handled here; main() assumes self.clean_strategy
        # exists whenever cleaning is still enabled at this point.
        if enable_clean and clean_dataset_config.get("clean_strategy", "llm") == "llm":
            if self.config.get("online_llm_clear"):
                self.clean_strategy = OlineLLMCleaningStrategy(make_dataset_config=self.config)
            else:
                self.clean_strategy = LLMCleaningStrategy(make_dataset_config=self.config)
        self.c = self.config

    def main(self):
        if not os.path.exists(self.csv_folder) or not os.listdir(self.csv_folder):
            logger.error(f"错误:目录 '{self.csv_folder}' 不存在或为空,请检查路径并确保其中包含 CSV 聊天数据文件。")
            return

        csv_files = self.get_csv_files()
        logger.info(f"共发现 {len(csv_files)} 个 CSV 文件,开始处理")
        message_list: List[ChatMessage] = []
        for csv_file in csv_files:
            logger.debug(f"开始处理 CSV 文件: {csv_file}")
            chat_messages = self.load_csv(csv_file)
            message_list.extend(self.group_consecutive_messages(messages=chat_messages))
            # self.process_by_msgtype(chat_message)
            logger.debug(f"处理完成: {csv_file},共加载 {len(chat_messages)} 条消息")
        qa_res = self.match_qa(message_list)
        if self.c["prompt_with_history"]:
            qa_res = self.add_history_to_qa(qa_res)
        else:
            qa_res = [item for item in qa_res if isinstance(item, QaPair)]

        if self.c.get("clean_dataset", {}).get("enable_clean", False):
            self.clean_strategy.judge(qa_res)
            # qa_res = self.clean_strategy.clean(qa_res)
        self.save_result(qa_res)
        self._execute_length_cdf_script()

        logger.success(f"聊天记录处理成功,共{len(qa_res)}条,保存到 ./dataset/res_csv/sft/sft-my.json")

    def _execute_length_cdf_script(self):
        """执行 length_cdf.py 脚本来计算cutoff_len。"""
        try:
            python_executable = sys.executable
            # The script path is relative to the project root
            script_path = os.path.join("weclone", "utils", "length_cdf.py")

            command_parts = [
                python_executable,
                script_path,
                f'--model_name_or_path="{self.c["model_name_or_path"]}"',
                f'--dataset="{self.c["dataset"]}"',
                f'--dataset_dir="{self.c["dataset_dir"]}"',
                f'--template="{self.c["template"]}"',
                f"--interval={self.c['cutoff_len']}",
            ]
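            # The assembled command is roughly equivalent to (illustrative; values come from the config):
            #   python weclone/utils/length_cdf.py --model_name_or_path="..." --dataset="..." \
            #       --dataset_dir="..." --template="..." --interval=<cutoff_len>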

            child_env = os.environ.copy()
            child_env["CUDA_VISIBLE_DEVICES"] = "0"
            child_env["LLAMAFACTORY_VERBOSITY"] = "ERROR"

            process = subprocess.Popen(
                command_parts,
                env=child_env,
                stdout=None,  # None means inherit the parent process's stdout (the terminal)
                stderr=None,  # None means inherit the parent process's stderr (the terminal)
                text=True,
                bufsize=1,  # line buffered
            )
            return_code = process.wait()
            if return_code != 0:
                logger.error(f"命令 '{' '.join(command_parts)}' 执行失败,返回码 {return_code}")
        except FileNotFoundError:
            # command_parts[0] 是 python_executable, command_parts[1] 是 script_path
            logger.error(f"命令执行失败: 找不到可执行文件 '{command_parts[0]}' 或脚本 '{command_parts[1]}'")
        except KeyError as e:
            logger.error(f"执行 length_cdf.py 脚本失败:配置项缺失 {str(e)}")
        except Exception as e:
            logger.error(f"执行 length_cdf.py 脚本时发生未知错误: {str(e)}")

    def get_csv_files(self):
        """遍历文件夹获取所有CSV文件路径,并按文件名中的起始序号排序"""

        csv_files = []
        for chat_obj_folder in os.listdir(self.csv_folder):
            chat_obj_folder_path = os.path.join(self.csv_folder, chat_obj_folder)
            for csvfile in os.listdir(chat_obj_folder_path):
                if not csvfile.endswith(".csv"):
                    continue
                csvfile_path = os.path.join(chat_obj_folder_path, csvfile)
                csv_files.append(csvfile_path)
        # Extract the starting number from the file name, e.g. wxid_..._0_5000.csv → 0
        pattern = re.compile(r"_(\d+)_\d+\.csv$")

        def extract_start(fp: str) -> int:
            name = os.path.basename(fp)
            m = pattern.search(name)
            return int(m.group(1)) if m else 0

        # Sort by the starting number, ascending
        csv_files.sort(key=extract_start)
        return csv_files

    def match_qa(self, messages: List[ChatMessage]) -> List[Union[QaPair, CutMessage]]:
        """

        匹配问答对



        Args:

            messages: 消息列表



        Returns:

            List[Union[QaPair, CutMessage]]: 包含指令和输出的问答对列表

        """
        # State definitions
        WAITING_INSTRUCTION = "waiting_instruction"  # waiting for an instruction
        WAITING_RESPONSE = "waiting_response"  # waiting for a response

        current_state = WAITING_INSTRUCTION
        qa_res: List[Union[QaPair, CutMessage]] = []
        last_message = None
        current_instruction = None
        qa_id_counter = 0

        for msg in messages:
            if isinstance(msg, CutMessage):
                current_state = WAITING_INSTRUCTION
                current_instruction = None
                last_message = None
                if self.c["prompt_with_history"]:
                    qa_res.append(msg)
                continue

            if current_state == WAITING_INSTRUCTION:
                if msg.is_sender == 0:  # message from the other party
                    current_instruction = msg.msg
                    last_message = msg
                    current_state = WAITING_RESPONSE

            elif current_state == WAITING_RESPONSE:
                if msg.is_sender == 0:  # message from the other party
                    current_instruction = msg.msg
                    last_message = msg
                    # state stays unchanged
                else:  # own reply: use the strategy to decide whether it belongs to the same conversation
                    if last_message and self.qa_match_strategy.is_same_conversation([last_message], msg):
                        assert current_instruction is not None, (
                            "current_instruction should not be None when creating a QA pair"
                        )
                        qa_pair = QaPair(
                            id=qa_id_counter,
                            system=self.system_prompt,
                            instruction=current_instruction,
                            output=msg.msg,
                            history=[],  # No history in this context yet
                            time=msg.CreateTime,  # Use the response message time
                            score=0,  # Default score
                        )
                        qa_res.append(qa_pair)
                        qa_id_counter += 1  # increment the counter
                    else:
                        if self.c["prompt_with_history"]:
                            qa_res.append(
                                CutMessage(
                                    is_sender=msg.is_sender,
                                    cut_type=msg.type_name,
                                    CreateTime=msg.CreateTime,
                                )
                            )
                    # Reset the state whether or not a pair was matched
                    current_state = WAITING_INSTRUCTION
                    current_instruction = None
                    last_message = None

        return qa_res

    # TODO: need review
    def add_history_to_qa(self, qa_res: List[Union[QaPair, CutMessage]]) -> List[QaPair]:
        """

        Adds conversation history to QaPair objects.



        Args:

            qa_res: A list containing QaPair and CutMessage objects.



        Returns:

            A list of QaPair objects with history populated.

        """
        qa_res_with_history: List[QaPair] = []
        current_history: List[List[str]] = []
        last_timestamp: Timestamp = None  # type: ignore

        for item in qa_res:
            if isinstance(item, CutMessage):
                if current_history:
                    instruction = current_history[-1][0]
                    output = current_history[-1][1]
                    history = current_history[:-1]
                    qa_pair_with_history = QaPair(
                        id=-1,
                        system=self.system_prompt,
                        instruction=instruction,
                        output=output,
                        history=history,
                        time=last_timestamp,
                        score=0,
                    )
                    qa_res_with_history.append(qa_pair_with_history)
                current_history = []
                last_timestamp = None  # type: ignore
            elif isinstance(item, QaPair):
                current_history.append([item.instruction, item.output])
                last_timestamp = item.time

        if current_history:
            instruction = current_history[-1][0]
            output = current_history[-1][1]
            history = current_history[:-1]
            # Ensure last_timestamp is not None before assignment
            final_timestamp_end = last_timestamp
            assert final_timestamp_end is not None, "Timestamp cannot be None for the final QaPair"
            qa_pair_with_history = QaPair(
                id=-1,
                system=self.system_prompt,
                instruction=instruction,
                output=output,
                history=history,
                time=final_timestamp_end,
                score=0,
            )
            qa_res_with_history.append(qa_pair_with_history)

        return qa_res_with_history

    def group_consecutive_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]:
        """

        将同一个人连续发送的多条消息组合成一条消息,遇到cut_type添加cut



        Args:

            messages: 消息列表



        Returns:

            List[ChatMessage]: 组合后的消息列表

        """
        if not messages:
            return []

        def _combine_text(messages: List[ChatMessage]) -> ChatMessage:
            """

            合并多条消息为一条



            Args:

                messages: 要合并的消息列表



            Returns:

                ChatMessage: 合并后的消息

            """
            base_msg = messages[0]
            combined_content = messages[0].msg

            for i in messages[1:]:
                content = i.msg
                if not content:
                    continue

                if combined_content and combined_content[-1] not in ["。", "!", "?", "…", ",", "."]:
                    combined_content += ","

                combined_content += content
            if len(combined_content) > self.c["combine_msg_max_length"]:
                logger.warning(
                    f"组合后消息长度超过{self.c['combine_msg_max_length']}将截断:\n {combined_content[:50]}"
                )
                combined_content = combined_content[: self.c["combine_msg_max_length"]]

            combined_message = ChatMessage(
                id=base_msg.id,
                MsgSvrID=base_msg.MsgSvrID,
                type_name=base_msg.type_name,
                is_sender=base_msg.is_sender,
                talker=base_msg.talker,
                room_name=base_msg.room_name,
                msg=combined_content,
                src=base_msg.src,
                CreateTime=messages[-1].CreateTime,  # use the timestamp of the last message
            )

            return combined_message

        def _create_cut_message(message: ChatMessage) -> CutMessage:
            return CutMessage(
                is_sender=message.is_sender,
                cut_type=message.type_name,
                CreateTime=message.CreateTime,
            )

        def _combine_current_group(group):
            """

            处理当前消息组并添加到grouped_messages



            Args:

                group: 当前消息组

            """
            if len(group) > 1:
                combined_msg = _combine_text(group)
                grouped_messages.append(combined_msg)
            else:
                grouped_messages.append(group[0])

        grouped_messages = []
        current_group = []

        for current_msg in messages:
            if current_msg.type_name in self.cut_type_list:
                if current_group:
                    # The current group has messages: combine it, then append a cut marker
                    _combine_current_group(current_group)
                    current_group = []

                    cut_msg = _create_cut_message(current_msg)
                    grouped_messages.append(cut_msg)
                else:
                    # The current group is empty: check the previous group
                    if grouped_messages:
                        if not isinstance(grouped_messages[-1], CutMessage):
                            cut_msg = _create_cut_message(current_msg)
                            grouped_messages.append(cut_msg)
                    # If there is no previous group, or its last item is already a CutMessage, just continue
                continue

            if not current_group:
                current_group = [current_msg]
                continue

            last_msg = current_group[-1]

            # Check whether this is a consecutive message from the same person
            if (
                    current_msg.is_sender == last_msg.is_sender
                    and current_msg.talker == last_msg.talker
                    and self.single_combine_strategy.is_same_conversation([last_msg], current_msg)
            ):
                current_group.append(current_msg)
            else:
                # Message from a different person: combine the current group and start a new one
                _combine_current_group(current_group)
                # Start a new group
                current_group = [current_msg]

        # Handle the last group of messages
        if current_group:
            _combine_current_group(current_group)

        return grouped_messages

    def process_by_msgtype(self, chat_message: ChatMessage):
        if chat_message.type_name == "文本":
            self.process_text(chat_message)
        # elif chat_message.type_name == "图片":
        #     self.process_image(chat_message)

    def load_csv(self, file_path) -> List[ChatMessage]:
        """

        做整体第一次预处理,过滤不符合条件的行

        """
        df = pd.read_csv(file_path, encoding="utf-8", dtype={"msg": str})

        df = df[~df["type_name"].isin(values=skip_type_list)]

        # If type_name is "文本" (text) and msg contains a phone number, ID-card number, email address, or URL, drop the row
        for i in df.index:
            if df.loc[i, "type_name"] == "文本":
                msg_str = str(df.loc[i, "msg"])
                if (
                        re.search(r"1\d{10}", msg_str)
                        or re.search(r"\d{18}", msg_str)
                        or re.search(r"\w+@\w+", msg_str)
                        or "http" in msg_str
                        or r"\\xa0" in msg_str
                        or r"\\u" in msg_str
                ):
                    df = df.drop(index=i)
                    continue
                for blocked_word in self.blocked_words:
                    if blocked_word in msg_str:
                        df = df.drop(index=i)
                        break
            else:
                df.loc[i, "msg"] = ""

        df = df.dropna(how="all")
        # Time format: 2021-07-07 10:27:23
        # Iterate rows: merge msg for consecutive rows with the same is_sender; restart when is_sender changes
        df["CreateTime"] = pd.to_datetime(df["CreateTime"])

        return [ChatMessage(*row) for row in df.values]

    def process_text(self, chat_message: ChatMessage):
        pass

    def save_result(self, qa_res: List[QaPair]):
        """

        Saves the list of QaPair objects to a JSON file after converting them to dictionaries.



        Args:

            qa_res: A list of QaPair objects.

        """
        processed_qa_res = []
        for idx, item in enumerate(qa_res):
            item_dict = {
                "id": idx,
                "system": item.system,
                "instruction": item.instruction,
                "output": item.output,
                "history": item.history,
                "time": item.time.isoformat() if item.time else None,
                "score": item.score,
            }
            processed_qa_res.append(item_dict)

        output_path = "./dataset/res_csv/sft/sft-my.json"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(processed_qa_res, f, ensure_ascii=False, indent=4)
        logger.success(f"聊天记录处理成功,共{len(qa_res)}条,保存到 {output_path}")


if __name__ == "__main__":
    processor = DataProcessor()
    processor.main()