# ChatVID / model / fastchat / data / clean_sharegpt.py
# Web file-viewer residue (not part of the module): "Yiqin's picture",
# commit "init" (6ef31de), "raw", "history blame", 5.83 kB.
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.
Usage:
python3 -m fastchat.data.clean_sharegpt --in-file sharegpt_html.json --out-file sharegpt_clean.json
"""
import argparse
import json
import logging
import re
from typing import Dict, Union
import bs4
import markdownify # == 0.11.6
import tqdm
# Opening <div ...> / <span ...> tags; html_to_markdown strips them before
# conversion (closing tags are left for markdownify to handle).
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")

# A scraped code block looks like "```<language>Copy code<code>```".
# Group 1 captures the language label, group 2 the code itself; DOTALL lets
# the code span multiple lines.
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template for code_lang_pattern. The group-reference backslashes
# are escaped explicitly (instead of relying on an invalid "\g" escape) while
# "\n" stays a real newline, so the string value is unchanged.
code_lang_format = "```\\g<1>\n\\g<2>\n```"

# "[number] / [number]" regenerate counter.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "Copy[number] chars / [number] words" widget text.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# Code fence that contains nothing but the "Copy code" caption.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
def reformat_code(val: str) -> str:
    """Rewrite scraped code fences into regular markdown code fences.

    In the input, the language label and the "Copy code" caption are fused
    onto the code right after the opening fence:

        ```
        $<language>Copy code$<exact_code_here>

        ```

    Every such block is rewritten through ``code_lang_format``, which puts
    the language directly after the opening fence and the code on its own
    lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)
def html_to_markdown(val: str) -> str:
    """Convert one HTML message into cleaned-up markdown.

    Steps: strip opening <div>/<span> tags, run markdownify, normalise code
    fences, then remove scraping noise (leading "N / M" counter,
    "CopyN chars / M words" text, empty "Copy code" blocks) and collapse
    triple newlines.

    Args:
        val: HTML source of a single conversation value.

    Returns:
        The cleaned markdown text.

    Raises:
        SystemExit: when the script was started with ``--debug``; the first
            converted value is printed and the process stops.
    """
    # Remove all <div>. This is required to make indent work in code blocks.
    val = div_pattern.sub("", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = span_pattern.sub("", val)

    # HTML to markdown
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning. match() anchors at
    # position 0, which is exactly the "search result starts at 0" condition.
    noise = regenerate_pattern.match(val)
    if noise:
        val = val[noise.end():]
    # Remove noisy "Copy[number] chars / [number] words"
    val = copy_chars_pattern.sub("", val)
    # Remove empty code block ```\nCopy code\n```
    val = copy_code_pattern.sub("", val)

    # Strip
    val = val.replace("\n\n\n", "\n").strip()

    # `args` is only bound as a module global when this file runs as a script.
    # Look it up defensively so importing the module and calling this function
    # does not fail with NameError.
    cli_args = globals().get("args")
    if getattr(cli_args, "debug", False):
        print(val)
        # Same effect as the interactive-only exit() helper: stop with status 0.
        raise SystemExit(0)

    return val
def should_filter(val: str) -> bool:
    """Return True when *val* mentions a blacklisted word, ignoring case."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))
def clean_html_source(content, begin, end, check_tag, check_num):
    """
    Clean the input json content.

    A sample is skipped when it has at most one conversation turn, when its
    id was already seen, when its (second turn value, number of turns) pair
    was already seen, when any turn is rejected by should_filter, or when
    html_to_markdown fails to parse a turn. The turn values of processed
    samples are replaced in place by their markdown conversion.

    Args:
        content: json file loaded in memory.
        begin: start index of the slice of `content` to process (may be None).
        end: end index (exclusive) of the slice to process (may be None).
        check_tag: a debug purpose arg. If a cleaned conversation value
            contains the tag, log it before and after cleaning.
        check_num: number of matched conversations logged.

    Returns:
        The list of samples that were not skipped.
    """
    BARRIER = "\n" + "=" * 20 + "\n"
    cnt_skip = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_filter = 0
    cnt_tag = 0
    # Maps both sample ids and (second value, length) keys to the id of the
    # first sample that carried them.
    visited = {}

    content = content[begin:end]
    new_content = []

    for sample in tqdm.tqdm(content):
        skipped = False
        cid = sample["id"]
        turns = sample["conversations"]

        if len(turns) <= 1:
            print(f"id {cid} is too short")
            cnt_too_short += 1
            skipped = True
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            skipped = True
        else:
            # Only built once the length check passed, so turns[1] exists.
            key = (turns[1]["value"], len(turns))
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
                skipped = True
            else:
                visited[cid] = visited[key] = cid

                for c in turns:
                    # Keep the raw value: the debug log below must show the
                    # text *before* cleaning, not the overwritten one.
                    old_val = c["value"]

                    if should_filter(old_val):
                        print(f"id {cid} is filtered out")
                        cnt_filter += 1
                        skipped = True
                        break

                    try:
                        new_val = html_to_markdown(old_val)
                    except (bs4.builder.ParserRejectedMarkup, AssertionError):
                        skipped = True
                        break

                    c["value"] = new_val

                    # Debug
                    if (
                        check_tag is not None
                        and check_tag in new_val
                        and cnt_tag < check_num
                    ):
                        logging.debug(
                            BARRIER
                            + old_val
                            + "\n"
                            + BARRIER
                            + new_val
                            + "\n"
                            + BARRIER
                            + "\n"
                        )
                        cnt_tag += 1
                        if cnt_tag == check_num:
                            # NOTE(review): this also stops cleaning the
                            # remaining turns of the current sample, which is
                            # still appended below -- confirm this is intended.
                            break

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_filter: {cnt_filter}"
    )

    return new_content
def main(args):
    """Load the input json, clean it, and write the cleaned json.

    Args:
        args: mapping with the keys "in_file", "out_file", "begin", "end",
            "check_tag" and "check_num".
    """
    # Context managers close both files deterministically; the output is
    # flushed to disk before main() returns instead of whenever the file
    # object happens to be garbage collected.
    with open(args["in_file"], "r") as fin:
        content = json.load(fin)
    content = clean_html_source(
        content, args["begin"], args["end"], args["check_tag"], args["check_num"]
    )
    with open(args["out_file"], "w") as fout:
        json.dump(content, fout, indent=2)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    # Optional slice bounds applied to the loaded sample list.
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    # Print the first converted value and stop (see html_to_markdown).
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--check-tag", type=str)
    parser.add_argument("--check-num", type=int, default=1)
    # NOTE: this must remain a module-level global named `args`;
    # html_to_markdown reads its `debug` attribute from the module namespace.
    args = parser.parse_args()
    # The --check-tag report is emitted through logging.debug(), which the
    # root logger drops at its default WARNING level. Enable DEBUG output only
    # when that report was actually requested.
    if args.check_tag is not None:
        logging.basicConfig(level=logging.DEBUG)
    main(vars(args))