|
""" |
|
- Convert HTML to Markdown with basic data cleaning.
|
- Deduplication. |
|
|
|
Usage: |
|
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json |
|
""" |
|
import argparse |
|
import json |
|
import logging |
|
import re |
|
from typing import Dict, Union |
|
|
|
import bs4 |
|
import markdownify |
|
import tqdm |
|
|
|
|
|
# Regexes for stripping ChatGPT-export HTML and UI artifacts.
# All patterns use raw strings: the originals relied on invalid string escape
# sequences (\s, \d, \g) being passed through literally, which emits
# DeprecationWarning/SyntaxWarning on modern Python.
div_pattern = re.compile(r"<div.*?>")  # opening <div ...> tags
span_pattern = re.compile(r"<span.*?>")  # opening <span ...> tags
# A fenced code block whose language tag is glued to the "Copy code" button
# text, e.g. "```pythonCopy code<code>```".
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template: rebuild the fence as "```<lang>\n<code>\n```".
# (\n in a re.sub template is expanded to a newline, so the raw string is
# equivalent to the original literal-newline version.)
code_lang_format = r"```\g<1>\n\g<2>\n```"
# "N / M" regenerate-response counter prepended by the ChatGPT UI.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "CopyN chars / M words" artifact from the copy button.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# An otherwise-empty code fence containing only the "Copy code" button text.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
|
|
|
|
|
def reformat_code(val: str) -> str:
    """Normalize fenced code blocks mangled by the ChatGPT "Copy code" button.

    Rewrites a fence whose language tag is fused with the "Copy code" button
    text into a conventional fence with the language tag on the opening line
    and the code on its own lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)
|
|
|
|
|
def html_to_markdown(val: str) -> str:
    """Convert a raw ShareGPT HTML value to cleaned markdown.

    Strips <div>/<span> opening tags, converts the remaining HTML with
    markdownify, repairs code fences, and removes ChatGPT UI artifacts
    ("N / M" regenerate counters, "Copy ... chars / ... words", empty
    "Copy code" fences).

    Args:
        val: the raw HTML string of one conversation turn.

    Returns:
        The cleaned markdown string.
    """
    # Remove opening <div>/<span> tags; markdownify handles the rest
    # (including the matching closing tags).
    val = re.sub(div_pattern, "", val)
    val = re.sub(span_pattern, "", val)
    val = markdownify.markdownify(val).strip()

    # Re-attach the language tag that the "Copy code" button split off.
    val = reformat_code(val)

    # Drop a leading "N / M" regenerate counter, but only at position 0 so
    # legitimate "N / M" text inside the body is preserved.
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]

    # Remove the remaining copy-button artifacts.
    val = re.sub(copy_chars_pattern, "", val)
    val = re.sub(copy_code_pattern, "", val)

    # Collapse triple newlines (single pass, matching the original behavior).
    val = val.replace("\n\n\n", "\n").strip()

    # Debug hook: print the first cleaned value and stop. The original read
    # the module-global `args` unconditionally, which raised NameError when
    # this module was imported (``args`` only exists when run as a script).
    # Guard it so the function is importable; behavior under the CLI with
    # --debug is unchanged.
    cli_args = globals().get("args")
    if cli_args is not None and getattr(cli_args, "debug", False):
        print(val)
        raise SystemExit  # safer than exit(), which needs the `site` module

    return val
|
|
|
|
|
def should_filter(val: str) -> bool:
    """Return True if *val* mentions a blacklisted keyword (case-insensitive)."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))
|
|
|
|
|
def clean_html_source(content, begin, end, check_tag, check_num):
    """
    Clean the input json content.

    Deduplicates samples (by id, and by the pair of first-reply value and
    conversation length), drops samples that are too short or contain
    blacklisted words, and converts each conversation value from HTML to
    markdown.

    Args:
        content: json file loaded in memory.
        begin: slice start index into `content` (may be None for "from start").
        end: slice end index into `content` (may be None for "to end").
        check_tag: a debug purpose arg. If a conversation contains the tag, log
            it before and after cleaning.
        check_num: number of matched conversations logged.

    Returns:
        The list of kept (cleaned) samples.
    """
    # Visual separator used in the debug log output below.
    BARRIER = "\n" + "=" * 20 + "\n"
    # Counters for the summary printed at the end.
    cnt_skip = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_filter = 0
    cnt_tag = 0
    # Maps BOTH sample ids and (first-reply-value, num-turns) tuples to the id
    # of the first sample seen with that id/key.
    visited = {}

    content = content[begin:end]
    new_content = []

    for sample in tqdm.tqdm(content):
        skipped = False
        cid = sample["id"]

        if len(sample["conversations"]) <= 1:
            # A conversation needs at least a prompt and a reply.
            print(f"id {cid} is too short")
            cnt_too_short += 1
            skipped = True
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            skipped = True
        elif (
            sample["conversations"][1]["value"],
            len(sample["conversations"]),
        ) in visited:
            # Same first reply and same number of turns => treated as a
            # duplicate even though the ids differ.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
            skipped = True
        else:
            # First time we see this sample: record both lookup keys, then
            # clean every turn of the conversation in place.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            visited[cid] = visited[key] = cid

            for c in sample["conversations"]:
                if should_filter(c["value"]):
                    print(f"id {cid} is filtered out")
                    cnt_filter += 1
                    skipped = True
                    break

                try:
                    new_val = html_to_markdown(c["value"])
                except (bs4.builder.ParserRejectedMarkup, AssertionError):
                    # Unparseable HTML: drop the whole sample.
                    skipped = True
                    break

                c["value"] = new_val

                # Debug logging: dump up to `check_num` cleaned turns that
                # contain `check_tag`, delimited by BARRIER markers.
                if (
                    check_tag is not None
                    and check_tag in c["value"]
                    and cnt_tag < check_num
                ):
                    logging.debug(
                        BARRIER
                        + c["value"]
                        + "\n"
                        + BARRIER
                        + new_val
                        + "\n"
                        + BARRIER
                        + "\n"
                    )
                    cnt_tag += 1
                    if cnt_tag == check_num:
                        break

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_filter: {cnt_filter}"
    )

    return new_content
|
|
|
|
|
def main(args):
    """Load the input json, clean it, and write the result.

    Args:
        args: dict of CLI options with keys "in_file", "out_file", "begin",
            "end", "check_tag", and "check_num".
    """
    # Use context managers so file handles are closed deterministically;
    # the original left both files to be closed by the garbage collector.
    with open(args["in_file"], "r") as fin:
        content = json.load(fin)
    content = clean_html_source(
        content, args["begin"], args["end"], args["check_tag"], args["check_num"]
    )
    with open(args["out_file"], "w") as fout:
        json.dump(content, fout, indent=2)
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point. `args` is deliberately left as a module-level
    # global: html_to_markdown() reads args.debug directly.
    parser = argparse.ArgumentParser()
    cli_options = [
        ("--in-file", {"type": str, "required": True}),
        ("--out-file", {"type": str, "default": "sharegpt_clean.json"}),
        ("--begin", {"type": int}),
        ("--end", {"type": int}),
        ("--debug", {"action": "store_true"}),
        ("--check-tag", {"type": str}),
        ("--check-num", {"type": int, "default": 1}),
    ]
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()
    main(vars(args))
|
|