def html_to_markdown(val: str) -> str:
    """Convert one HTML message body to cleaned-up markdown text.

    NOTE(review): the ``def`` line of this function lies outside the visible
    source. The signature was reconstructed from the single-argument call in
    ``clean_html_one_sample`` and from the ``val`` local / ``return val`` in
    the body -- confirm against the full file. Any statements that preceded
    the first visible one are not reproduced here.

    Relies on the module-level names ``div_pattern``, ``span_pattern``,
    ``regenerate_pattern``, ``copy_chars_pattern``, ``copy_code_pattern`` and
    ``reformat_code``, which are defined elsewhere in the file.
    """
    # Remove everything matched by div_pattern. This is required to make
    # indentation work in code blocks.
    val = re.sub(div_pattern, "", val)
    # Remove everything matched by span_pattern. This is required to make
    # underscores work in code blocks.
    val = re.sub(span_pattern, "", val)
    # Convert HTML to markdown
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]
    # Remove noisy "Copy[number] chars / [number] words"
    val = re.sub(copy_chars_pattern, "", val)
    # Remove empty code block ```\nCopy code\n```
    val = re.sub(copy_code_pattern, "", val)

    # Strip
    val = val.replace("\n\n\n", "\n").strip()

    return val


def contain_blocked_words(val: str) -> bool:
    """Return True if ``val`` contains a blocked word, ignoring case."""
    blocked_words = ["openai", "chatgpt"]
    # Lower-case once instead of once per blocked word.
    lowered = val.lower()
    return any(w in lowered for w in blocked_words)


def clean_html_one_sample(sample):
    """Normalize one conversation sample and convert its messages to markdown.

    The sample's ``"conversations"`` list is trimmed so that it starts with a
    "human" turn and does not end with one, then every turn's ``"value"`` is
    replaced in place by the output of ``html_to_markdown``.

    Returns:
        A ``(sample, error_code)`` tuple where ``error_code`` is
        0 -- success,
        1 -- one turn or fewer remain (checked before and after trimming),
        2 -- roles do not strictly alternate human/gpt,
        3 -- a turn contains a blocked word,
        4 -- ``html_to_markdown`` raised a parser error or AssertionError.
        On a non-zero code, turns processed before the failure have already
        been rewritten in place.
    """
    roles = ["human", "gpt"]

    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Drop a trailing human turn that has no reply.
    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    for i, c in enumerate(sample["conversations"]):
        if c["from"] != roles[i % 2]:
            return (sample, 2)

        if contain_blocked_words(c["value"]):
            return (sample, 3)

        try:
            new_val = html_to_markdown(c["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        c["value"] = new_val

    return (sample, 0)


def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Slices ``content[begin:end]``, runs ``clean_html_one_sample`` on every
    sample in a process pool, then drops samples that failed cleaning or that
    duplicate an earlier sample. A sample is a duplicate when its ``"id"`` was
    already kept, or when the pair (value of its second turn, number of turns)
    matches an already kept sample. A reason is printed for every skipped
    sample and a summary line is printed at the end.

    Returns:
        The list of kept samples, in input order.

    Raises:
        ValueError: if a worker returns an error code outside 0..4.
    """
    content = content[begin:end]

    with ProcessPoolExecutor() as executor:
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    # Error code -> text printed after "id <cid> ".
    error_messages = {
        1: "is too short",
        2: "has a wrong format",
        3: "contains blocked words",
        4: "contains parser errors",
    }
    error_counts = dict.fromkeys(error_messages, 0)
    cnt_id_duplication = 0
    cnt_value_duplication = 0

    # Maps both sample ids and (second turn value, turn count) keys to the id
    # of the first sample that was kept with them.
    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]

        if error_code != 0:
            if error_code not in error_messages:
                raise ValueError(f"Invalid error_code: {error_code}")
            print(f"id {cid} {error_messages[error_code]}")
            error_counts[error_code] += 1
            continue

        if cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            continue

        # Error code 0 guarantees at least two turns, so index 1 exists.
        key = (sample["conversations"][1]["value"], len(sample["conversations"]))
        if key in visited:
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
            continue

        visited[cid] = visited[key] = cid
        new_content.append(sample)

    # Every processed sample is either kept or skipped.
    cnt_skip = len(processed) - len(new_content)
    cnt_too_short = error_counts[1]
    cnt_wrong_format = error_counts[2]
    cnt_blocked_words = error_counts[3]
    cnt_parser_error = error_counts[4]

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content


def main(args):
    """Load the input JSON, clean it, and write the result as indented JSON.

    ``args`` is a mapping with the keys ``"in_file"``, ``"out_file"``,
    ``"begin"`` and ``"end"``.
    """
    # Context managers ensure both files are closed (and the output flushed)
    # even if loading, cleaning or dumping raises.
    with open(args["in_file"], "r") as fin:
        content = json.load(fin)
    content = clean_html_all(content, args["begin"], args["end"])
    with open(args["out_file"], "w") as fout:
        json.dump(content, fout, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    # NOTE(review): --debug is parsed but main() never reads it.
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    main(vars(args))