# Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# MIT_LICENSE file in the root directory of this source tree.
import argparse | |
import sys | |
from seamless_communication.toxicity import load_etox_bad_word_checker | |
def main() -> None:
    """Run the ETOX bad-word checker as a command-line filter.

    Reads one text per line from ``input`` (STDIN by default), asks the
    checker for the bad words found in each line for the language given as
    ``lang``, and writes a tab-separated report to ``output`` (STDOUT by
    default). The report starts with a header row and has three columns:
    ``text`` (the input line with trailing whitespace removed), ``toxicity``
    (the number of bad words returned by the checker) and ``bad_words``
    (those words joined with commas).
    """
    parser = argparse.ArgumentParser(
        description="ETOX will compute the toxicity level of text inputs (STDIN > STDOUT)."
    )
    parser.add_argument(
        "lang",
        type=str,
        help="Language of the input text to check for toxicity",
    )
    parser.add_argument(
        "input",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="Input file with one text per line (default: STDIN)",
    )
    parser.add_argument(
        "output",
        nargs="?",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="Output TSV file (default: STDOUT)",
    )
    # NOTE(review): unrecognised extra arguments are silently ignored here;
    # kept for backward compatibility -- confirm whether this is intended.
    args, _unknown = parser.parse_known_args()

    try:
        bad_word_checker = load_etox_bad_word_checker("mintox")

        print("text", "toxicity", "bad_words", sep="\t", file=args.output)
        for line in args.input:
            text = line.rstrip()
            bad_words = bad_word_checker.get_bad_words(
                text=text,
                lang=args.lang,
            )
            print(
                text,
                len(bad_words),
                ",".join(bad_words),
                sep="\t",
                file=args.output,
            )
    finally:
        # argparse.FileType opens files eagerly and never closes them.
        # Close what was opened here, but leave the process-wide standard
        # streams alone (FileType also returns them for the "-" argument).
        for stream in (args.input, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()