victan committed on
Commit
f09b12c
1 Parent(s): e38a9e2

Upload seamless_communication/cli/toxicity/etox.py with huggingface_hub

Browse files
seamless_communication/cli/toxicity/etox.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # MIT_LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import sys
9
+
10
+ from seamless_communication.toxicity import load_etox_bad_word_checker
11
+
12
+
13
def main() -> None:
    """Run the ETOX command-line tool.

    Reads text line by line from ``input`` (STDIN by default), asks the bad
    word checker which bad words each line contains for the language given
    as ``lang``, and writes a tab-separated report to ``output`` (STDOUT by
    default). The report starts with a ``text / toxicity / bad_words`` header;
    for each input line, ``toxicity`` is the number of bad words returned by
    the checker and ``bad_words`` is those words joined with commas.
    """
    parser = argparse.ArgumentParser(
        description="ETOX will compute the toxicity level of text inputs (STDIN > STDOUT)."
    )
    parser.add_argument(
        "lang",
        type=str,
        help="Language of the input text to check for toxicity",
    )
    parser.add_argument(
        "input",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="Text file to read, one text per line (default: STDIN)",
    )
    parser.add_argument(
        "output",
        nargs="?",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="File to write the tab-separated report to (default: STDOUT)",
    )
    # parse_known_args: extra, unrecognized arguments are ignored rather
    # than rejected.
    args, _unknown = parser.parse_known_args()

    try:
        bad_word_checker = load_etox_bad_word_checker("mintox")

        print("text", "toxicity", "bad_words", sep="\t", file=args.output)
        for line in args.input:
            # Drop the trailing newline (and any other trailing whitespace)
            # so each report row stays on a single line.
            text = line.rstrip()
            bad_words = bad_word_checker.get_bad_words(
                text=text,
                lang=args.lang,
            )
            print(
                text,
                len(bad_words),
                ",".join(bad_words),
                sep="\t",
                file=args.output,
            )
    finally:
        # argparse.FileType opened these handles; close the ones that are
        # real files so output is flushed deterministically, but never close
        # the process's own standard streams.
        if args.input is not sys.stdin:
            args.input.close()
        if args.output is not sys.stdout:
            args.output.close()
40
+
41
+
42
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()