kadirnar committed on
Commit
7cbe3b6
·
1 Parent(s): d6541b7

added multilingual_translation library

Browse files
Files changed (3) hide show
  1. app.py +22 -4
  2. requirements.txt +2 -1
  3. utils.py +106 -0
app.py CHANGED
@@ -1,5 +1,8 @@
1
  from transformers import pipeline, set_seed
2
  from transformers import BioGptTokenizer, BioGptForCausalLM
 
 
 
3
  import gradio as gr
4
 
5
  model_list = [
@@ -7,18 +10,30 @@ model_list = [
7
  "microsoft/BioGPT-Large-PubMedQA"
8
  ]
9
 
 
 
 
 
 
 
 
 
 
 
10
  def biogpt(
11
  prompt: str,
12
  model_id: str,
13
  max_length: int = 25,
14
- num_return_sequences: int = 5
 
15
  ):
16
 
 
17
  model = BioGptForCausalLM.from_pretrained(model_id)
18
  tokenizer = BioGptTokenizer.from_pretrained(model_id)
19
  generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
20
  set_seed(42)
21
- output = generator(prompt, max_length=max_length, num_return_sequences=num_return_sequences, do_sample=True)
22
  output_dict = {
23
  "1": output[0]['generated_text'],
24
  "2": output[1]['generated_text'],
@@ -33,13 +48,16 @@ inputs = [
33
  gr.inputs.Textbox(label="Prompt", lines=5, default="COVID-19 is"),
34
  gr.Dropdown(model_list, value="microsoft/biogpt", label="Model ID"),
35
  gr.inputs.Slider(5, 100, 25, default=25, label="Max Length"),
36
- gr.inputs.Slider(1, 10, 5, default=5, label="Num Return Sequences")
 
37
  ]
38
 
39
  outputs = gr.outputs.Textbox(label="Output")
40
 
41
  examples = [
42
- ["COVID-19 is", "microsoft/biogpt", 25, 5]
 
 
43
  ]
44
 
45
  title = " BioGPT: Generative Pre-trained Transformer for Biomedical Text Generation and Mining"
 
1
  from transformers import pipeline, set_seed
2
  from transformers import BioGptTokenizer, BioGptForCausalLM
3
+ from multilingual_translation import translate
4
+ from utils import lang_ids
5
+
6
  import gradio as gr
7
 
8
  model_list = [
 
10
  "microsoft/BioGPT-Large-PubMedQA"
11
  ]
12
 
13
# Language display names (the keys of lang_ids) offered in the
# "Base Language" dropdown.
lang_list = [*lang_ids]
14
+
15
def translate_to_english(text, base_lang):
    """Return *text* in English, translating it when it is in another language.

    Args:
        text: The prompt to translate.
        base_lang: The language *text* is written in. May be either a short
            language code (e.g. ``"en"``, ``"tr"``) or a display name that is
            a key of ``lang_ids`` (e.g. ``"English"``, ``"Turkish"``).

    Returns:
        ``text`` unchanged when the language resolves to ``"en"``; otherwise
        the result of ``translate`` with the ``facebook/m2m100_418M`` model.
    """
    # The language dropdown is populated with the *names* in lang_ids, while
    # the default value and the examples use *codes*. Resolve a name to its
    # code and let anything that is not a known name pass through unchanged.
    lang_code = lang_ids.get(base_lang, base_lang)

    if lang_code == "en":
        return text

    return translate("facebook/m2m100_418M", text, lang_code, "en")
21
+
22
+
23
  def biogpt(
24
  prompt: str,
25
  model_id: str,
26
  max_length: int = 25,
27
+ num_return_sequences: int = 5,
28
+ base_lang: str = "en"
29
  ):
30
 
31
+ en_prompt = translate_to_english(prompt, base_lang)
32
  model = BioGptForCausalLM.from_pretrained(model_id)
33
  tokenizer = BioGptTokenizer.from_pretrained(model_id)
34
  generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
35
  set_seed(42)
36
+ output = generator(en_prompt, max_length=max_length, num_return_sequences=num_return_sequences, do_sample=True)
37
  output_dict = {
38
  "1": output[0]['generated_text'],
39
  "2": output[1]['generated_text'],
 
48
  gr.inputs.Textbox(label="Prompt", lines=5, default="COVID-19 is"),
49
  gr.Dropdown(model_list, value="microsoft/biogpt", label="Model ID"),
50
  gr.inputs.Slider(5, 100, 25, default=25, label="Max Length"),
51
+ gr.inputs.Slider(1, 10, 5, default=5, label="Num Return Sequences"),
52
+ gr.Dropdown(lang_list, value="en", label="Base Language")
53
  ]
54
 
55
  outputs = gr.outputs.Textbox(label="Output")
56
 
57
  examples = [
58
+ ["COVID-19 is", "microsoft/biogpt", 25, 5, "en"],
59
+ ["Kanser", "microsoft/BioGPT-Large-PubMedQA", 25, 5, "tr"],
60
+ ["Covid-19 est", "microsoft/biogpt", 25, 5, "fr"],
61
  ]
62
 
63
  title = " BioGPT: Generative Pre-trained Transformer for Biomedical Text Generation and Mining"
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  transformers==4.26.0
2
  sacremoses
3
- torch
 
 
1
  transformers==4.26.0
2
  sacremoses
3
+ torch
4
+ multilingual_translation
utils.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
+
4
+
5
# Maps a human-readable language name to its short language code.
# NOTE(review): the codes look like the language ids accepted by the M2M100
# translation model used in app.py -- confirm against the model card.
lang_ids = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",  # was misspelled "Greeek"
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan": "oc",
    "Oriya": "or",
    "Panjabi": "pa",
    "Polish": "pl",
    "Pushto": "ps",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
}