tonic committed on
Commit
4889640
1 Parent(s): f8ee061

language list and prompt engineering ;-)

Browse files
Files changed (2) hide show
  1. app.py +19 -13
  2. lang_list.py +255 -0
app.py CHANGED
@@ -5,6 +5,7 @@ from surya.ocr import run_ocr
5
  from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
6
  from surya.model.recognition.model import load_model as load_rec_model
7
  from surya.model.recognition.processor import load_processor as load_rec_processor
 
8
  from gradio_client import Client
9
  from dotenv import load_dotenv
10
  import requests
@@ -16,12 +17,14 @@ import re
16
 
17
  title = "# Welcome to AyaTonic"
18
  description = "Learn a New Language With Aya"
19
-
20
  # Load environment variables
21
  load_dotenv()
22
  COHERE_API_KEY = os.getenv('CO_API_KEY')
23
  SEAMLESSM4T = os.getenv('SEAMLESSM4T')
24
 
 
 
 
25
 
26
  # Regular expression patterns for each color
27
  patterns = {
@@ -66,14 +69,14 @@ class TaggedPhraseExtractor:
66
  co = cohere.Client(COHERE_API_KEY)
67
  audio_client = Client(SEAMLESSM4T)
68
 
69
- def process_audio_to_text(audio_path):
70
  """
71
  Convert audio input to text using the Gradio client.
72
  """
73
  result = audio_client.predict(
74
  audio_path,
75
- "English",
76
- "English",
77
  api_name="/s2tt"
78
  )
79
  print("Audio Result: ", result)
@@ -85,8 +88,8 @@ def process_text_to_audio(text, target_language="English"):
85
  """
86
  result = audio_client.predict(
87
  text,
88
- "English",
89
  target_language,
 
90
  api_name="/t2st"
91
  )
92
  return result['audio'] # Adjust based on the actual response
@@ -141,20 +144,22 @@ def process_input(image=None, file=None, audio=None, text=""):
141
  audio_text = process_audio_to_text(audio)
142
  final_text += "\n" + audio_text
143
 
 
 
144
  response = co.generate(
145
  model='c4ai-aya',
146
- prompt=final_text,
147
  max_tokens=1024,
148
  temperature=0.5
149
  )
 
150
  generated_text = response.generations[0].text
151
  print("Generated Text: ", generated_text)
152
-
153
- # Process generated text with command-nightly model
154
  response = co.generate(
155
  model='command-nightly',
156
- prompt=generated_text,
157
- max_tokens=1024,
158
  temperature=0.5
159
  )
160
  processed_text = response.generations[0].text
@@ -162,7 +167,6 @@ def process_input(image=None, file=None, audio=None, text=""):
162
  audio_output = process_text_to_audio(processed_text)
163
 
164
  return processed_text, audio_output
165
-
166
  # Define Gradio interface
167
  iface = gr.Interface(
168
  fn=process_input,
@@ -187,7 +191,7 @@ if __name__ == "__main__":
187
  # co = cohere.Client('yhA228YGeZSl1ctten8LQxw2dky2nngHetXFjV2Q') # This is your trial API key
188
  # response = co.generate(
189
  # model='c4ai-aya',
190
- # prompt='एक यांत्रिक घड़ी दिन के समय को प्रदान करने के लिए एक गैर-इलेक्ट्रॉनिक तंत्र का उपयोग करती है। एक मुख्य स्प्रिंग का उपयोग यांत्रिक तंत्र को ऊर्जा संग्रहीत करने के लिए किया जाता है। एक यांत्रिक घड़ी में दांतों का एक कुंडल होता है जो धीरे-धीरे मुख्य स्प्रिंग से संचालित होता है। दांतों के कुंडल को एक यांत्रिक तंत्र में स्थानांतरित करने के लिए पहियों की एक श्रृंखला का उपयोग किया जाता है जो हाथों को घड़ी के चेहरे पर दाईं ओर ले जाता है। घड़ी के तंत्र को स्थिर करने और यह सुनिश्चित करने के लिए कि हाथ सही दिशा में घूमते हैं, एक कंपन का उपयोग किया जाता है।\n\nProduce a complete blog post in FRENCH based on the above : ',
191
  # max_tokens=3674,
192
  # temperature=0.9,
193
  # k=0,
@@ -224,7 +228,9 @@ iface = gr.Interface(
224
  gr.Image(type="pil", label="Camera Input"),
225
  gr.File(label="File Upload"),
226
  gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
227
- gr.Textbox(lines=2, label="Text Input")
 
 
228
  ],
229
  outputs=[
230
  gr.RichTextbox(label="Processed Text"),
 
5
  from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
6
  from surya.model.recognition.model import load_model as load_rec_model
7
  from surya.model.recognition.processor import load_processor as load_rec_processor
8
+ from lang_list import LANGUAGE_NAME_TO_CODE, TEXT_SOURCE_LANGUAGE_NAMES, S2ST_TARGET_LANGUAGE_NAMES
9
  from gradio_client import Client
10
  from dotenv import load_dotenv
11
  import requests
 
17
 
18
  title = "# Welcome to AyaTonic"
19
  description = "Learn a New Language With Aya"
 
20
  # Load environment variables
21
  load_dotenv()
22
  COHERE_API_KEY = os.getenv('CO_API_KEY')
23
  SEAMLESSM4T = os.getenv('SEAMLESSM4T')
24
 
25
+ inputlanguage = ""
26
+ producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
27
+ formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
28
 
29
  # Regular expression patterns for each color
30
  patterns = {
 
69
  co = cohere.Client(COHERE_API_KEY)
70
  audio_client = Client(SEAMLESSM4T)
71
 
72
+ def process_audio_to_text(audio_path, inputlanguage="English"):
73
  """
74
  Convert audio input to text using the Gradio client.
75
  """
76
  result = audio_client.predict(
77
  audio_path,
78
+ inputlanguage,
79
+ inputlanguage,
80
  api_name="/s2tt"
81
  )
82
  print("Audio Result: ", result)
 
88
  """
89
  result = audio_client.predict(
90
  text,
 
91
  target_language,
92
+ target_language, # could be make a variation for learning content
93
  api_name="/t2st"
94
  )
95
  return result['audio'] # Adjust based on the actual response
 
144
  audio_text = process_audio_to_text(audio)
145
  final_text += "\n" + audio_text
146
 
147
+ final_text_with_producetext = final_text + producetext
148
+
149
  response = co.generate(
150
  model='c4ai-aya',
151
+ prompt=final_text_with_producetext,
152
  max_tokens=1024,
153
  temperature=0.5
154
  )
155
+ # add graceful handling for errors (overflow)
156
  generated_text = response.generations[0].text
157
  print("Generated Text: ", generated_text)
158
+ generated_text_with_format = generated_text + "\n" + formatinputstring
 
159
  response = co.generate(
160
  model='command-nightly',
161
+ prompt=generated_text_with_format,
162
+ max_tokens=4000,
163
  temperature=0.5
164
  )
165
  processed_text = response.generations[0].text
 
167
  audio_output = process_text_to_audio(processed_text)
168
 
169
  return processed_text, audio_output
 
170
  # Define Gradio interface
171
  iface = gr.Interface(
172
  fn=process_input,
 
191
  # co = cohere.Client('yhA228YGeZSl1ctten8LQxw2dky2nngHetXFjV2Q') # This is your trial API key
192
  # response = co.generate(
193
  # model='c4ai-aya',
194
+ # prompt='एक यांत्रिक घड़ी दिन के समय को प्रदान करने के लिए एक गैर-इलेक्ट्रॉनिक तंत्र का उपयोग करती है। एक मुख्य स्प्रिंग का उपयोग यांत्रिक तंत्र को ऊर्जा संग्रहीत करने के लिए किया जाता है। एक यांत्रिक घड़ी में दांतों का एक कुंडल होता है जो धीरे-धीरे मुख्य स्प्रिंग से संचालित होता है। दांतों के कुंडल को एक यांत्रिक तंत्र में स्थानांतरित करने के लिए पहियों की एक श्रृंखला का उपयोग किया जाता है जो हाथों को घड़ी के चेहरे पर दाईं ओर ले जाता है। घड़ी के तंत्र को स्थिर करने और यह सुनिश्चित करने के लिए कि हाथ सही दिशा में घूमते हैं, एक कंपन का उपयोग किया जाता है। ',
195
  # max_tokens=3674,
196
  # temperature=0.9,
197
  # k=0,
 
228
  gr.Image(type="pil", label="Camera Input"),
229
  gr.File(label="File Upload"),
230
  gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
231
+ gr.Textbox(lines=2, label="Text Input"),
232
+ gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Input Language"),
233
+ gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Target Language")
234
  ],
235
  outputs=[
236
  gr.RichTextbox(label="Processed Text"),
lang_list.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Language dict from https://huggingface.co/spaces/facebook/seamless-m4t-v2-large/blob/main/lang_list.py
2
+ language_code_to_name = {
3
+ "afr": "Afrikaans",
4
+ "amh": "Amharic",
5
+ "arb": "Modern Standard Arabic",
6
+ "ary": "Moroccan Arabic",
7
+ "arz": "Egyptian Arabic",
8
+ "asm": "Assamese",
9
+ "ast": "Asturian",
10
+ "azj": "North Azerbaijani",
11
+ "bel": "Belarusian",
12
+ "ben": "Bengali",
13
+ "bos": "Bosnian",
14
+ "bul": "Bulgarian",
15
+ "cat": "Catalan",
16
+ "ceb": "Cebuano",
17
+ "ces": "Czech",
18
+ "ckb": "Central Kurdish",
19
+ "cmn": "Mandarin Chinese",
20
+ "cym": "Welsh",
21
+ "dan": "Danish",
22
+ "deu": "German",
23
+ "ell": "Greek",
24
+ "eng": "English",
25
+ "est": "Estonian",
26
+ "eus": "Basque",
27
+ "fin": "Finnish",
28
+ "fra": "French",
29
+ "gaz": "West Central Oromo",
30
+ "gle": "Irish",
31
+ "glg": "Galician",
32
+ "guj": "Gujarati",
33
+ "heb": "Hebrew",
34
+ "hin": "Hindi",
35
+ "hrv": "Croatian",
36
+ "hun": "Hungarian",
37
+ "hye": "Armenian",
38
+ "ibo": "Igbo",
39
+ "ind": "Indonesian",
40
+ "isl": "Icelandic",
41
+ "ita": "Italian",
42
+ "jav": "Javanese",
43
+ "jpn": "Japanese",
44
+ "kam": "Kamba",
45
+ "kan": "Kannada",
46
+ "kat": "Georgian",
47
+ "kaz": "Kazakh",
48
+ "kea": "Kabuverdianu",
49
+ "khk": "Halh Mongolian",
50
+ "khm": "Khmer",
51
+ "kir": "Kyrgyz",
52
+ "kor": "Korean",
53
+ "lao": "Lao",
54
+ "lit": "Lithuanian",
55
+ "ltz": "Luxembourgish",
56
+ "lug": "Ganda",
57
+ "luo": "Luo",
58
+ "lvs": "Standard Latvian",
59
+ "mai": "Maithili",
60
+ "mal": "Malayalam",
61
+ "mar": "Marathi",
62
+ "mkd": "Macedonian",
63
+ "mlt": "Maltese",
64
+ "mni": "Meitei",
65
+ "mya": "Burmese",
66
+ "nld": "Dutch",
67
+ "nno": "Norwegian Nynorsk",
68
+ "nob": "Norwegian Bokm\u00e5l",
69
+ "npi": "Nepali",
70
+ "nya": "Nyanja",
71
+ "oci": "Occitan",
72
+ "ory": "Odia",
73
+ "pan": "Punjabi",
74
+ "pbt": "Southern Pashto",
75
+ "pes": "Western Persian",
76
+ "pol": "Polish",
77
+ "por": "Portuguese",
78
+ "ron": "Romanian",
79
+ "rus": "Russian",
80
+ "slk": "Slovak",
81
+ "slv": "Slovenian",
82
+ "sna": "Shona",
83
+ "snd": "Sindhi",
84
+ "som": "Somali",
85
+ "spa": "Spanish",
86
+ "srp": "Serbian",
87
+ "swe": "Swedish",
88
+ "swh": "Swahili",
89
+ "tam": "Tamil",
90
+ "tel": "Telugu",
91
+ "tgk": "Tajik",
92
+ "tgl": "Tagalog",
93
+ "tha": "Thai",
94
+ "tur": "Turkish",
95
+ "ukr": "Ukrainian",
96
+ "urd": "Urdu",
97
+ "uzn": "Northern Uzbek",
98
+ "vie": "Vietnamese",
99
+ "xho": "Xhosa",
100
+ "yor": "Yoruba",
101
+ "yue": "Cantonese",
102
+ "zlm": "Colloquial Malay",
103
+ "zsm": "Standard Malay",
104
+ "zul": "Zulu",
105
+ }
106
+ LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
107
+
108
+ # Source langs: S2ST / S2TT / ASR don't need source lang
109
+ # T2TT / T2ST use this
110
+ text_source_language_codes = [
111
+ "afr",
112
+ "amh",
113
+ "arb",
114
+ "ary",
115
+ "arz",
116
+ "asm",
117
+ "azj",
118
+ "bel",
119
+ "ben",
120
+ "bos",
121
+ "bul",
122
+ "cat",
123
+ "ceb",
124
+ "ces",
125
+ "ckb",
126
+ "cmn",
127
+ "cym",
128
+ "dan",
129
+ "deu",
130
+ "ell",
131
+ "eng",
132
+ "est",
133
+ "eus",
134
+ "fin",
135
+ "fra",
136
+ "gaz",
137
+ "gle",
138
+ "glg",
139
+ "guj",
140
+ "heb",
141
+ "hin",
142
+ "hrv",
143
+ "hun",
144
+ "hye",
145
+ "ibo",
146
+ "ind",
147
+ "isl",
148
+ "ita",
149
+ "jav",
150
+ "jpn",
151
+ "kan",
152
+ "kat",
153
+ "kaz",
154
+ "khk",
155
+ "khm",
156
+ "kir",
157
+ "kor",
158
+ "lao",
159
+ "lit",
160
+ "lug",
161
+ "luo",
162
+ "lvs",
163
+ "mai",
164
+ "mal",
165
+ "mar",
166
+ "mkd",
167
+ "mlt",
168
+ "mni",
169
+ "mya",
170
+ "nld",
171
+ "nno",
172
+ "nob",
173
+ "npi",
174
+ "nya",
175
+ "ory",
176
+ "pan",
177
+ "pbt",
178
+ "pes",
179
+ "pol",
180
+ "por",
181
+ "ron",
182
+ "rus",
183
+ "slk",
184
+ "slv",
185
+ "sna",
186
+ "snd",
187
+ "som",
188
+ "spa",
189
+ "srp",
190
+ "swe",
191
+ "swh",
192
+ "tam",
193
+ "tel",
194
+ "tgk",
195
+ "tgl",
196
+ "tha",
197
+ "tur",
198
+ "ukr",
199
+ "urd",
200
+ "uzn",
201
+ "vie",
202
+ "yor",
203
+ "yue",
204
+ "zsm",
205
+ "zul",
206
+ ]
207
+ TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
208
+
209
+ # Target langs:
210
+ # S2ST / T2ST
211
+ s2st_target_language_codes = [
212
+ "eng",
213
+ "arb",
214
+ "ben",
215
+ "cat",
216
+ "ces",
217
+ "cmn",
218
+ "cym",
219
+ "dan",
220
+ "deu",
221
+ "est",
222
+ "fin",
223
+ "fra",
224
+ "hin",
225
+ "ind",
226
+ "ita",
227
+ "jpn",
228
+ "kor",
229
+ "mlt",
230
+ "nld",
231
+ "pes",
232
+ "pol",
233
+ "por",
234
+ "ron",
235
+ "rus",
236
+ "slk",
237
+ "spa",
238
+ "swe",
239
+ "swh",
240
+ "tel",
241
+ "tgl",
242
+ "tha",
243
+ "tur",
244
+ "ukr",
245
+ "urd",
246
+ "uzn",
247
+ "vie",
248
+ ]
249
+ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
250
+ T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES
251
+
252
+ # S2TT / T2TT / ASR
253
+ S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
254
+ T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
255
+ ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES