Anonymous committed
Commit 15f5208 · 1 parent: 707f578

more fixes

Files changed (6)
  1. app.py +1 -1
  2. generate_prompt.py +21 -16
  3. tasks/ner.py +138 -37
  4. tasks/nli.py +15 -18
  5. tasks/qa.py +49 -14
  6. tasks/summarization.py +30 -5
app.py CHANGED
@@ -131,6 +131,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     elif task == NER:
         text_example = {
             'tokens': sentence,
+            'ner_tags': ''
         }
     else:
         text_example = {
@@ -138,7 +139,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             'premise': premise
         }
 
-    print(text_example)
     prompt = construct_generic_prompt(task, instruction, text_example, zero_shot, num_examples, selected_language, dataset, config)
 
     return prompt
generate_prompt.py CHANGED
@@ -1,3 +1,4 @@
+import collections
 import csv
 import enum
 import json
@@ -20,7 +21,7 @@ from langchain.prompts import FewShotPromptTemplate, PromptTemplate
 from tqdm import tqdm
 from yaml.loader import SafeLoader
 
-from tasks import qa, summarization, ner, nli
+from tasks import ner, summarization, qa, nli
 
 
 # from models.model_completion import gpt3x_completion, gemini_completion
@@ -47,8 +48,12 @@ def gpt3x_completion(
 ) -> str:
     import os
     import openai
-    os.environ["OPENAI_API_KEY"] = ''
-
+    os.environ["OPENAI_API_KEY"] = '07d805ec4fbd484ebc923a3a41e1773d'
+    OPENAI_API_KEY = '07d805ec4fbd484ebc923a3a41e1773d'
+    openai.api_type = "azure"
+    openai.api_base = 'https://hebsum-itaim-uks.openai.azure.com/'
+    openai.api_version = "2023-03-15-preview"
+    openai.api_key = '07d805ec4fbd484ebc923a3a41e1773d'
 
     def get_entities_chatGPT(final_prompt):
         response = openai.ChatCompletion.create(
@@ -67,7 +72,7 @@ def mixtral_completion(prompt):
     url = "https://api.together.xyz/v1/chat/completions"
 
     # Define your Together API key
-    together_api_key = ""  # Replace with your actual API key
+    together_api_key = "851cfc39f3d7a246a2342259f5f6fbba4721c6002123365fba2254c9c9c424ad"  # Replace with your actual API key
 
     # Define the request payload
     payload = {
@@ -554,6 +559,7 @@ def construct_generic_prompt(task, instruction, test_example, zero_shot, num_exa
         instruction=instruction,
         test_example=test_example,
         zero_shot=zero_shot,
+        dataset=dataset,
         num_examples=num_examples,
         lang=str(selected_language).lower(),
         config=config,
@@ -612,31 +618,30 @@ class Config:
 def recommend_config(task, lang, model_type):
     print(task)
     print(model_type)
+    print(lang)
     language_type = _get_language_type(lang)
-    config = Config()
+    config = Config(lang, lang, lang, lang)
     print(language_type)
     if task == QA:
         if model_type == ModelType.English.value:
-            config.set(prefix='source', context='source', examples='source', output='source')
+            config.set(prefix=lang, context=lang, examples=lang, output=lang)
         else:
-            config.set(prefix='english', context='source', examples='source', output='source')
+            config.set(prefix='English', context=lang, examples=lang, output=lang)
     if task == NER:
         if model_type == ModelType.English.value:
-            config.set(prefix='source', context='source', examples='source', output='source')
+            config.set(prefix=lang, context=lang, examples=lang, output=lang)
         elif language_type == LanguageType.High:
-            config.set(prefix='english', context='source', examples='source', output='source')
+            config.set(prefix='English', context=lang, examples=lang, output=lang)
         else:
-            config.set(prefix='english', context='source', examples='source', output='english')
+            config.set(prefix='English', context=lang, examples=lang, output='English')
     if task == NLI:
         if model_type == ModelType.English.value:
-            config.set(prefix='source', context='source', examples='source', output='source')
+            config.set(prefix=lang, context=lang, examples=lang, output=lang)
         elif language_type == LanguageType.High:
-            print("here")
-            config.set(prefix='english', context='source', examples='english')
+            config.set(prefix='English', context=lang, examples='English')
         else:
-            print("here1")
-            config.set(prefix='english', context='english', examples='english')
+            config.set(prefix='English', context='English', examples='English')
     if task == SUMMARIZATION:
-        config.set(context='english')
+        config.set(context='English')
 
     return config.to_dict()
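For reference, a minimal sketch of the Config object that recommend_config builds above. The real class is defined elsewhere in generate_prompt.py (the task modules also read an "input" key that is not set here), so the field names and defaults below are assumptions inferred from the Config(lang, lang, lang, lang), config.set(...) and config.to_dict() calls in this hunk, not the actual implementation.

# Illustrative sketch only; not part of this commit.
class Config:
    def __init__(self, prefix, context, examples, output):
        self.prefix = prefix      # language of the instruction text
        self.context = context    # language of the test example's context
        self.examples = examples  # language the in-context examples are shown in
        self.output = output      # language the model should answer in

    def set(self, prefix=None, context=None, examples=None, output=None):
        # Overwrite only the fields that were passed explicitly.
        if prefix is not None:
            self.prefix = prefix
        if context is not None:
            self.context = context
        if examples is not None:
            self.examples = examples
        if output is not None:
            self.output = output

    def to_dict(self):
        return {
            "prefix": self.prefix,
            "context": self.context,
            "examples": self.examples,
            "output": self.output,
        }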
tasks/ner.py CHANGED
@@ -1,33 +1,56 @@
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 
+import numpy as np
+from datasets import load_dataset, Dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
 
-LANGUAGE_TO_GOOGLE_TRANSLATE_MARK = {
+
+LANGAUGE_TO_PREFIX = {
+
+    "chinese_simplified": "zh-CN",
+    "french": "fr",
+    "portuguese": "pt",
     "english": "en",
-    "bambara": "bm",
-    "ewe": "ee",
-    "hausa": "ha",
-    "igbo": "ig",
-    "kinyarwanda": "rw",
-    "chichewa": "ny",
-    "twi": "ak",
-    "yoruba": "yo",
-    "slovak": "sk",
-    "serbian": "sr",
-    "swedish": "sv",
+    "arabic": "ar",
+    "hindi": "hi",
+    "indonesian": "id",
+    "amharic": "am",
+    "bengali": "bn",
+    "burmese": "my",
+    "uzbek": "uz",
+    "nepali": "ne",
+    "japanese": "ja",
+    "spanish": "es",
+    "turkish": "tr",
+    "persian": "fa",
+    "azerbaijani": "az",
+    "korean": "ko",
+    "hebrew": "he",
+    "telugu": "te",
+    "german": "de",
+    "greek": "el",
+    "tamil": "ta",
+    "assamese": "as",
     "vietnamese": "vi",
+    "russian": "ru",
+    "romanian": "ro",
+    "malayalam": "ml",
+    "swahili": "sw",
+    "bulgarian": "bg",
+    "thai": "th",
+    "urdu": "ur",
     "italian": "it",
-    "portuguese": "pt",
-    "chinese": "zh",
-    "english": "en",
-    "french": "fr"
-
-
-
-}
-
-LANGAUGE_TO_PREFIX = {
+    "polish": "pl",
+    "dutch": "nl",
+    "swedish": "sv",
+    "danish": "da",
+    "norwegian": "no",
+    "finnish": "fi",
+    "hungarian": "hu",
+    "czech": "cs",
+    "slovak": "sk",
+    "ukrainian": "uk",
     "bambara": "bam",
     "ewe": "ewe",
     "fon": "fon",
@@ -58,7 +81,7 @@ def _translate_instruction(basic_instruction: str, target_language: str) -> str:
     return translator.translate(basic_instruction)
 
 
-def create_instruction(lang: str, expected_output: str):
+def create_instruction(lang: str, instruction_language: str, expected_output: str):
     basic_instruction = f"""You are an NLP assistant whose
     purpose is to perform Named Entity Recognition
    (NER). You will need to give each entity a tag, from the following:
@@ -69,11 +92,90 @@ def create_instruction(lang: str, expected_output: str):
     The entities should be in {expected_output} language"""
 
     return (
-        basic_instruction
+        instruction_language
         if lang == "english"
         else _translate_instruction(basic_instruction, target_language=lang)
     )
 
+
+def load_wikiann_dataset(lang, split, limit):
+    """Loads the xlsum dataset"""
+    dataset = load_dataset("wikiann", LANGAUGE_TO_PREFIX[lang])[split]
+    return dataset.select(np.arange(limit))
+
+
+def _translate_example(
+    example: Dict[str, str], src_language: str, target_language: str
+):
+    translator = EasyGoogleTranslate(
+        source_language=LANGAUGE_TO_PREFIX[src_language],
+        target_language=LANGAUGE_TO_PREFIX[target_language],
+        timeout=30,
+    )
+
+    return {
+        "tokens": translator.translate(str(example["tokens"])),
+        "ner_tags": translator.translate(str(example["ner_tags"]))
+    }
+
+
+def choose_few_shot_examples(
+    train_dataset: Dataset,
+    few_shot_size: int,
+    context: List[str],
+    selection_criteria: str,
+    lang: str,
+) -> List[Dict[str, Union[str, int]]]:
+    """Selects few-shot examples from training datasets
+
+    Args:
+        train_dataset (Dataset): Training Dataset
+        few_shot_size (int): Number of few-shot examples
+        selection_criteria (few_shot_selection): How to select few-shot examples. Choices: [random, first_k]
+
+    Returns:
+        List[Dict[str, Union[str, int]]]: Selected examples
+    """
+    selected_examples = []
+
+    example_idxs = []
+    if selection_criteria == "first_k":
+        example_idxs = list(range(few_shot_size))
+    elif selection_criteria == "random":
+        example_idxs = (
+            np.random.choice(len(train_dataset), size=few_shot_size, replace=True)
+            .astype(int)
+            .tolist()
+        )
+
+    ic_examples = [train_dataset[idx] for idx in example_idxs]
+
+    ic_examples = [
+        {
+            "tokens": ' '.join(example['tokens']),
+            "ner_tags": example['spans']
+        }
+        for example in ic_examples
+    ]
+
+    for idx, ic_language in enumerate(context):
+        (
+            selected_examples.append(ic_examples[idx])
+            if ic_language == lang
+            else (
+                selected_examples.append(
+                    _translate_example(
+                        example=ic_examples[idx],
+                        src_language=lang,
+                        target_language=ic_language,
+                    )
+                )
+            )
+        )
+
+    return selected_examples
+
+
 def construct_prompt(
     instruction: str,
     test_example: dict,
@@ -84,20 +186,21 @@ def construct_prompt(
     config: Dict[str, str],
 ):
     if not instruction:
-        print(lang)
-        instruction = create_instruction(lang, config['prefix'])
+        instruction = create_instruction(lang, config['prefix'], config['output'])
 
     example_prompt = PromptTemplate(
-        input_variables=["summary", "text"], template="Text: {text}\nSummary: {summary}"
+        input_variables=["tokens", "ner_tags"],
+        template="Sentence: {tokens}\nNer Tags: {ner_tags}",
     )
 
-    zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
+    zero_shot_template = f"""{instruction}""" + "\n Sentence: {text} " ""
+
+    try:
+        test_data = load_wikiann_dataset(lang=lang, split="test", limit=500)
+    except Exception as e:
+        raise KeyError(f"{lang} is not supported in 'wikiAnn' dataset, choose supported language in few-shot")
 
-    test_data = load_xlsum_data(lang=lang, split="test", limit=100)
 
-    print(test_data)
-    print(num_examples)
-    print(lang)
     ic_examples = []
     if not zero_shot:
 
@@ -121,12 +224,10 @@ def construct_prompt(
         else PromptTemplate(input_variables=["text"], template=zero_shot_template)
     )
 
-    print("lang", lang)
-    print(config["input"] , lang)
     if config["input"] != lang:
         test_example = _translate_example(
             example=test_example, src_language=lang, target_language=config["input"]
         )
 
-    print("test_example", prompt)
-    return prompt.format(text=test_example["text"])
+    print(test_example)
+    return prompt.format(text=test_example["tokens"])
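As a usage illustration, a short sketch of how the new WikiANN helpers above fit together. The language names and sizes are assumptions chosen for demonstration; both functions need the datasets, numpy and easygoogletranslate packages plus network access.

# Illustrative sketch only; not part of this commit.
from tasks.ner import load_wikiann_dataset, choose_few_shot_examples

# Pull a pool of test rows for the source language; the language name must be
# a key of LANGAUGE_TO_PREFIX and a valid WikiANN config.
pool = load_wikiann_dataset(lang="swahili", split="test", limit=500)

# Pick two in-context examples. The `context` list gives the language each
# example should appear in, so a non-source entry is machine-translated by
# _translate_example().
examples = choose_few_shot_examples(
    train_dataset=pool,
    few_shot_size=2,
    context=["swahili", "english"],
    selection_criteria="random",
    lang="swahili",
)
for ex in examples:
    print(ex["tokens"], "->", ex["ner_tags"])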
tasks/nli.py CHANGED
@@ -432,15 +432,14 @@ def process_test_example(
 
 
 def construct_prompt(
-    instruction: str,
-    test_example: dict,
-    zero_shot: bool,
-    num_examples: int,
-    lang: str,
-    config: Dict[str, str],
-    dataset_name: str = 'xnli'
+    instruction: str,
+    test_example: dict,
+    zero_shot: bool,
+    num_examples: int,
+    lang: str,
+    config: Dict[str, str],
+    dataset_name: str = 'xnli'
 ):
-
     if not instruction:
         print(lang)
         instruction = create_instruction(lang)
@@ -451,17 +450,16 @@ def construct_prompt(
     )
 
     zero_shot_template = (
-        f"""{instruction}""" + "\n Hypothesis: {hypothesis} + \n Premise: {premise}" ""
+        f"""{instruction}""" + "\n Hypothesis: {hypothesis} + \n Premise: {premise}" ""
     )
+    if not zero_shot:
+        try:
+            test_data = load_xnli_dataset(dataset_name, lang, split="test", limit=100)
+        except KeyError as e:
+            raise KeyError(f"{lang} is not supported in {dataset_name} dataset, choose supported language in few-shot")
 
-    test_data = load_xnli_dataset(dataset_name, lang, split="test", limit=100)
-
-    print(test_data)
-    print(num_examples)
-    print(lang)
     ic_examples = []
     if not zero_shot:
-
         ic_examples = choose_few_shot_examples(
             train_dataset=test_data,
             few_shot_size=num_examples,
@@ -485,12 +483,11 @@ def construct_prompt(
     )
 
     print("lang", lang)
-    print(config["input"] , lang)
+    print(config["input"], lang)
     if config["input"] != lang:
         test_example = _translate_example(
             example=test_example, src_language=lang, target_language=config["input"]
         )
 
     return prompt.format(
-        hypothesis=test_example["hypothesis"], premise=test_example["premise"]
-    )
+        hypothesis=test_example["hypothesis"], premise=test_example["premise"])
tasks/qa.py CHANGED
@@ -29,7 +29,7 @@ from yaml.loader import SafeLoader
 
 def gemini_completion(prompt):
     # Define the endpoint URL
-    genai.configure(api_key="")
+    genai.configure(api_key="AIzaSyCSvECR2K_ca3QcMBcCHbxMzBpZe3y82iI")
     model = genai.GenerativeModel("models/gemini-1.0-pro-latest")
     return model.generate_content(prompt).text
 
@@ -41,6 +41,14 @@ def gemini_completion(prompt):
 # model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
 # model.to("cuda:04")
 
+os.environ["OPENAI_API_KEY"] = (
+    "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
+)
+OPENAI_API_KEY = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
+openai.api_key = "sk-proj-YeuUCE17wxVRRjD61Rn8T3BlbkFJr337RfppJB8fadACBXwG"
+
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
 
 def get_entities_gpt3_long(prompt):
     response = openai.ChatCompletion.create(
@@ -58,8 +66,12 @@ def gpt3x_completion(
 ) -> str:
     import os
     import openai
-    os.environ["OPENAI_API_KEY"] = ''
+    os.environ["OPENAI_API_KEY"] = '07d805ec4fbd484ebc923a3a41e1773d'
+    OPENAI_API_KEY = '07d805ec4fbd484ebc923a3a41e1773d'
     openai.api_type = "azure"
+    openai.api_base = 'https://hebsum-itaim-uks.openai.azure.com/'
+    openai.api_version = "2023-03-15-preview"
+    openai.api_key = '07d805ec4fbd484ebc923a3a41e1773d'
 
     def get_entities_chatGPT(final_prompt):
         response = openai.ChatCompletion.create(
@@ -84,7 +96,7 @@ def mixtral_completion(prompt):
     url = "https://api.together.xyz/v1/chat/completions"
 
     # Define your Together API key
-    together_api_key = ""  # Replace with your actual API key
+    together_api_key = "851cfc39f3d7a246a2342259f5f6fbba4721c6002123365fba2254c9c9c424ad"  # Replace with your actual API key
 
     # Define the request payload
     payload = {
@@ -177,21 +189,43 @@ LANGUAGE_TO_SUFFIX = {
     "indonesian": "id",
     "amharic": "am",
     "bengali": "bn",
-    "telugu": "te",
     "burmese": "my",
+    "uzbek": "uz",
+    "nepali": "ne",
+    "japanese": "ja",
+    "spanish": "es",
+    "turkish": "tr",
+    "persian": "fa",
+    "azerbaijani": "az",
+    "korean": "ko",
+    "hebrew": "he",
+    "telugu": "te",
     "german": "de",
     "greek": "el",
     "tamil": "ta",
     "assamese": "as",
-    "hindi": "hi",
     "vietnamese": "vi",
     "russian": "ru",
-    "telugu": "te",
     "romanian": "ro",
     "malayalam": "ml",
-    "persian": "fa",
+    "swahili": "sw",
+    "bulgarian": "bg",
+    "thai": "th",
+    "urdu": "ur",
+    "italian": "it",
+    "polish": "pl",
+    "dutch": "nl",
+    "swedish": "sv",
+    "danish": "da",
+    "norwegian": "no",
+    "finnish": "fi",
+    "hungarian": "hu",
+    "czech": "cs",
+    "slovak": "sk",
+    "ukrainian": "uk"
 }
 
+
 PARAMS = NewType("PARAMS", Dict[str, Any])
 
 
@@ -337,7 +371,7 @@ def _translate_prediction_to_output_language(
     return translator.translate(prediction)
 
 
-def create_instruction(lang: str, expected_output: str):
+def create_instruction(lang: str, instruction_language: str, expected_output):
     basic_instruction = (
         "Answer to the <Question> below, based only to the given <Context>, Follow these instructions: \n "
         "1. The answer should include only words from the given context \n "
@@ -347,7 +381,7 @@ def create_instruction(lang: str, expected_output: str):
     )
     return (
         basic_instruction
-        if expected_output == "english"
+        if instruction_language == "english"
         else _translate_instruction(basic_instruction, target_language=lang)
     )
 
@@ -714,7 +748,7 @@ def construct_prompt(
     dataset_name: str = 'xquad'
 ):
     if not instruction:
-        instruction = create_instruction(lang, config['prefix'])
+        instruction = create_instruction(lang, config['prefix'], config['output'])
 
     example_prompt = PromptTemplate(
         input_variables=["context", "question", "answers"],
@@ -724,12 +758,13 @@ def construct_prompt(
     zero_shot_template = (
         f"""{instruction}""" + " \n <Context>: {context} \n <Question>: {question} " ""
     )
+    if not zero_shot:
+        try:
+            test_data = load_qa_dataset(dataset_name = dataset_name, lang=lang, split="test", limit=100)
+        except Exception as e:
+            raise KeyError(f"{lang} is not supported in {dataset_name}")
 
-    test_data = load_qa_dataset(dataset_name = dataset_name, lang=lang, split="test", limit=100)
 
-    print(test_data)
-    print(num_examples)
-    print(lang)
     ic_examples = []
     if not zero_shot:
 
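A quick note on the create_instruction change above: the instruction language is now passed in explicitly (the caller feeds config['prefix']), and translation is keyed off that argument rather than the expected output. A hedged sketch of the new call shape follows; the language values are assumptions.

# Illustrative sketch only; not part of this commit.
from tasks.qa import create_instruction

# With an English instruction language the base English prompt is returned
# unchanged; any other value makes the function translate it into `lang`.
instruction = create_instruction(
    lang="telugu",
    instruction_language="english",
    expected_output="telugu",
)
print(instruction)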
tasks/summarization.py CHANGED
@@ -1,9 +1,10 @@
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Union
+
 import numpy as np
 from datasets import Dataset, load_dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
-from iso639 import to_iso639_1
+
 LANGUAGE_TO_SUFFIX = {
     "chinese_simplified": "zh-CN",
     "french": "fr",
@@ -24,6 +25,30 @@ LANGUAGE_TO_SUFFIX = {
     "azerbaijani": "az",
     "korean": "ko",
     "hebrew": "he",
+    "telugu": "te",
+    "german": "de",
+    "greek": "el",
+    "tamil": "ta",
+    "assamese": "as",
+    "vietnamese": "vi",
+    "russian": "ru",
+    "romanian": "ro",
+    "malayalam": "ml",
+    "swahili": "sw",
+    "bulgarian": "bg",
+    "thai": "th",
+    "urdu": "ur",
+    "italian": "it",
+    "polish": "pl",
+    "dutch": "nl",
+    "swedish": "sv",
+    "danish": "da",
+    "norwegian": "no",
+    "finnish": "fi",
+    "hungarian": "hu",
+    "czech": "cs",
+    "slovak": "sk",
+    "ukrainian": "uk"
 }
 
 
@@ -56,15 +81,15 @@ def choose_few_shot_examples(
 def _translate_instruction(basic_instruction: str, target_language: str) -> str:
     translator = EasyGoogleTranslate(
         source_language="en",
-        target_language=to_iso639_1(target_language),
+        target_language=LANGUAGE_TO_SUFFIX[target_language],
         timeout=50,
     )
     return translator.translate(basic_instruction)
 
 
 def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
-    translator = EasyGoogleTranslate(source_language=to_iso639_1(str(src_language).capitalize()),
-                                     target_language=to_iso639_1(str(target_language).capitalize()),
+    translator = EasyGoogleTranslate(source_language=LANGUAGE_TO_SUFFIX[src_language],
+                                     target_language=LANGUAGE_TO_SUFFIX[target_language],
                                      timeout=30)
     try:
         return {'text': translator.translate(example['text']), 'summary': ''}
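Finally, a small usage sketch for the summarization translation helper now that it is keyed by LANGUAGE_TO_SUFFIX instead of iso639. The sample text and language pair are assumptions; easygoogletranslate and network access are required.

# Illustrative sketch only; not part of this commit.
from tasks.summarization import _translate_example

sample = {"text": "Le gouvernement a annoncé un nouveau budget aujourd'hui.", "summary": ""}

# Both language names must be keys of LANGUAGE_TO_SUFFIX; the helper returns
# the summary field empty.
translated = _translate_example(sample, src_language="french", target_language="hebrew")
print(translated["text"])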