bhavanishankarpullela committed on
Commit
2dc7757
·
verified ·
1 Parent(s): b817ab5

Upload 9 files

Browse files
ST/inference/codes/alignment.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import itertools
3
+ from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
4
+ import spacy
5
+ import random
6
+ import os
7
+ import csv
8
+ os.environ["CUDA_VISIBLE_DEVICES"] = "7" # SET the GPUs you want to use
9
+
10
+
11
class TextAligner:
    """Word aligner based on contextual subword embeddings.

    Both sentences are split on whitespace, tokenised word by word, embedded
    with the same encoder, and two subwords are considered aligned when the
    softmax of their dot-product similarity exceeds ``threshold`` in both the
    source->target and the target->source direction.

    Args:
        model_name: Hugging Face checkpoint used for both model and tokenizer.
        align_layer: Index into the model's hidden-state tuple whose
            representations are compared.
        threshold: Minimum softmax probability required in both directions.
    """

    def __init__(self, model_name="aneuraz/awesome-align-with-co", align_layer=8, threshold=1e-3):
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.align_layer = align_layer
        self.threshold = threshold

    def _encode(self, words):
        """Return ``(input_ids, sub2word)`` for a list of whitespace words.

        ``sub2word[k]`` is the index of the word that produced subword ``k``.
        """
        pieces = [self.tokenizer.tokenize(word) for word in words]
        piece_ids = [self.tokenizer.convert_tokens_to_ids(piece) for piece in pieces]
        input_ids = self.tokenizer.prepare_for_model(
            list(itertools.chain.from_iterable(piece_ids)),
            return_tensors='pt',
            model_max_length=self.tokenizer.model_max_length,
            truncation=True,
        )['input_ids']
        sub2word = [index for index, piece in enumerate(pieces) for _ in piece]
        return input_ids, sub2word

    def _embed(self, input_ids):
        """Return the ``align_layer`` hidden states without the first and last position.

        The sliced-off positions are presumably the special tokens added by
        ``prepare_for_model`` -- TODO confirm for non-BERT tokenizers.
        """
        outputs = self.model(input_ids.unsqueeze(0), output_hidden_states=True)
        return outputs[2][self.align_layer][0, 1:-1]

    def align_texts(self, original, translated):
        """Align the whitespace-separated words of two sentences.

        Args:
            original: Source-language sentence.
            translated: Target-language sentence.

        Returns:
            A set of ``(source_word, target_word)`` string pairs.
        """
        sent_src = original.strip().split()
        sent_tgt = translated.strip().split()
        ids_src, sub2word_map_src = self._encode(sent_src)
        ids_tgt, sub2word_map_tgt = self._encode(sent_tgt)

        self.model.eval()
        with torch.no_grad():
            out_src = self._embed(ids_src)
            out_tgt = self._embed(ids_tgt)

            dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
            softmax_srctgt = torch.softmax(dot_prod, dim=-1)
            softmax_tgtsrc = torch.softmax(dot_prod, dim=-2)
            # Keep only links that are confident in both directions.
            softmax_inter = (softmax_srctgt > self.threshold) & (softmax_tgtsrc > self.threshold)

        # .tolist() yields plain ints, so the Python lists are indexed with
        # ints rather than 0-d tensors.
        align_subwords = torch.nonzero(softmax_inter, as_tuple=False).tolist()
        return {
            (sent_src[sub2word_map_src[i]], sent_tgt[sub2word_map_tgt[j]])
            for i, j in align_subwords
        }
43
+
44
+ # Load the NLLB model for translation
45
+ #tel_Telu
46
+ #hin_Deva
47
+ #mar_Deva
48
+ #ben_Beng
49
+ #vie_Latn
50
+ #ces_Latn
51
+ # tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B", src_lang="hin_Deva")
52
+ # model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
53
+
54
+ # def get_translation(word):
55
+ # """Fetch the English translation for a given Telugu word using the NLLB model."""
56
+ # inputs = tokenizer(word, return_tensors="pt")
57
+ # translated_tokens = model.generate(
58
+ # **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=100
59
+ # )
60
+ # english_phrase = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
61
+ # return english_phrase
62
+
63
class CodeSwitcher(TextAligner):
    """Build code-switched sentences by swapping aligned content words into a source sentence."""

    # English references used when the caller does not supply its own; they
    # are paired positionally with ``source_sentences``.
    DEFAULT_ENGLISH_TRANSLATIONS = [
        'Uhh tell me, is this not a service?',
        'But getting stuck here would not help.',
        'Bajpai was also included among these economists.',
        'I thought, um...should I go see a movie this evening?',
        'He...means, he bought his new car.',
    ]

    # Values compared against spaCy's ``token.tag_``.
    CONTENT_WORD_TAGS = frozenset([
        "NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS",
        "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
        "RB", "RBR", "RBS", "NOUN", "VERB", "ADJ", "ADV",
    ])

    def __init__(self):
        super().__init__()
        self.nlp = spacy.load("en_core_web_sm")

    def switch_content_words(self, source_sentences, ratio=0.5, english_translations=None):
        """Replace source words with their aligned English content words.

        Args:
            source_sentences: Source-language sentences.
            ratio: Fraction of each English sentence's content words that are
                eligible for switching (sampled at random).
            english_translations: English sentence for each source sentence,
                paired positionally. Defaults to
                ``DEFAULT_ENGLISH_TRANSLATIONS``. Pairing stops at the shorter
                of the two sequences.

        Returns:
            One mixed sentence (str) per processed sentence pair.
        """
        import string  # local import: only needed for punctuation stripping

        if english_translations is None:
            english_translations = self.DEFAULT_ENGLISH_TRANSLATIONS

        mixed_sentences = []
        for source, english in zip(source_sentences, english_translations):
            # Sorting makes the "last pair wins" collapse to one target per
            # source word deterministic instead of depending on set order.
            aligned_dict = dict(sorted(self.align_texts(source, english)))

            doc = self.nlp(english)
            content_words = [token.text for token in doc if token.tag_ in self.CONTENT_WORD_TAGS]

            num_to_switch = int(len(content_words) * ratio)
            words_to_switch = set(
                random.sample(content_words, min(num_to_switch, len(content_words)))
            )

            new_sentence = []
            for word in source.split():
                aligned_english_word = aligned_dict.get(word)
                # Aligned words come from a whitespace split ("service?"),
                # while content words are spaCy tokens ("service"); strip the
                # attached punctuation so the two can actually match.
                cleaned = aligned_english_word.strip(string.punctuation) if aligned_english_word else ""
                new_sentence.append(cleaned if cleaned in words_to_switch else word)

            mixed_sentences.append(' '.join(new_sentence))

        return mixed_sentences
96
+
97
# Usage: code-switch the sample Hindi sentences at three switching ratios.
switcher = CodeSwitcher()
# The fourth sentence previously contained U+FFFD replacement characters
# ("मैंने ??ोचा"); it is restored to "सोचा".
hindi_sentences = ['अ मुझे बताइए ये सेवा नहीं है क्या?', 'लेकिन यहां पर पर अटकने से काम नहीं होगा।', 'बाजपेयी भी इन अर अर्थशास्त्रियों में शामिल थे।', 'मैंने सोचा कि, उम्म...क्या मैं आज शाम को फिल्म देखने जाऊँ?', 'उसके...मतलब, उसने अपनी नई कार खरीदी है।']
# french_sentences = ["Je veux que tu envoies la photo photo d' écran à Mireille avec Lucie en cc.", "Envoie un mail à euh jena@polonium.com.", "envoie une un message à Alice.", "Peux-tu euuh envoyer cet SMS sur le chien de maman?", "Dis à Karen par euh SMS que j'arrive en en joignant mon heure d'arrivée.", "Mets en pièce jointe mes coordonnées GPS au courriel pour euh Lucie.", "écris une un mail à Alice.", "Merci d'écrire un email à Pierre."]
for position, ratio in enumerate((0.6, 0.8, 1.0)):
    if position:
        # Separator between consecutive runs only.
        print("-----------------")
    print(switcher.switch_content_words(hindi_sentences, ratio))
ST/inference/codes/bleu_significance.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ # from evaluate import load
3
+ from scipy import stats
4
+ from nltk.translate.bleu_score import sentence_bleu
5
+ import string
6
+
7
+ # Load data from the CSV files
8
+ df1 = pd.read_csv('MT0_xxl_ape/result_mr')
9
+ df2 = pd.read_csv('MT0_xxl_ape/result_mr_50p')
10
+ df_reference = pd.read_csv('MT0_xxl_ape/result_mr')
11
+
12
+ # bleu = load("sacrebleu")
13
+
14
+ sentences1 = df1['pred_label']
15
+ sentences2 = df2['pred_label']
16
+ reference_sentences = df_reference['ref']
17
+
18
def process_sentence(sentence):
    """Normalise a prediction or reference string before scoring.

    Keeps only the first line, lower-cases it, deletes every character in
    ``string.punctuation`` and removes one trailing danda ("।").

    Args:
        sentence: Raw value; anything that is not a ``str`` yields "".

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0].strip().lower()
    # One pass over the string instead of one .replace() per punctuation mark.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).strip()

    if sentence.endswith('।'):
        # Strip again: "... है ।" would otherwise keep the space that preceded
        # the danda and then fail to match a reference without it.
        sentence = sentence[:-1].rstrip()

    return sentence
41
+
42
+ # Calculate BLEU scores
43
def calculate_bleu(sentences, reference):
    """Return one sentence-level BLEU score per hypothesis.

    ``sentence_bleu`` works on token sequences; passing raw strings makes it
    treat every character as a token, so both sides are split on whitespace.

    Args:
        sentences: Hypothesis strings.
        reference: Reference strings, indexed in parallel with ``sentences``.

    Returns:
        A list of BLEU scores, one per entry of ``sentences``.
    """
    return [
        sentence_bleu([reference[i].split()], hypothesis.split())
        for i, hypothesis in enumerate(sentences)
    ]
45
+
46
+ sentences1 = [process_sentence(s) for s in list(sentences1)]
47
+ sentences2 = [process_sentence(s) for s in list(sentences2)]
48
+ reference_sentences = [process_sentence(s) for s in list(reference_sentences)]
49
+
50
+ bleu_scores1 = calculate_bleu(sentences1, reference_sentences)
51
+ bleu_scores2 = calculate_bleu(sentences2, reference_sentences)
52
+
53
+ # Check for normality
54
def check_normality(data, alpha=0.05):
    """Report whether the Shapiro-Wilk test fails to reject normality.

    Args:
        data: Sample to test.
        alpha: Significance level; normality is accepted when ``p > alpha``.

    Returns:
        True if the Shapiro-Wilk p-value is greater than ``alpha``, else False.
    """
    _, p = stats.shapiro(data)
    # bool() keeps the return type a plain Python bool.
    return bool(p > alpha)
60
+
61
+ is_normal1 = check_normality(bleu_scores1)
62
+ is_normal2 = check_normality(bleu_scores2)
63
+
64
+ # Check for equal variances
65
def check_variance(data1, data2, alpha=0.05):
    """Report whether Levene's test fails to reject equal variances.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Significance level; equal variance is accepted when ``p > alpha``.

    Returns:
        True if the Levene p-value is greater than ``alpha``, else False.
    """
    _, p = stats.levene(data1, data2)
    # bool() keeps the return type a plain Python bool.
    return bool(p > alpha)
71
+
72
+ is_equal_var = check_variance(bleu_scores1, bleu_scores2)
73
+
74
+ # Decide and perform the significance test
75
def perform_significance_test():
    """Pick and run a two-sample test on the module-level BLEU score lists.

    Uses the module-level flags ``is_normal1``, ``is_normal2`` and
    ``is_equal_var`` to choose between Student's t-test, Welch's t-test and
    the Mann-Whitney U test.

    Returns:
        A ``(test_name, p_value)`` tuple.
    """
    # NOTE(review): all three tests treat the two score lists as independent
    # samples; if both lists score the same sentences, a paired test may be
    # more appropriate -- confirm with the experiment design.
    if not (is_normal1 and is_normal2):
        _, p = stats.mannwhitneyu(bleu_scores1, bleu_scores2)
        return "Mann-Whitney U test", p

    if is_equal_var:
        _, p = stats.ttest_ind(bleu_scores1, bleu_scores2)
        return "T-test", p

    _, p = stats.ttest_ind(bleu_scores1, bleu_scores2, equal_var=False)
    return "Welch's T-test", p
86
+
87
+ test_name, p_value = perform_significance_test()
88
+
89
+ # Output results
90
+ print(f"Test used: {test_name}")
91
+ print(f"P-value: {p_value}")
92
+ if p_value < 0.05:
93
+ print("The difference in BLEU scores is statistically significant.")
94
+ else:
95
+ print("The difference in BLEU scores is not statistically significant.")
96
+
ST/inference/codes/dictionary_creation.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import bz2
3
+ import xml.etree.ElementTree as ET
4
+ import os
5
+ import pickle
6
+ from tqdm import tqdm
7
+ import mwparserfromhell
8
+
9
# Step 1: Download the latest dump
DUMP_URL = "https://dumps.wikimedia.org/tewiktionary/latest/tewiktionary-latest-pages-articles.xml.bz2"
dump_file = "tewiktionary-latest-pages-articles.xml.bz2"
xml_file = dump_file[:-4]  # same name without the ".bz2" suffix

print("Downloading the latest dump...")
response = requests.get(DUMP_URL, stream=True)
# Fail early instead of writing an HTML error page to disk as if it were the dump.
response.raise_for_status()
total_size = int(response.headers.get('content-length', 0))
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

with open(dump_file, 'wb') as file:
    for chunk in response.iter_content(chunk_size=8192):
        progress_bar.update(len(chunk))
        file.write(chunk)
progress_bar.close()

# Step 2: Extract the dump
print("\nExtracting the dump...")
with bz2.open(dump_file, 'rb') as source, open(xml_file, 'wb') as dest:
    for line in source:
        dest.write(line)

# Step 3: Parse the XML dump and extract translations
print("Parsing the XML dump to extract translations...")
tree = ET.parse(xml_file)
root = tree.getroot()

# Read the export namespace from the root tag ("{uri}mediawiki") rather than
# hard-coding one schema version, so other export versions still match.
ns_uri = root.tag[1:].split('}', 1)[0] if root.tag.startswith('{') else ''


def _qualified(tag):
    """Return *tag* qualified with the dump's XML namespace, if it has one."""
    return f"{{{ns_uri}}}{tag}" if ns_uri else tag


translations = {}

for page in root.findall(_qualified('page')):
    title = page.findtext(_qualified('title'))
    # findtext returns None when the element is missing. Do not test Elements
    # for truthiness: an Element without children is falsy, which made the
    # previous `if text_data and text_data.text` skip every page.
    page_text = page.findtext(f"{_qualified('revision')}/{_qualified('text')}")
    if not page_text:
        continue

    # Parse the wikitext and keep the targets of links of the form [[en:...]]
    english_translations = []
    for link in mwparserfromhell.parse(page_text).filter_wikilinks():
        target = str(link.title)
        if target.startswith("en:"):
            english_translations.append(target.split(':')[1])
    if english_translations:
        translations[title] = english_translations

# Display the first 1000 translations
print("\nDisplaying the first 1000 translations:")
for i, (telugu_word, english_words) in enumerate(translations.items()):
    if i >= 1000:
        break
    print(f"Telugu Word: {telugu_word}, English Translations: {', '.join(english_words)}")

# Save the translations to a pickle file
print("\nSaving translations to pickle file...")
pickle_filename = "telugu_english_translations.pkl"
with open(pickle_filename, 'wb') as file:
    pickle.dump(translations, file)

print(f"Translations saved to {pickle_filename}")

# Optional: Remove the downloaded files if you want
# os.remove(dump_file)
# os.remove(dump_file[:-4])
ST/inference/codes/evaluate_exactmatch.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from evaluate import load
2
+ import pandas as pd
3
+ import string
4
+
5
+ # import os
6
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "3"
7
+
8
+ exact_match_metric = load("exact_match")
9
+ bleu = load("sacrebleu")
10
+ # meteor = load('meteor')
11
+ # comet = load('comet')
12
+ # bertscore = load('bertscore')
13
+
14
+ # import torch
15
+
16
+ # # Check if CUDA (GPU) is available
17
+ # if torch.cuda.is_available():
18
+ # device = torch.device('cuda')
19
+ # print("Using GPU:", torch.cuda.get_device_name(0))
20
+ # else:
21
+ # device = torch.device('cpu')
22
+ # print("Using CPU")
23
+
24
+
25
+ # # Optimize for Tensor Cores if available
26
+ # if 'A100' in torch.cuda.get_device_name(0):
27
+ # # Set the precision for matrix multiplications
28
+ # # Choose 'medium' for a balance between performance and precision
29
+ # # Or 'high' if you need higher precision
30
+ # torch.set_float32_matmul_precision('medium')
31
+
32
+
33
+
34
+ df = pd.read_csv("MT0_xxl_results/result_m_eng_l")
35
+ reference = list(df.label)
36
+ predicted = list(df.pred_label)
37
+ # source = list(df.disfluent)
38
+
39
def process_sentence(sentence):
    """Normalise a prediction or reference string before scoring.

    Keeps only the first line, lower-cases it, deletes every character in
    ``string.punctuation`` and removes one trailing danda ("।").

    Args:
        sentence: Raw value; anything that is not a ``str`` yields "".

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0].strip().lower()
    # One pass over the string instead of one .replace() per punctuation mark.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).strip()

    if sentence.endswith('।'):
        # Strip again: "... है ।" would otherwise keep the space that preceded
        # the danda and then fail an exact match against "... है".
        sentence = sentence[:-1].rstrip()

    return sentence
62
+
63
+ reference = [process_sentence(s) for s in list(df.label)]
64
+ # source = [process_sentence(s) for s in list(df.disfluent)]
65
+ predicted = [process_sentence(s) for s in list(df.pred_label)]
66
+
67
+
68
+
69
+
70
+
71
+ results = {}
72
+ results['exact_match'] = exact_match_metric.compute(predictions=predicted, references=reference)
73
+ results['bleu'] = bleu.compute(predictions=predicted, references=reference)
74
+ # results['meteor'] = meteor.compute(predictions=predicted, references=reference)
75
+ # results['comet'] = comet.compute(sources=source, predictions=predicted, references=reference)
76
+ # results['bertscore'] = bertscore.compute(predictions=predicted, references=reference)
77
+
78
+ print(results)
ST/inference/codes/evaluate_sari.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from evaluate import load
2
+ import pandas as pd
3
+ import string
4
+
5
+ # Load SARI metric
6
+ sari = load("sari")
7
+
8
+ # Read the CSV
9
+ df = pd.read_csv("MT0_xxl_results/result_pt_80p")
10
+
11
def process_sentence(sentence):
    """Normalise a source, prediction or reference string before scoring.

    Keeps only the first line, lower-cases it, deletes every character in
    ``string.punctuation`` and removes one trailing danda ("।").

    Args:
        sentence: Raw value; anything that is not a ``str`` yields "".

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0].strip().lower()
    # One pass over the string instead of one .replace() per punctuation mark.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).strip()

    if sentence.endswith('।'):
        # Strip again so "... है ।" does not keep the space before the danda.
        sentence = sentence[:-1].rstrip()

    return sentence
22
+
23
+ # Process predictions
24
+ original = [process_sentence(s) for s in df['original']]
25
+ predicted = [process_sentence(s) for s in df['pred_label']]
26
+
27
+ # Assuming columns "ref1", "ref2", ... "refN" are reference columns
28
+ # Change ["ref1", "ref2", "refN"] to your actual column names
29
+ reference_columns = ["label1", "label2", "label3", "label4"]
30
+ references = []
31
+
32
+ for _, row in df.iterrows():
33
+ current_references = [process_sentence(row[col]) for col in reference_columns]
34
+ references.append(current_references)
35
+
36
+ # Compute SARI score
37
+ results = {}
38
+ results['sari'] = sari.compute(sources=original, predictions=predicted, references=references)
39
+ print(results)
ST/inference/codes/german_synthetic_switching.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import spacy
3
+
4
+ # Load the spaCy model for the source language (Portuguese is active; the alternatives are commented out)
5
+ # nlp = spacy.load('de_core_news_sm')
6
+ # nlp = spacy.load('de_core_news_sm')
7
+ # nlp = spacy.load('tr_core_news_trf') # Turkish
8
+ # nlp = spacy.load('hi_core_news_sm') # Hindi
9
+ # nlp = spacy.load("fr_core_news_sm")
10
+ nlp = spacy.load('pt_core_news_sm')
11
+ # nlp = spacy.load('es_core_news_sm')
12
+
13
def load_german_english_dict(file_path):
    """
    Load a bilingual dictionary from a whitespace-separated text file.

    Each usable line holds a source word followed by its English translation;
    any further tokens on the line are ignored. Blank lines and lines with
    fewer than two tokens are skipped instead of raising ``IndexError``. When
    a source word occurs more than once, the last entry wins.

    Args:
    - file_path (str): Path to the dictionary file (read as UTF-8).

    Returns:
    - dict: Mapping from source word to English word.
    """
    dictionary = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or malformed line
            dictionary[fields[0]] = fields[1]
    return dictionary
26
+
27
def translate_content_words(sentence, dictionary, probability=0.5):
    """
    Randomly replace content words of a sentence with dictionary translations.

    The sentence is lower-cased and tokenised with the module-level ``nlp``
    pipeline. Tokens tagged NOUN, VERB, ADJ or ADV are looked up in
    ``dictionary`` with the given probability; everything else, and any word
    missing from the dictionary, is kept as is.

    Args:
    - sentence (str): Sentence to process.
    - dictionary (dict): Mapping from source word to English word.
    - probability (float): Chance of attempting a translation per content word.

    Returns:
    - str: The resulting tokens joined by single spaces.
    """
    content_pos = ('NOUN', 'VERB', 'ADJ', 'ADV')
    pieces = []

    for tok in nlp(sentence.lower()):
        surface = tok.text
        # The random draw only happens for content words.
        if tok.pos_ in content_pos and random.random() < probability:
            surface = dictionary.get(surface, surface)
        pieces.append(surface)

    return ' '.join(pieces)
55
+
56
+ # Load the dictionary from the file
57
+ german_english_dict = load_german_english_dict('Dictionary/portuguese_english_dict.txt')
58
+
59
+ # Example usage
60
+ sentence = "비교 가능한 유속을 유지할 수있을 때 그 결과가 높습니다."
61
+ print(translate_content_words(sentence, german_english_dict, 0.5))
62
+ print(translate_content_words(sentence, german_english_dict, 0.8))
63
+ print(translate_content_words(sentence, german_english_dict, 1.0))
ST/inference/codes/getTranslationBleu.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ import os
4
+
5
+ from evaluate import load
6
+ import pandas as pd
7
+ import string
8
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2" # SET the GPUs you want to use
9
+
10
+ exact_match_metric = load("exact_match")
11
+ bleu = load("sacrebleu")
12
+
13
+ df = pd.read_csv("Annotations/ep1_transcripts.csv")
14
+ df2 = pd.read_csv("Annotations/ep1_translations.csv")
15
+ input_ = []
16
+ reference = []
17
+
18
+ # Step 3: Iterate through rows of the DataFrame and filter out rows with "contentType" as "overlap"
19
+ for index, row in df.iterrows():
20
+ if row['contentType'] != 'overlap':
21
+ # Append the values to input_ and reference if "contentType" is not "overlap"
22
+ input_.append(row['asr_transcript'])
23
+ reference.append(row['translation'])
24
+
25
+ # Load the NLLB model for translation
26
+ #tel_Telu
27
+ #hin_Deva
28
+ #mar_Deva
29
+ #ben_Beng
30
+ #vie_Latn
31
+ #ces_Latn
32
+ #por_Latn
33
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="tel_Telu", use_safetensors=True)
34
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
35
+
36
def get_translation(word):
    """Translate *word* (a word or a whole sentence) into English.

    Uses the module-level NLLB ``tokenizer`` and ``model``; the source
    language is whatever ``src_lang`` the tokenizer was created with.

    Args:
        word: Text to translate.

    Returns:
        The decoded English translation (first generated sequence).
    """
    inputs = tokenizer(word, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        # Look the language code up as a regular token; the tokenizer's
        # `lang_code_to_id` mapping is not available in recent transformers
        # releases.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=1500,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
45
+
46
+ # df = pd.read_csv("Annotations/ep1.csv")
47
+ # input_ = list(df.transcript)
48
+ # reference = list(df.translation)
49
+ # source = list(df.disfluent)
50
+
51
+ # # Step 1: Identify indices of "NULL" in input_
52
+ # null_indices = [i for i, transcript in enumerate(input_) if transcript.strip().lower() == "null"]
53
+
54
+ # # Step 2: Remove corresponding elements from input_ and reference
55
+ # input_ = [transcript for i, transcript in enumerate(input_) if i not in null_indices]
56
+ # reference = [translation for i, translation in enumerate(reference) if i not in null_indices]
57
+
58
def process_sentence(sentence):
    """Normalise a transcript, translation or reference string.

    Keeps only the first line, lower-cases it, deletes every character in
    ``string.punctuation`` and removes one trailing danda ("।").

    Args:
        sentence: Raw value; anything that is not a ``str`` yields "".

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0].strip().lower()
    # One pass over the string instead of one .replace() per punctuation mark.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).strip()

    if sentence.endswith('।'):
        # Strip again so "... है ।" does not keep the space before the danda.
        sentence = sentence[:-1].rstrip()

    return sentence
81
+
82
+
83
+ processed_input = [process_sentence(s) for s in input_]
84
+ processed_ref = [process_sentence(s) for s in reference]
85
+ translated = []
86
+
87
+ for i in processed_input:
88
+ translated_sentence = get_translation(i)
89
+ print(translated_sentence)
90
+ translated.append(process_sentence(translated_sentence))
91
+
92
+ results = {}
93
+ # results['exact_match'] = exact_match_metric.compute(predictions=predicted, references=reference)
94
+ results['bleu'] = bleu.compute(predictions=translated, references=processed_ref)
95
+ # results['meteor'] = meteor.compute(predictions=predicted, references=reference)
96
+ # results['comet'] = comet.compute(sources=source, predictions=predicted, references=reference)
97
+ # results['bertscore'] = bertscore.compute(predictions=predicted, references=reference)
98
+
99
+ print(results)
100
+ df2['cascaded_pred'] = translated
101
+
102
+
103
+
ST/inference/codes/syntheticCodeSwitching.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import spacy
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+ import os
5
+ import csv
6
+ os.environ["CUDA_VISIBLE_DEVICES"] = "3" # SET the GPUs you want to use
7
+
8
+ # Load the NLLB model for translation
9
+ #tel_Telu
10
+ #hin_Deva
11
+ #mar_Deva
12
+ #ben_Beng
13
+ #vie_Latn
14
+ #ces_Latn
15
+ #por_Latn
16
+ tokenizer = AutoTokenizer.from_pretrained("models/nllb-200-3.3B", src_lang="por_Latn")
17
+ model = AutoModelForSeq2SeqLM.from_pretrained("models/nllb-200-3.3B")
18
+
19
def get_translation(word):
    """Translate *word* (a word or a whole sentence) into English.

    Uses the module-level NLLB ``tokenizer`` and ``model``; the source
    language is whatever ``src_lang`` the tokenizer was created with.

    Args:
        word: Text to translate.

    Returns:
        The decoded English translation (first generated sequence).
    """
    inputs = tokenizer(word, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        # Look the language code up as a regular token; the tokenizer's
        # `lang_code_to_id` mapping is not available in recent transformers
        # releases.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=100,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
28
+
29
def fetch_translation(word, content=1):
    """Translate *word* into English, optionally reduced to one content word.

    Args:
        word: Source-language text to translate.
        content: When truthy, return only the first NOUN/VERB/ADJ/ADV token
            of the translation (or the whole phrase if it has none). When
            falsy, return the whole translated phrase.

    Returns:
        The English word or phrase.
    """
    # Reuse the shared translation helper instead of repeating its body.
    english_phrase = get_translation(word)

    if not content:
        return english_phrase

    # Extract content words from the translated phrase
    content_words = [
        token.text
        for token in nlp_en(english_phrase)
        if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]
    ]

    # Return the first content word, or the whole phrase if none is found
    return content_words[0] if content_words else english_phrase
45
+
46
def get_pos_tag_english(word):
    """Return the spaCy coarse POS tag of the first token of *word*.

    Args:
        word: English word or phrase.

    Returns:
        ``pos_`` of the first token, or "" when spaCy produces no tokens
        (e.g. an empty translation), which previously raised ``IndexError``.
    """
    doc = nlp_en(word)
    return doc[0].pos_ if len(doc) else ""
50
+
51
def code_switch(sentence, ratio=0.5, content=1):
    """Replace words of *sentence* with English translations at random.

    Args:
        sentence: Whitespace-separated source sentence.
        ratio: Probability of switching an eligible word.
        content: When truthy, only words whose translation is tagged
            NOUN/VERB/ADJ/ADV are eligible and a single content word is
            inserted. When falsy, every word is eligible and the full
            translated phrase is inserted.

    Returns:
        The code-switched sentence, words joined by single spaces.
    """
    content_pos = ("NOUN", "VERB", "ADJ", "ADV")
    new_sentence = []

    for word in sentence.split():
        if content:
            # The translation is needed first to decide eligibility.
            english_word = fetch_translation(word, content)
            switch = (
                bool(english_word.strip())  # an empty translation cannot be tagged
                and get_pos_tag_english(english_word) in content_pos
                and random.random() < ratio
            )
        else:
            # Draw first so the model is only called for words that are
            # actually switched; the flag is forwarded so the whole phrase is
            # used (it was previously dropped, forcing content-word mode).
            switch = random.random() < ratio
            english_word = fetch_translation(word, content) if switch else word

        new_sentence.append(english_word if switch else word)

    return ' '.join(new_sentence)
72
+
73
+ # Load spaCy's English model for POS tagging
74
+ nlp_en = spacy.load("en_core_web_sm")
75
+
76
+ # Test
77
+ telugu_sentences = ['నేను ఉహుహు... ప్రపంచ కప్ మ్యాచ్ చూడాలనుకుంటున్నాను','అరేరే నేను నా మొబైల్ ఎక్కడ పెట్టానో మరచిపోయాను!', 'ఆయన బాగా బాగా ఆడుతున్నాడు క్రికెట్.', 'మన ప్రయాణం శుక్రవా శనివారం నాడు కాదండి?', '3:30 కాదు కాదు, 4:30 కి మన ప్రయాణం.', 'నా పుస్తకం నాకు పేపర్ కావాలి.']
78
+ hindi_sentences = ['अ मुझे बताइए ये सेवा नहीं है क्या?', 'लेकिन यहां पर पर अटकने से काम नहीं होगा।', 'बाजपेयी भी इन अर अर्थशास्त्रियों में शामिल थे।', 'मैंने सोचा कि, उम्म...क्या मैं आज शाम को फिल्म देखने जाऊँ?', 'अरे, यह कुत्ता हमारे पास क्यों आ रहा है?', 'क्या हमें कल.. हमें कल चलना चाहिए।', 'उसके...मतलब, उसने अपनी नई कार खरीदी है।']
79
+ marathi_sentences = ['अ मला सांगा, ही सेवा नाही का?', 'माझ्या माझ्या कामाची चर्चा आहे उद्या.', 'या अर अर्थतज्ज्ञांमध्ये वाजपेयींचाही समावेश होता.', 'मी विचार केला कि, अं...मी आज सायंकाळी मी चित्रपट पहायला जाऊ का?', 'अरे हा कुत्रा आमच्याजवळ का येतोय?', 'का आपण उद्या.. आपण उद्या जायला हवं.', 'त्याची...म्हणजे, त्याने त्याची नवीन खरेदी केली आहे.']
80
+ bengali_sentences = ['আহ আমাকে বলুন, এটা কি পরিষেবা নয়?', 'কিন্তু কিন্তু এখানে আটকালে তো কাজ হবেনা।', 'বাজপেয়ী জিও এইসব অর্থ অর্থনীতি তে অন্তর্ভুক্ত ছিলেন।', 'আমি ভাবলাম কি যে, আঃ আজকে সন্ধায় কি আমি সিনেমা দেখতে যাব?', 'আরেহ এই কুকুর টা আমাদের দিকে কেন আসছে?', 'আমিাদের কি কাল, আমাদের কাল যাওয়া উচিত।', 'ওনার, মানে উনি নিজের নতুন গাড়ি কিনেছেন।']
81
+ viet_sentences = ['tôi cần thuê à tôi muốn bay một chuyến khứ hồi từ đà nẵng đến đà lạt.', 'sân bay ừm không hãng hàng không nào có đường bay từ bắc kinh ờ ý tôi là thượng thượng hải đến washington dc mà cần nối chuyến qua các thành phố khác.', 'cho tôi biết tất cả các máy bay à chuyến bay từ huế đến quy nhơn.', 'đà nẵng đến ờ hồ chí minh í lộn đến cà mau.', 'có bao nhiêu ghế à ý tôi là hạng ghế.', 'chuyến bay nào rời buôn ma thuột vào ngày mùng 4 ờ không ngày 5 tháng 7 sau 7 giờ tối và đến cần thơ.']
82
+ czech_sentences = ['Strávily jsme měsíc v hlavním městě Jemenu Sané , kde jsme se zúčastnily kurzu arabštiny.', 'Musíme být úspěšní poprvé sámi.']
83
+
84
+
85
+ #XNLI
86
+ telugu_xnli = ['మా నంబర్‌లో ఒకరు మీ సూచనలను సూక్ష్మంగా అమలు చేస్తారు.', 'నా బృందంలోని సభ్యుడు మీ ఆర్డర్‌లను చాలా ఖచ్చితత్వంతో అమలు చేస్తారు.', 'స్వలింగ సంపర్కులు మరియు లెస్బియన్లు.', 'భిన్న లింగ సంపర్కులు.', 'వేద వైపు తిరిగి నవ్వాడు.', 'తల్లితో కలిసి తన వెనకే మెల్లగా నడుస్తున్న వేదను చూసి నవ్వాడు.', 'నీకు ఎలా తెలుసు ? ఇదంతా మళ్లీ వారి సమాచారం.', 'ఈ సమాచారం వారికే చెందుతుంది.', 'జాతీయ ఉద్యానవనాలు మరియు నిర్జన ప్రాంతాలలో సహజ పరిస్థితులకు తిరిగి రావాలనే కాంగ్రెస్ నిర్దేశించిన లక్ష్యం వైపు రాష్ట్రాలు తమ రాష్ట్ర అమలు ప్రణాళికలలో సహేతుకమైన పురోగతిని చూపాలి.', 'ఏదైనా మెరుగుదల ఉండాల్సిన అవసరం లేదు.', 'ఆమె తిరిగి నవ్వింది.', 'ఆమె నవ్వు ఆపుకోలేక చాలా సంతోషించింది.']
87
+ hindi_xnli = ['हमारा एक नंबर आपके निर्देशों का सूक्ष्मता से पालन करेगा।', 'मेरी टीम का एक सदस्य आपके आदेशों को अत्यंत सटीकता के साथ निष्पादित करेगा।', 'समलैंगिक और लेस्बियन।', 'विषमलैंगिक।', 'वह मुड़ा और वेदा की ओर देखकर मुस्कुराया।', 'वह वेदा को देखकर मुस्कुराया जो अपनी माँ के साथ उसके पीछे धीरे-धीरे चल रही थी।', 'आपको कैसे मालूम ? ये सब उनकी जानकारी है।', 'ये जानकारी उनकी है।', 'राज्यों को राष्ट्रीय उद्यानों और जंगल क्षेत्रों में प्राकृतिक परिस्थितियों में लौटने के कांग्रेस द्वारा निर्धारित लक्ष्य की दिशा में अपनी राज्य कार्यान्वयन योजनाओं में उचित प्रगति दिखानी चाहिए।', 'इसमें कोई सुधार होना जरूरी नहीं है।', 'वह वापस मुस्कुराई।', 'वह इतनी खुश थी कि वह मुस्कुराना बंद नहीं कर पा रही थी।']
88
# First sentence: the garbled (U+FFFD) characters are restored to "अंमलबजावणी", the spelling used later in this list.
marathi_xnli = ['आमचा एक नंबर तुमच्या सूचनांची काटेकोरपणे अंमलबजावणी करेल.', 'माझ्या टीमचा एक सदस्य तुमच्या ऑर्डर्स अत्यंत अचूकतेने अंमलात आणेल.', 'समलिंगी आणि समलैंगिक.', 'भिन्नलिंगी.', 'तो वळून वेदाकडे हसला.', 'तो वेदाकडे बघून हसला जो आईसोबत त्याच्या मागे हळू चालत होता.', 'तुला कसे माहीत ? ही सर्व त्यांची माहिती आहे.', 'ही माहिती त्यांच्या मालकीची आहे.', 'राष्ट्रीय उद्याने आणि वाळवंट भागात नैसर्गिक परिस्थितीत परत येण्याच्या कॉंग्रेसने अनिवार्य केलेल्या उद्दिष्टाच्या दिशेने राज्यांनी त्यांच्या राज्य अंमलबजावणी योजनांमध्ये वाजवी प्रगती दर्शविली पाहिजे.', 'त्यात काही सुधारणा होणे आवश्यक नाही.', 'ती परत हसली.', 'तिला इतका आनंद झाला होता की तिला हसू आवरता आले नाही.']
89
+ bengali_xnli = ['আমাদের নম্বরগুলির মধ্যে একটি আপনার নির্দেশাবলী মিনিটে কার্যকর করবে।', 'আমার দলের একজন সদস্য আপনার আদেশগুলি অত্যন্ত নির্ভুলতার সাথে কার্যকর করবে।', 'সমকামী এবং সমকামীরা।', 'বিষমকামী।', 'সে ঘুরে বেদের দিকে তাকিয়ে হাসল।', 'সে বেদাকে দেখে হাসল যে তার মায়ের সাথে তার পিছনে ধীরে ধীরে হাঁটছিল।', 'তুমি কিভাবে জান ? এসবই তাদের তথ্য।', 'এই তথ্য তাদের অন্তর্গত।', 'জাতীয় উদ্যান এবং মরুভূমি অঞ্চলে প্রাকৃতিক পরিস্থিতিতে ফিরে আসার কংগ্রেসের নির্দেশিত লক্ষ্যের দিকে রাজ্যগুলিকে অবশ্যই তাদের রাষ্ট্রীয় বাস্তবায়ন পরিকল্পনায় যুক্তিসঙ্গত অগ্রগতি দেখাতে হবে।', 'এর জন্য কোনো উন্নতির প্রয়োজন নেই।', 'সে ফিরে হাসল।', 'সে এত খুশি ছিল যে সে হাসি থামাতে পারেনি।']
90
+
91
+ #Sentiment
92
+ telugu_sentiment = ['అణు కార్యక్రమాన్ని పౌర అవసరాలు, సైనిక అవసరాలుగా విడదీసినందున ఆ ఒప్పందంపై సంతకం పెట్టాల్సిన అవసరం లేదని భారత్ వాదన.', 'ప్రజలకు అన్నివిధాలా తోడ్పాటును అందించాలని సూచించారు.', 'అవసరాలకు అనుగుణంగా అనేక చట్టాలను మార్చుకోవడం సాధ్యం కావడం లేదు.', 'అనంతరం ప్రాంతాల వారీగా చేపట్టే కార్యక్రమాలపై చర్చించారు.', 'జోరుగా టీఆరెస్ సభ్యత్వ నమోదు.', 'కారణం ఏమిటో గానీ తెలుగు రాష్ట్రాల్లో ఏ ఒక్కరికీ కేంద్రంలో మంత్రిపదవి దక్కలేదు.']
93
# Fourth sentence: the garbled (U+FFFD) characters are restored to "की".
hindi_sentiment = ['असम में ब्रह्मपुत्र नदी के किनारे स्थित इस पार्क में गैंडे के साथ - साथ हाथी , चीता , बाघ , हिरण , डॉल्फिन , सांभर आदि देखे जा सकते हैं ।', 'इसका 13 मेगा पिक्सल कैमरा जो इस डिवाईस का हिरो है ।', 'कुल मिलाकर जी3 स्टाइलस का परफॉर्मेंस अच्छा नहीं कहा जा सकता ।', 'इसके अन्दर लगी 3120 एमएएच की बैटरी , पूरे डेढ़ दिन तक चलती है ।', 'इसके बेंचमार्क स्कोर्स बहुत ही आशाजनक थे क्योंकि यह स्मार्टफोन प्रतियोगिता को दूर कर देता है ।', 'जिसका नुकसान ये होता है कि अगर आप फिल्म देख रहे है या गेम खेल रहे है तो स्पीकर्स आपके हाथों से ढक जाते हैं ।']
94
+ # marathi_sentiment =
95
+ # bengali_sentiment =
96
+
97
+ hindi_qa = ['पैंथर्स डिफ़ेंस ने कितने अंक दिए?', 'डिवीजनल राउंड में ब्रोंकोस से कौन हारा?', 'वर्तमान में ब्रॉनकोस फ्रैंचाइज़ी में जॉन एलवे की क्या भूमिका है?', 'लेडी गागा ने कितने ग्रैमी जीते हैं?']
98
+ marathi_ape = ['हळूहळू खायला आणि प्यायला मदत होते आणि लहान, वारंवार जेवण होते.', 'कधी कधी खांद्यावरून बाहेर पडणाऱ्या आगीचे चित्रण केले जात नाही.', 'पिंपळाच्या आकाराचे मातीचे शरीर, संपूर्ण शरीरावर लाल कापड चिकटवले जाते.', 'या कालखंडात आणखी एक महत्त्वाची गोष्ट म्हणजे तांत्रिकवादाची वाढ.', 'या कामांच्या माध्यमातून माहिम बेट परेल आणि वरळीशी जोडले गेले होते.']
99
+ portuguese_simple = ['Comportamento semelhante tiveram outros mercados de capitais no mundo.', '- O CPC está abaixo do que queremos, apesar do aumento quando comparado com janeiro.', 'As coisas vão voltar à normalidade.', 'O presidente foi recebido por uma platéia reunida por PT, PC do B e PSB , partidos da base do governo.', '- Havia um Fiat Doblò estacionado em frente a uma panificadora.']
100
+
101
+
102
+ for sentence in portuguese_simple:
103
+ for ratio in [0.3, 0.5, 0.8]:
104
+ print(f"Ratio: {ratio*100}%")
105
+ print(code_switch(sentence, ratio))
106
+ print("-----------------------------")
107
+
108
+ # for sentence in marathi_sentences:
109
+ # translation = get_translation(sentence)
110
+ # print(translation)
ST/inference/codes/wilcoxon.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import string
3
+ from scipy.stats import wilcoxon
4
+ import numpy as np
5
+
6
def process_sentence(sentence):
    """Normalise a prediction or label string before exact-match comparison.

    Keeps only the first line, lower-cases it, deletes every character in
    ``string.punctuation`` and removes one trailing danda ("।").

    Args:
        sentence: Raw value; anything that is not a ``str`` yields "".

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.split('\n')[0].strip().lower()
    # One pass over the string instead of one .replace() per punctuation mark.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).strip()

    if sentence.endswith('।'):
        # Strip again: "... है ।" would otherwise keep the space that preceded
        # the danda and then fail an exact match against "... है".
        sentence = sentence[:-1].rstrip()

    return sentence
22
+
23
def _exact_match_scores(csv_path):
    """Return a 0/1 exact-match score for every row of the CSV at *csv_path*.

    A row scores 1 when its normalised ``pred_label`` equals its normalised
    ``label``.
    """
    # Explicit UTF-8 so the non-ASCII text is decoded the same way regardless
    # of the platform's default encoding.
    with open(csv_path, 'r', encoding='utf-8', newline='') as csvfile:
        return [
            int(process_sentence(row['pred_label']) == process_sentence(row['label']))
            for row in csv.DictReader(csvfile)
        ]


# Read CSV and generate exact match scores for Prompt A
scores_a = _exact_match_scores('MT0_xxl_results/result_vi')

# Read CSV and generate exact match scores for Prompt B
scores_b = _exact_match_scores('MT0_xxl_results/result_vi_80p')

# Count the number of 1s in each list
count_a = scores_a.count(1)
count_b = scores_b.count(1)

# Print the counts
print(f"Number of exact matches for Prompt A: {count_a}")
print(f"Number of exact matches for Prompt B: {count_b}")

# The signed-rank test is paired, so both files must describe the same rows.
if len(scores_a) != len(scores_b):
    raise ValueError(
        f"Paired test needs equally long score lists, got {len(scores_a)} and {len(scores_b)}"
    )

# Conduct Wilcoxon Signed Rank test
if any(a != b for a, b in zip(scores_a, scores_b)):
    w_stat, p_val = wilcoxon(scores_a, scores_b)
else:
    # Every paired difference is zero: the test is undefined for this input,
    # so report "no difference" explicitly.
    w_stat, p_val = 0.0, 1.0

# Print the results
print(f"Wilcoxon Signed Rank statistic: {w_stat}")
print(f"P-value: {p_val}")

if p_val < 0.05:
    print("The difference in score distributions between the prompts is statistically significant.")
else:
    print("The difference in score distributions between the prompts is not statistically significant.")