File size: 12,484 Bytes
44102fe
64f007a
bc91923
64f007a
9066111
44102fe
 
 
 
 
d5eb5d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db23a89
d5eb5d2
 
 
db23a89
d5eb5d2
199305f
 
d5eb5d2
 
 
59c6335
 
 
2379992
59c6335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0149e13
 
 
d62f0f0
0149e13
2379992
0149e13
 
d62f0f0
0149e13
2379992
59c6335
 
 
44102fe
 
 
 
 
 
 
 
 
 
 
 
 
 
e79aa9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64f007a
3704ce9
 
 
 
 
 
 
e79aa9f
64f007a
3704ce9
 
 
 
 
59c6335
89caf53
59c6335
3704ce9
64f007a
d5eb5d2
 
 
3704ce9
 
 
 
 
 
c65e36c
89caf53
3704ce9
 
 
 
 
 
 
 
7d55ccf
3704ce9
 
 
 
 
 
 
d5eb5d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44102fe
 
8d8cdf7
4203f55
d7e36aa
 
 
 
 
8d8cdf7
 
 
 
4203f55
8d8cdf7
4203f55
8d8cdf7
4203f55
d7e36aa
 
 
 
 
4203f55
d7e36aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d980473
3704ce9
8d8cdf7
 
 
d980473
d7e36aa
 
44102fe
f8cba8f
e834e63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7e36aa
44102fe
4891a01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
import os
import re
import json
import time
import requests
import gradio as gr

import google.auth
from google.auth.transport.requests import Request

import google.generativeai as genai

# Configure the Gemini SDK with the key read from the GEMINI_API_KEY
# environment variable (os.environ.get yields None when it is unset).
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

def upload_to_gemini(path, mime_type=None):
    """Upload a local file through the Gemini SDK and return the result.

    Args:
        path: Location of the file to upload.
        mime_type: Optional MIME type forwarded to ``genai.upload_file``.

    Returns:
        The object returned by ``genai.upload_file``.
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    display_name, uri = uploaded.display_name, uploaded.uri
    print(f"Uploaded file '{display_name}' as: {uri}")
    return uploaded

# Generation settings handed to the Gemini model constructor.
# NOTE(review): max_output_tokens is 1,048,576 -- confirm the target model
# accepts an output cap this large.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=1_048_576,
    response_mime_type="text/plain",
)

# One entry per harm category; each uses the "BLOCK_NONE" threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Module-level Gemini model object, built once at import time from the
# safety settings and generation config defined above plus a fixed system
# instruction.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    safety_settings=safety_settings,
    generation_config=generation_config,
    system_instruction="Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script",
)

# Uploaded at import time; the resulting handle is attached to the seed
# message of the chat session below.
# NOTE(review): "ai_exp_json.txt" is a relative path, so it is resolved
# against the process working directory.
transliteration_example_file = upload_to_gemini(
    "ai_exp_json.txt", mime_type="text/plain"
)

# Module-level chat session seeded with a single user turn made of two
# instruction strings and the uploaded example file.
# generate_transliteration_gemini_15_pro sends every request through this
# one shared session.
chat_session = model.start_chat(
    history=[
        {
            "role": "user",
            "parts": [
                "Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
                'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
                transliteration_example_file,
            ],
        },
    ]
)


def generate_transliteration_gemini_15_pro(text):
    """Transliterate *text* through the shared module-level Gemini chat session.

    The prompt asks the model for JSON of the form
    ``{"text": ..., "transliterate": ...}``. The raw reply is printed, then
    Markdown code fences and newlines are removed before parsing. Any text
    the model adds before or after the JSON value is discarded so that it
    does not break parsing.

    Args:
        text: English sentence to transliterate.

    Returns:
        The ``"transliterate"`` field of the reply (of its first element when
        the reply is a list), passed through
        ``clean_hindi_transliterated_text``.

    Raises:
        json.JSONDecodeError: If no JSON value can be parsed from the reply.
        ValueError: If the reply is an empty JSON list.
        KeyError: If the parsed object has no ``"transliterate"`` key.
    """
    # NOTE(review): every call goes through the same `chat_session` object;
    # confirm that sharing one session across requests/users is intended.
    prompt = (
        'Given an English sentences: \n```' + "\n".join([text]) + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
    )
    response = chat_session.send_message(prompt)
    raw_reply = response.text
    print(raw_reply)

    # Remove Markdown fences and newlines so only the JSON text remains.
    payload = raw_reply.replace("```json", "").replace("```", "").replace("\n", "")

    # Tolerate prose around the JSON: keep the span from the first opening
    # bracket/brace to the last closing one. A reply that is already a bare
    # JSON object or array is left unchanged by this slice.
    openings = [pos for pos in (payload.find("{"), payload.find("[")) if pos != -1]
    closing = max(payload.rfind("}"), payload.rfind("]"))
    if openings and closing > min(openings):
        payload = payload[min(openings):closing + 1]

    data = json.loads(payload)
    if isinstance(data, list):
        if not data:
            raise ValueError("Gemini returned an empty JSON list")
        # Only one sentence is sent, so only the first entry is used.
        data = data[0]
    return clean_hindi_transliterated_text(data["transliterate"])
    
    

def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Replace whitespace-separated words of *text* using a correction dictionary.

    Each non-blank line of the dictionary file holds three ``|``-separated
    fields: ``initial|incorrect|correct``. A word is replaced by ``correct``
    when it equals the key field, optionally followed by one of ``.``, ``?``
    or ``,`` (the punctuation is kept on the replacement).

    Args:
        text: Text to process. It is split on whitespace and re-joined with
            single spaces, so runs of whitespace are collapsed.
        dictionary_path: Path of the dictionary file. A falsy value returns
            *text* unchanged without reading anything.
        initial_lookup: When truthy, the ``initial`` field is the lookup key;
            otherwise the ``incorrect`` field is.

    Returns:
        The text with matching words replaced.

    Raises:
        OSError: If the dictionary file cannot be opened.
        ValueError: If a non-blank line does not have exactly three fields.
    """
    if not dictionary_path:
        return text

    # Field 0 ("initial") keys the first pass, field 1 ("incorrect") the final one.
    key_column = 0 if initial_lookup else 1

    replacements = {}
    # The file contains Devanagari text, so decode it explicitly as UTF-8
    # rather than relying on the platform default encoding.
    with open(dictionary_path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            if not line.strip():
                # Blank lines would otherwise break the three-way unpack.
                continue
            initial, incorrect, correct = line.split("|")
            key = (initial, incorrect)[key_column]
            for suffix in ("", ".", "?", ","):
                replacements[key + suffix] = correct + suffix

    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(replacements.get(word, word) for word in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text


def get_google_token():
    """Return an access token for the credentials stored in GCP_FINETUNE_KEY.

    The environment variable is parsed as JSON and handed to
    ``google.auth.load_credentials_from_dict`` with the cloud-platform and
    generative-language tuning scopes; the credentials are refreshed before
    the token is read.

    Returns:
        The ``token`` attribute of the refreshed credentials.
    """
    credentials_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
    oauth_scopes = [
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/generative-language.tuning",
    ]
    # The second element of the returned pair (the project) is not needed here.
    credentials, _project = google.auth.load_credentials_from_dict(
        credentials_info,
        scopes=oauth_scopes,
    )
    credentials.refresh(Request())
    return credentials.token


def transliterate_first_word(text):
    """Transliterate only the first word of *text* via Google Input Tools.

    Args:
        text: Input string. Its first whitespace-separated word is sent to
            the ``hi-t-i0-und`` input tool when it is purely alphanumeric.

    Returns:
        ``"<transliterated first word> <rest>"``. *text* is returned
        unchanged when it is empty or whitespace-only, or when its first word
        is not alphanumeric. Note that a single-word input yields a trailing
        space, because the (empty) rest is still appended after a space.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    parts = text.split(maxsplit=1)
    if not parts:
        # Empty or whitespace-only input has no first word to transliterate.
        return text

    first_word = parts[0]
    rest = parts[1] if len(parts) > 1 else ""
    if not first_word.isalnum():
        return text

    params = {
        "text": first_word,
        "num": 1,
        "itc": "hi-t-i0-und",
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        "https://inputtools.google.com/request", params=params, timeout=10
    )
    # The candidate list is read from index [1][0][1] of the JSON reply.
    candidates = response.json()[1][0][1]
    return f"{candidates[0]} {rest}"


def clean(result):
    """Pull the message text out of a chat-completion result and tidy it.

    Args:
        result: Mapping whose ``["choices"][0]["message"]["content"]`` holds
            the model's reply text.

    Returns:
        The reply with ``(...)`` / ``[...]`` spans, quotes and backticks
        removed, reduced to its last line when it spans several lines, and
        then passed through ``clean_hindi_transliterated_text``.
    """
    content = result["choices"][0]['message']["content"]
    # Drop parenthesised and bracketed spans.
    content = re.sub(r"\(.*?\)|\[.*?\]","", content)
    content = content.strip("'")
    for unwanted in ('"', '`'):
        content = content.replace(unwanted, "")
    if "\n" in content.strip("\n"):
        # Several lines remain: keep only what follows the final newline.
        content = content.rpartition("\n")[2]
    return clean_hindi_transliterated_text(content)


def clean_hindi_transliterated_text(text):
    """Normalise a transliterated string and apply the final dictionary pass.

    The substitutions run in order: backticks and ``output:`` are removed,
    three Devanagari characters are swapped for their replacements, and
    braces, the literal ``'text'`` and colons are removed. The result is
    stripped of surrounding whitespace and quotes, then passed to
    ``update_text_from_dictionary`` with ``initial_lookup=False``.

    Args:
        text: Raw transliterated text.

    Returns:
        The cleaned, dictionary-corrected text.
    """
    # Order matters: each replacement sees the output of the previous one.
    substitutions = (
        ('`', ''),
        ("output:", ""),
        ('ऑ', 'औ'),
        ('ॉ', 'ौ'),
        ('ॅ', 'े'),
        ("{", ""),
        ("}", ""),
        ("'text'", ""),
        (":", ""),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip().strip("'").strip('"')
    return update_text_from_dictionary(cleaned, initial_lookup=False)





def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English *text* using OpenAI or the Gemini chat session.

    Args:
        text: English text to transliterate.
        call_gpt: When truthy, the text first goes through the initial
            dictionary pass and is then sent to the OpenAI chat-completions
            endpoint; otherwise it is handed to
            ``generate_transliteration_gemini_15_pro``.

    Returns:
        The cleaned transliterated text.

    Raises:
        requests.HTTPError: If OpenAI still answers with a 4xx/5xx status
            after all attempts.
        RuntimeError: If OpenAI still answers with any other non-200 status
            after all attempts.
        requests.RequestException: On network failure or timeout.
    """
    if not call_gpt:
        # A former code path that posted to a fine-tuned Gemini REST endpoint
        # (GEMINI_FINETUNED_HINDI_ENG_API) lived here as commented-out code;
        # it is available in version control.
        return generate_transliteration_gemini_15_pro(text)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
    }

    text = update_text_from_dictionary(text, initial_lookup=True)

    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    payload = {
        "model": "gpt-4o-2024-05-13",
        "messages": [
            {"role": "user", "content": prompt}
        ],
    }

    # Bounded retry: the earlier `while resp is None` loop could never repeat
    # because `resp` was always assigned, so failed responses were parsed as
    # if they had succeeded.
    max_attempts = 3
    resp = None
    for attempt in range(1, max_attempts + 1):
        resp = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        if resp.status_code == 200:
            return clean(resp.json())
        print(resp.text)
        if attempt < max_attempts:
            time.sleep(0.5)

    # All attempts failed: surface the HTTP error instead of parsing it.
    resp.raise_for_status()
    raise RuntimeError(
        f"OpenAI request failed with unexpected status {resp.status_code}"
    )


def generate_rephrases_gemini(text, language, problem):
    """Ask the Gemini rephraser endpoint for a shorter or longer rephrasing.

    Args:
        text: Sentence to rephrase.
        language: ``"English"`` or ``"Hindi"``; selects the instruction prompt.
        problem: ``"Gap"`` asks for a version that takes more time to speak;
            any other value asks for one that takes less.

    Returns:
        A ``(rephrased_text, word_count_summary)`` tuple. Hyphens are removed
        from each output line and each line is stripped.

    Raises:
        ValueError: If *language* is neither ``"English"`` nor ``"Hindi"``.
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.RequestException: On network failure or timeout.
    """
    speak = "more" if problem == "Gap" else "less"

    # Validate before any network or credential work so that an unsupported
    # language fails fast with a clear message instead of an unbound `prompt`.
    if language == "English":
        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
    elif language == "Hindi":
        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
    else:
        raise ValueError(
            f"Unsupported language: {language!r} (expected 'English' or 'Hindi')"
        )

    api_url = os.environ.get("GEMINI_REPHRASER_API")
    bearer_token = get_google_token()
    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/json",
    }

    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    result = requests.post(url=api_url, headers=headers, json=payload, timeout=120)
    # Fail on HTTP errors here rather than with a KeyError on the body below.
    result.raise_for_status()
    response = result.json()
    output_text = response["candidates"][0]["content"]["parts"][0]["text"]

    # NOTE(review): this removes every hyphen, including ones inside words;
    # presumably meant to drop list bullets -- confirm.
    lines = [line.replace("-", "").strip() for line in output_text.split("\n")]
    texts = "\n".join(lines)

    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"

    return texts, wc


# Gradio UI: two tabs, each wiring input widgets to one of the functions above.
with gr.Blocks() as demo:
    gr.Markdown("# Translator Assistance Tools")
    with gr.Tab("Transliteration"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input text", info="Please enter English text.")
                # The checkbox value is passed as `call_gpt`: checked selects
                # the OpenAI request path, unchecked the Gemini chat path.
                full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
            output_text = gr.Textbox(label="Output text")
        transliterate = gr.Button("Submit")
        transliterate.click(dubpro_english_transliteration, [input_text, full_transliteration], output_text)

    with gr.Tab("Rephraser Tool"):
        with gr.Row():
            rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
            language = gr.Dropdown(["English", "Hindi"], value="Hindi")
            solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:")
        with gr.Row():
            word_count = gr.Textbox(label="Word count")
            rephrased_text = gr.Textbox(label="Output text")
        rephrase = gr.Button("Submit")
        # Output order matches the function's (text, word-count) return tuple.
        rephrase.click(generate_rephrases_gemini, [rephrase_text, language, solving_for], [rephrased_text, word_count])
        

# Login credentials come from the USERNAME and PASSWORD environment variables.
# NOTE(review): os.environ.get returns None when a variable is unset -- confirm
# both are defined in the deployment environment.
demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD")))