# NOTE: removed scrape residue from a GitHub blame view (file-size banner,
# commit-hash column, and a run of line numbers) that was not part of the code.
import os
import re
import json
import time
import requests
import gradio as gr
import google.auth
from google.auth.transport.requests import Request
import google.generativeai as genai
# Configure the Gemini SDK once at import time with the key from the environment.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
def upload_to_gemini(path, mime_type=None):
    """Upload a local file to the Gemini Files API and return the file handle."""
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded
# Decoding parameters shared by every Gemini generation request.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    # NOTE(review): 1_048_576 is the 1.5-pro *input* context window; the
    # documented output limit for gemini-1.5-pro is 8192 tokens — confirm
    # the API accepts (or silently clamps) this value.
    "max_output_tokens": 1_048_576,
    "response_mime_type": "text/plain",
}
# Disable all four Gemini content filters so transliteration output is never blocked.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
# Gemini model handle used by the transliteration chat session below.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    safety_settings=safety_settings,
    generation_config=generation_config,
    system_instruction="Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script",
)
# Few-shot example file (English/transliteration JSON pairs) uploaded once at
# startup; it is attached to the chat history so every request can learn from it.
transliteration_example_file = upload_to_gemini(
    "ai_exp_json.txt", mime_type="text/plain"
)
# Persistent chat seeded with the task description and the uploaded example
# file, so later send_message() calls only need the sentence to transliterate.
chat_session = model.start_chat(
    history=[
        {
            "role": "user",
            "parts": [
                "Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
                'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
                transliteration_example_file,
            ],
        },
    ]
)
def generate_transliteration_gemini_15_pro(text):
    """Transliterate `text` to mixed Roman/Devanagari via the Gemini chat session.

    Sends the sentence through the pre-seeded `chat_session`, parses the JSON
    reply, and returns the cleaned transliterated string.
    """
    texts = [text]
    # BUG FIX: the original never assigned the send_message() result, so the
    # later `response.text` raised NameError. Capture the response here.
    response = chat_session.send_message(
        'Given an English sentences: \n```' + "\n".join(texts) + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
    )

    def _strip_fences(raw):
        # Remove the markdown code fence Gemini wraps around JSON output.
        return raw.replace("```json", "").replace("```", "").replace("\n", "")

    data = json.loads(_strip_fences(response.text))
    return clean_hindi_transliterated_text(data["transliterate"])
def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Apply word-level corrections from a pipe-delimited dictionary file.

    Each dictionary line has the form ``initial|incorrect|correct``. When
    `initial_lookup` is True, whitespace-separated words equal to an
    ``initial`` form are replaced with ``correct``; otherwise words equal to
    an ``incorrect`` form are replaced. Each mapping is also registered with
    a trailing '.', '?' or ',' so words at sentence boundaries still match.

    Returns the text with matching words substituted; returns `text`
    unchanged when `dictionary_path` is falsy.
    """
    if not dictionary_path:
        return text
    with open(dictionary_path, encoding="utf-8") as f:
        rows = [line.split("|") for line in f.read().splitlines()]
    initial_pass_dict = {}
    final_pass_dict = {}
    # Idiom fix: the original repeated eight near-identical assignments; one
    # loop over the punctuation suffixes covers the same cases.
    for initial, incorrect, correct in rows:
        for suffix in ("", ".", "?", ","):
            initial_pass_dict[initial + suffix] = correct + suffix
            final_pass_dict[incorrect + suffix] = correct + suffix
    lookup = initial_pass_dict if initial_lookup else final_pass_dict
    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(lookup.get(t, t) for t in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text
def get_google_token():
    """Exchange the service-account key in GCP_FINETUNE_KEY for an OAuth access token."""
    scopes = [
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/generative-language.tuning",
    ]
    key_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
    credentials, _project = google.auth.load_credentials_from_dict(key_info, scopes=scopes)
    # Credentials start without a token; refresh() performs the token exchange.
    credentials.refresh(Request())
    return credentials.token
def transliterate_first_word(text):
    """Transliterate only the first word of `text` to Hindi via Google Input Tools.

    Returns `text` unchanged when it is empty/whitespace-only or when its
    first word contains non-alphanumeric characters (punctuation etc.);
    otherwise returns the transliterated first word followed by the rest.
    """
    texts = text.split(maxsplit=1)
    # BUG FIX: empty or whitespace-only input used to raise IndexError on
    # texts[0]; treat it as "nothing to transliterate".
    if not texts:
        return text
    if len(texts) > 1:
        first_word, rest = texts
    else:
        first_word, rest = texts[0], ""
    if not first_word.isalnum():
        return text
    # Google Input Tools transliteration endpoint; itc=hi-t-i0-und selects
    # Latin -> Hindi transliteration, num=1 asks for a single candidate.
    params = {
        "text": first_word,
        "num": 1,
        "itc": "hi-t-i0-und",
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    response = requests.get("https://inputtools.google.com/request", params=params)
    # Response shape: [status, [[input, [candidate, ...], ...]]]
    results = response.json()[1][0][1]
    first_word_transliterated = results[0]
    return f"{first_word_transliterated} {rest}"
def clean(result):
    """Extract and sanitize the assistant message from an OpenAI chat response."""
    content = result["choices"][0]["message"]["content"]
    # Drop parenthesized/bracketed asides, then stray quote characters.
    content = re.sub(r"\(.*?\)|\[.*?\]", "", content)
    content = content.strip("'").replace('"', "").replace('`', "")
    # If the model returned several lines, keep only the final one.
    if "\n" in content.strip("\n"):
        content = content.split("\n")[-1]
    return clean_hindi_transliterated_text(content)
def clean_hindi_transliterated_text(text):
    """Normalize Devanagari vowel signs and strip JSON/markdown artifacts from `text`."""
    text = text.replace('`', '').replace("output:", "")
    # Substitutions applied in order: vowel-sign normalization first, then
    # removal of leftover JSON punctuation the model sometimes emits.
    replacements = (
        ('ऑ', 'औ'),
        ('ॉ', 'ौ'),
        ('ॅ', 'े'),
        ("{", ""),
        ("}", ""),
        ("'text'", ""),
        (":", ""),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    stripped = text.strip().strip("'").strip('"')
    # Final dictionary pass: map known-incorrect transliterations to correct ones.
    return update_text_from_dictionary(stripped, initial_lookup=False)
def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English `text` into mixed Roman/Devanagari script.

    When `call_gpt` is truthy, runs a dictionary pre-pass and asks the OpenAI
    gpt-4 chat endpoint; otherwise delegates to the Gemini chat session.

    Returns the cleaned transliterated text.

    Raises RuntimeError if the OpenAI API keeps failing after retries.
    """
    if not call_gpt:
        return generate_transliteration_gemini_15_pro(text)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
    }
    # First dictionary pass: rewrite known words before prompting the model.
    text = update_text_from_dictionary(text, initial_lookup=True)
    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    messages = [
        {"role": "user", "content": prompt}
    ]
    # BUG FIX: the original `while resp is None` loop always exited after one
    # iteration (resp was assigned even on error), so a failed call fell
    # through and the error payload was parsed as a result. Retry a bounded
    # number of times, then fail loudly.
    for _attempt in range(5):
        resp = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json={
            "model": "gpt-4",
            "messages": messages
        })
        if resp.status_code == 200:
            return clean(resp.json())
        print(resp.text)
        time.sleep(0.5)
    raise RuntimeError(f"OpenAI API request failed after retries: {resp.status_code}")
def generate_rephrases_gemini(text, language, problem):
    """Rephrase `text` for dubbing timing via a fine-tuned Gemini REST endpoint.

    `problem` == "Gap" asks for a rendering that takes "more" time to speak;
    any other value asks for "less". `language` must be "English" or "Hindi"
    and selects the system prompt.

    Returns a (rephrased_text, word_count_summary) tuple.

    Raises ValueError for an unsupported `language`.
    """
    API_URL = os.environ.get("GEMINI_REPHRASER_API")
    BEARER_TOKEN = get_google_token()
    headers = {
        "Authorization": f"Bearer {BEARER_TOKEN}",
        "Content-Type": "application/json",
    }
    speak = "more" if problem == "Gap" else "less"
    if language == "English":
        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
    elif language == "Hindi":
        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
    else:
        # BUG FIX: an unknown language previously raised NameError on `prompt`.
        raise ValueError(f"Unsupported language: {language!r}")
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    result = requests.post(url=API_URL, headers=headers, json=payload)
    output_text = result.json()["candidates"][0]["content"]["parts"][0]["text"]
    # Strip list-style dashes the model sometimes prefixes to each line.
    lines = [line.replace("-", "").strip() for line in output_text.split("\n")]
    texts = "\n".join(lines)
    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"
    return texts, wc
# --- Gradio UI --------------------------------------------------------------
# NOTE(review): the original paste lost indentation, so the exact widget
# nesting (which components sit inside which Row/Column) is reconstructed
# conservatively here — confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Translator Assistance Tools")
    # Tab 1: English -> mixed Roman/Devanagari transliteration.
    with gr.Tab("Transliteration"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input text", info="Please enter English text.")
                # Checkbox feeds the `call_gpt` flag of dubpro_english_transliteration.
                full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
            output_text = gr.Textbox(label="Output text")
        transliterate = gr.Button("Submit")
        transliterate.click(dubpro_english_transliteration, [input_text, full_transliteration], output_text)
    # Tab 2: rephrase text to fit a dubbing time slot (Gap = speak longer,
    # Overflow = speak shorter).
    with gr.Tab("Rephraser Tool"):
        with gr.Row():
            rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
            language = gr.Dropdown(["English", "Hindi"], value="Hindi")
            solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:")
        with gr.Row():
            word_count = gr.Textbox(label="Word count")
            rephrased_text = gr.Textbox(label="Output text")
        rephrase = gr.Button("Submit")
        rephrase.click(generate_rephrases_gemini, [rephrase_text, language, solving_for], [rephrased_text, word_count])
# Basic-auth credentials come from the environment; both must be set.
demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD")))