# app.py
# Last change: "Update app.py" (commit 199305f, verified) by deepsync; 12.5 kB.
# (Header reconstructed from file-viewer page text captured with the source.)
import os
import re
import json
import time
import requests
import gradio as gr
import google.auth
from google.auth.transport.requests import Request
import google.generativeai as genai
# Configure the Gemini SDK with the key read from the GEMINI_API_KEY
# environment variable (os.environ.get yields None when it is unset).
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
def upload_to_gemini(path, mime_type=None):
    """Upload a local file through the Gemini SDK and return its handle.

    Args:
        path: Location of the file to upload.
        mime_type: Optional MIME type forwarded to ``genai.upload_file``.

    Returns:
        The object returned by ``genai.upload_file``.
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    name, uri = uploaded.display_name, uploaded.uri
    # Log the upload so the resulting URI is visible in the app output.
    print(f"Uploaded file '{name}' as: {uri}")
    return uploaded
# Sampling and output settings passed to the Gemini model created in this file.
# NOTE(review): max_output_tokens is set to 1_048_576 — confirm the API
# accepts (or clamps) a value this large for the chosen model.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=1_048_576,
    response_mime_type="text/plain",
)
# One entry per harm category, each requesting the "BLOCK_NONE" threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
# Gemini model handle used to start the transliteration chat session in this
# file; it carries the system instruction plus the generation and safety
# settings defined above.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    system_instruction="Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script",
    generation_config=generation_config,
    safety_settings=safety_settings,
)
# Upload the examples file that the seed prompt refers to.
transliteration_example_file = upload_to_gemini("ai_exp_json.txt", mime_type="text/plain")

# Single user turn that seeds the chat: the task description, the instructions
# for reading the examples, and the uploaded examples file itself.
_seed_turn = dict(
    role="user",
    parts=[
        "Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
        'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
        transliteration_example_file,
    ],
)
chat_session = model.start_chat(history=[_seed_turn])
def generate_transliteration_gemini_15_pro(text):
    """Transliterate ``text`` via the module-level Gemini chat session.

    The sentence is sent inside a fenced block, the reply is printed, stripped
    of Markdown code fences and newlines, parsed as JSON, and the value under
    the ``"transliterate"`` key is passed through
    ``clean_hindi_transliterated_text``.

    Args:
        text: The English sentence to transliterate.

    Returns:
        The cleaned transliteration string.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or is an empty JSON list.
        KeyError: If the parsed object has no ``"transliterate"`` key.
    """
    # NOTE(review): chat_session is shared by every call; if the SDK appends
    # each exchange to the session history, prompts grow with every request
    # and earlier inputs stay in context — confirm and consider a per-call
    # session.
    sentences = [text]
    response = chat_session.send_message(
        'Given an English sentences: \n```' + "\n".join(sentences) + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
    )
    raw_reply = response.text
    print(raw_reply)

    # The model may wrap its JSON in a Markdown fence; drop the fence markers
    # and newlines before parsing.
    payload = raw_reply.replace("```json", "").replace("```", "").replace("\n", "")
    data = json.loads(payload)

    # Only one sentence is sent, so a list reply is reduced to its first item.
    if isinstance(data, list):
        if not data:
            raise ValueError("Gemini returned an empty list of transliterations")
        data = data[0]
    return clean_hindi_transliterated_text(data["transliterate"])
def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Replace whole whitespace-separated words using a dictionary file.

    Each non-blank line of the dictionary holds three ``|``-separated fields,
    ``initial|incorrect|correct``. With ``initial_lookup`` true, words equal
    to ``initial`` are replaced by ``correct``; otherwise words equal to
    ``incorrect`` are replaced by ``correct``. A word followed directly by
    ``.``, ``?`` or ``,`` is matched too and keeps that punctuation mark.

    Args:
        text: Input text; it is split on whitespace and re-joined with single
            spaces, so runs of whitespace are collapsed.
        dictionary_path: Path of the dictionary file. A falsy value disables
            the lookup and returns ``text`` unchanged.
        initial_lookup: Selects which dictionary column is matched (see above).

    Returns:
        The text with matching words replaced.

    Raises:
        ValueError: If a non-blank dictionary line does not have exactly
            three fields.
    """
    if not dictionary_path:
        return text

    # Only the table for the requested pass is built.
    source_column = 0 if initial_lookup else 1
    replacements = {}
    # The dictionary contains Devanagari text, so read it as UTF-8 explicitly
    # instead of relying on the platform default encoding.
    with open(dictionary_path, encoding="utf-8") as f:
        lines = f.read().splitlines()
    for line_number, line in enumerate(lines, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        fields = line.split("|")
        if len(fields) != 3:
            raise ValueError(
                f"{dictionary_path}:{line_number}: expected 3 '|'-separated fields, got {len(fields)}"
            )
        source, correct = fields[source_column], fields[2]
        for suffix in ("", ".", "?", ","):
            replacements[source + suffix] = correct + suffix

    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(replacements.get(token, token) for token in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text
def get_google_token():
    """Return a freshly refreshed Google access token.

    The credentials are built from the JSON document stored in the
    ``GCP_FINETUNE_KEY`` environment variable, scoped for cloud-platform and
    generative-language tuning, then refreshed before the token is read.

    Returns:
        The ``token`` attribute of the refreshed credentials.
    """
    credentials_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
    oauth_scopes = [
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/generative-language.tuning",
    ]
    # The second element of the returned pair (the project) is not used here.
    credentials, _project = google.auth.load_credentials_from_dict(
        credentials_info,
        scopes=oauth_scopes,
    )
    credentials.refresh(Request())
    return credentials.token
def transliterate_first_word(text):
    """Transliterate only the first word of ``text`` via an HTTP lookup.

    The first whitespace-separated word is sent to
    ``https://inputtools.google.com/request`` and replaced by the first
    candidate in the reply; the remainder of the text is appended after a
    single space.

    Args:
        text: Input text.

    Returns:
        ``text`` unchanged when it is empty/whitespace-only or when its first
        word is not purely alphanumeric; otherwise the transliterated first
        word followed by the rest of the text (no trailing space when there
        is no rest).

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
    """
    parts = text.split(maxsplit=1)
    if not parts:
        # Empty or whitespace-only input has no first word to look up.
        return text
    first_word = parts[0]
    rest = parts[1] if len(parts) > 1 else ""
    if not first_word.isalnum():
        return text

    params = {
        "text": first_word,
        "num": 1,  # ask for a single candidate
        "itc": "hi-t-i0-und",  # presumably the Hindi transliteration scheme — verify
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    # A timeout keeps an unresponsive endpoint from hanging the caller.
    response = requests.get("https://inputtools.google.com/request", params=params, timeout=10)
    response.raise_for_status()
    # The candidate list is read from position [1][0][1] of the JSON reply.
    candidates = response.json()[1][0][1]
    first_word_transliterated = candidates[0]
    if not rest:
        return first_word_transliterated
    return f"{first_word_transliterated} {rest}"
def clean(result):
    """Extract and tidy the transliteration from a chat-completions reply.

    Reads ``result["choices"][0]["message"]["content"]``, removes
    parenthesised and bracketed spans, strips quote and backtick characters,
    keeps only the last non-blank line, and passes the remainder through
    ``clean_hindi_transliterated_text``.

    Args:
        result: Parsed JSON body of the completion response.

    Returns:
        The cleaned transliteration string.

    Raises:
        KeyError, IndexError: If ``result`` lacks the expected structure.
    """
    text = result["choices"][0]['message']["content"]
    text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
    text = text.strip("'").replace('"', "").replace('`', "")
    # Keep the last line that has content. Splitting the unstripped text and
    # taking [-1] yielded an empty string whenever the reply ended with a
    # newline, discarding the actual answer.
    non_blank_lines = [line for line in text.split("\n") if line.strip()]
    if non_blank_lines:
        text = non_blank_lines[-1]
    return clean_hindi_transliterated_text(text)
def clean_hindi_transliterated_text(text):
    """Normalise a transliterated string and apply the final dictionary pass.

    Removes backticks and ``output:`` markers, swaps three Devanagari
    characters for their replacements, drops braces, ``'text'`` and colons,
    trims surrounding whitespace and quotes, then runs
    ``update_text_from_dictionary`` with ``initial_lookup=False``.

    Args:
        text: Raw transliteration text.

    Returns:
        The cleaned text after the dictionary lookup.
    """
    # The replacements are chained in the same order as before; order matters
    # because removing one marker can expose another.
    stripped_of_markers = text.replace('`', '').replace("output:", "")
    normalised = (
        stripped_of_markers
        .replace('ऑ', 'औ')
        .replace('ॉ', 'ौ')
        .replace('ॅ', 'े')
        .replace("{", "")
        .replace("}", "")
        .replace("'text'", "")
        .replace(":", "")
    )
    trimmed = normalised.strip().strip("'").strip('"')
    return update_text_from_dictionary(trimmed, initial_lookup=False)
def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English text into Hindi script.

    Args:
        text: The English text to transliterate.
        call_gpt: When true, the text goes through the initial dictionary
            pass and is sent to the OpenAI chat-completions API; when false,
            it is handed to ``generate_transliteration_gemini_15_pro``.

    Returns:
        The cleaned transliteration string.

    Raises:
        RuntimeError: If the OpenAI request still fails after all attempts.
    """
    if not call_gpt:
        # NOTE: an earlier variant of this branch posted to the endpoint in
        # GEMINI_FINETUNED_HINDI_ENG_API with a bearer token from
        # get_google_token(); it had been commented out in favour of the
        # chat-based Gemini helper.
        return generate_transliteration_gemini_15_pro(text)

    max_attempts = 5
    retry_delay_seconds = 0.5
    request_timeout_seconds = 60

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }
    text = update_text_from_dictionary(text, initial_lookup=True)
    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    payload = {
        "model": "gpt-4o-2024-05-13",
        "messages": [{"role": "user", "content": prompt}],
    }

    # The previous `while resp is None` loop exited after the first request
    # regardless of status, so a failed call was parsed as if it had
    # succeeded. Retry a bounded number of times and fail loudly instead.
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=request_timeout_seconds,
            )
        except requests.RequestException as err:
            last_error = err
            print(f"OpenAI request failed (attempt {attempt}/{max_attempts}): {err}")
        else:
            if resp.status_code == 200:
                return clean(resp.json())
            print(resp.text)
            last_error = RuntimeError(f"HTTP {resp.status_code}: {resp.text}")
        if attempt < max_attempts:
            time.sleep(retry_delay_seconds)

    raise RuntimeError(
        f"OpenAI transliteration failed after {max_attempts} attempts"
    ) from last_error
def generate_rephrases_gemini(text, language, problem):
    """Ask the rephraser endpoint for a longer or shorter version of ``text``.

    Args:
        text: The sentence to rephrase.
        language: ``"English"`` or ``"Hindi"``; selects the instruction prompt.
        problem: ``"Gap"`` asks for a rephrase that takes more time to speak;
            any other value asks for one that takes less.

    Returns:
        A ``(rephrased_text, word_count_summary)`` tuple. Hyphens are removed
        from the model output and each line is stripped; the summary reports
        the whitespace-separated word counts of the input and the output.

    Raises:
        ValueError: If ``language`` is not one of the supported values.
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
    """
    speak = "more" if problem == "Gap" else "less"
    prompts = {
        "English": f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak.",
        "Hindi": f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner.",
    }
    # Validate before any network work; an unknown language used to surface
    # later as an UnboundLocalError on `prompt`.
    if language not in prompts:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(prompts)}"
        )
    prompt = prompts[language]

    API_URL = os.environ.get("GEMINI_REPHRASER_API")
    BEARER_TOKEN = get_google_token()
    headers = {
        "Authorization": f"Bearer {BEARER_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    # A timeout and an explicit status check turn a hung or failed call into
    # a clear HTTP error rather than a hang or a KeyError on "candidates".
    result = requests.post(url=API_URL, headers=headers, json=payload, timeout=60)
    result.raise_for_status()
    response = result.json()
    output_text = response["candidates"][0]["content"]["parts"][0]["text"]

    # NOTE(review): this removes every hyphen, including those inside words —
    # presumably meant to drop bullet markers; confirm.
    texts = "\n".join(line.replace("-", "").strip() for line in output_text.split("\n"))
    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"
    return texts, wc
# Gradio UI with two tabs: transliteration and rephrasing.
# NOTE(review): the nesting below is reconstructed from a source whose
# indentation was lost — confirm the layout against the running app.
with gr.Blocks() as demo:
    gr.Markdown("# Translator Assistance Tools")

    with gr.Tab("Transliteration"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input text", info="Please enter English text.")
                full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
            output_text = gr.Textbox(label="Output text")
        transliterate = gr.Button("Submit")
        # The checkbox value is passed as the `call_gpt` argument.
        transliterate.click(
            dubpro_english_transliteration,
            inputs=[input_text, full_transliteration],
            outputs=output_text,
        )

    with gr.Tab("Rephraser Tool"):
        with gr.Row():
            rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
            language = gr.Dropdown(["English", "Hindi"], value="Hindi")
            solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:")
        with gr.Row():
            word_count = gr.Textbox(label="Word count")
            rephrased_text = gr.Textbox(label="Output text")
        rephrase = gr.Button("Submit")
        # The function returns (text, word-count summary), in that order.
        rephrase.click(
            generate_rephrases_gemini,
            inputs=[rephrase_text, language, solving_for],
            outputs=[rephrased_text, word_count],
        )

# Login credentials are read from the USERNAME and PASSWORD environment variables.
demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD")))