|
import os |
|
import re |
|
import json |
|
import time |
|
import requests |
|
import gradio as gr |
|
|
|
import google.auth |
|
from google.auth.transport.requests import Request |
|
|
|
|
|
def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Replace whole words of ``text`` using a pipe-separated dictionary file.

    Every non-blank line of the dictionary file holds three ``|``-separated
    fields, ``initial|incorrect|correct``.  With ``initial_lookup`` true,
    words equal to ``initial`` are replaced by ``correct``; otherwise words
    equal to ``incorrect`` are replaced.  A word immediately followed by
    ``.``, ``?`` or ``,`` is matched as well and keeps that mark.

    Args:
        text: Input text.  It is split on whitespace and re-joined with
            single spaces, so runs of whitespace are collapsed.
        dictionary_path: Path of the dictionary file.  A falsy value disables
            the lookup and ``text`` is returned unchanged.
        initial_lookup: ``True`` to key on the first column, ``False`` to key
            on the second column.

    Returns:
        The text with every matching word replaced.

    Raises:
        OSError: If the dictionary file cannot be opened.
        ValueError: If a non-blank line does not have exactly three fields.
    """
    if not dictionary_path:
        # Lookup disabled: hand the input back untouched.
        return text

    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding.
    with open(dictionary_path, encoding="utf-8") as f:
        lines = f.read().splitlines()

    replacements = {}
    for line in lines:
        if not line.strip():
            # Tolerate blank separator lines / a trailing empty line.
            continue
        initial, incorrect, correct = line.split("|")
        key = initial if initial_lookup else incorrect
        # Register the bare word plus the variants carrying trailing
        # punctuation, because matching is done on whitespace-split tokens.
        for mark in ("", ".", "?", ","):
            replacements[key + mark] = correct + mark

    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(replacements.get(word, word) for word in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text
|
|
|
|
|
def get_google_token():
    """Return a freshly refreshed Google access token.

    The credentials are built from the JSON document stored in the
    ``GCP_FINETUNE_KEY`` environment variable and are refreshed on every
    call before their token is read.

    Returns:
        The ``token`` attribute of the refreshed credentials.

    Raises:
        TypeError: If ``GCP_FINETUNE_KEY`` is unset (``json.loads`` receives
            ``None``).
    """
    credentials_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
    oauth_scopes = [
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/generative-language.tuning",
    ]
    # The second element of the returned pair (the project) is not needed.
    creds, _ = google.auth.load_credentials_from_dict(
        credentials_info,
        scopes=oauth_scopes,
    )
    creds.refresh(Request())
    return creds.token
|
|
|
|
|
def transliterate_first_word(text):
    """Replace the first word of ``text`` with its Google Input Tools suggestion.

    The first whitespace-delimited word is sent to the Input Tools endpoint
    and swapped for the first suggestion in the reply; the remainder of the
    text is kept as is.  No request is made when there is no first word or
    when the first word is not purely alphanumeric (``str.isalnum``).

    Args:
        text: Text whose first word should be transliterated.

    Returns:
        ``text`` unchanged when no request is made; otherwise the suggested
        first word followed by a single space and the rest of the text (no
        trailing space when the text has only one word).

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
        IndexError, KeyError, TypeError: If the reply does not have the
            nested shape indexed below.
    """
    parts = text.split(maxsplit=1)
    if not parts:
        # Empty or whitespace-only input: nothing to transliterate.
        return text

    first_word = parts[0]
    rest = parts[1] if len(parts) > 1 else ""
    if not first_word.isalnum():
        return text

    url = "https://inputtools.google.com/request"
    params = {
        "text": first_word,
        "num": 1,  # presumably the number of suggestions requested
        "itc": "hi-t-i0-und",  # presumably selects Hindi transliteration
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    suggestions = response.json()[1][0][1]
    first_word_transliterated = suggestions[0]

    if not rest:
        # Avoid the dangling space a plain "word + ' ' + rest" would leave.
        return first_word_transliterated
    return f"{first_word_transliterated} {rest}"
|
|
|
|
|
def clean(result):
    """Extract and tidy the reply text of a chat-completion response.

    Takes the message content of the first choice, removes parenthesised and
    bracketed spans, drops quote/backtick characters, keeps only the last
    line of a multi-line reply and passes the result through
    ``clean_hindi_transliterated_text``.

    Args:
        result: Decoded JSON response; must contain
            ``result["choices"][0]["message"]["content"]``.

    Returns:
        The value returned by ``clean_hindi_transliterated_text``.

    Raises:
        KeyError, IndexError: If the response lacks the expected fields.
    """
    text = result["choices"][0]['message']["content"]
    # Remove "(...)" and "[...]" spans (non-greedy, within a single line).
    text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
    text = text.strip("'").replace('"', "").replace('`', "")
    # Strip surrounding newlines *before* picking the last line; splitting
    # the unstripped text would return "" whenever the reply ends in "\n".
    text = text.strip("\n")
    if "\n" in text:
        text = text.split("\n")[-1]
    return clean_hindi_transliterated_text(text)
|
|
|
|
|
def clean_hindi_transliterated_text(text):
    """Normalise model output and apply the dictionary corrections.

    Applies a fixed, ordered series of substring replacements, trims
    surrounding whitespace and quote characters, and finally runs the result
    through ``update_text_from_dictionary`` with its default arguments.

    Args:
        text: Raw transliterated text.

    Returns:
        The cleaned, dictionary-corrected text.
    """
    # Order matters: "output:" must be removed before the bare ":" is, and
    # "'text'" is removed after the braces but before the colon.
    substitutions = (
        ('`', ''),
        ("output:", ""),
        ('ऑ', 'औ'),
        ('ॉ', 'ौ'),
        ('ॅ', 'े'),
        ("{", ""),
        ("}", ""),
        ("'text'", ""),
        (":", ""),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Trim whitespace first, then any wrapping single and double quotes.
    cleaned = cleaned.strip()
    cleaned = cleaned.strip("'")
    cleaned = cleaned.strip('"')
    return update_text_from_dictionary(cleaned)
|
|
|
|
|
# Upper bound on OpenAI attempts so a persistent failure (bad key, quota)
# ends with an error instead of looping forever.
_OPENAI_MAX_ATTEMPTS = 5
# Seconds to wait for a model endpoint before giving up on the request.
_TRANSLITERATION_REQUEST_TIMEOUT = 120


def _transliterate_with_gpt(text):
    """Transliterate ``text`` through the OpenAI chat-completions API."""
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }
    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    payload = {
        "model": "gpt-4",
        "messages": [{"role": "user", "content": prompt}],
    }

    for attempt in range(_OPENAI_MAX_ATTEMPTS):
        resp = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=_TRANSLITERATION_REQUEST_TIMEOUT,
        )
        if resp.status_code == 200:
            return clean(resp.json())
        # Log the error body and retry after a short pause.
        print(resp.text)
        if attempt < _OPENAI_MAX_ATTEMPTS - 1:
            time.sleep(0.5)

    raise RuntimeError(
        f"OpenAI request failed after {_OPENAI_MAX_ATTEMPTS} attempts "
        f"(last status {resp.status_code})"
    )


def _transliterate_with_gemini(text):
    """Transliterate ``text`` through the endpoint in GEMINI_FINETUNED_HINDI_ENG_API."""
    api_url = os.environ.get("GEMINI_FINETUNED_HINDI_ENG_API")
    bearer_token = get_google_token()
    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {
                "parts": [{"text": f"input: {text}"}],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    result = requests.post(
        url=api_url,
        headers=headers,
        json=payload,
        timeout=_TRANSLITERATION_REQUEST_TIMEOUT,
    )
    # Surface HTTP failures directly rather than as a KeyError on the body.
    result.raise_for_status()
    response = result.json()

    raw_text = response['candidates'][0]['content']['parts'][0]['text']
    # Peel off the "output:" / {'text': ...} wrapping around the reply.
    response_content = (
        raw_text.replace("output:", "")
        .strip()
        .replace("'text':", "")
        .replace("{", "")
        .replace("}", "")
        .strip()
        .strip("'")
        .strip('"')
    )
    return transliterate_first_word(response_content)


def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English ``text`` with one of two model back ends.

    Args:
        text: The text to transliterate.
        call_gpt: Truthy to use the OpenAI ``gpt-4`` chat-completions API and
            post-process the reply with ``clean``; falsy to use the endpoint
            named by ``GEMINI_FINETUNED_HINDI_ENG_API`` and post-process the
            reply with ``transliterate_first_word``.

    Returns:
        The transliterated text.

    Raises:
        RuntimeError: If the OpenAI API does not answer with HTTP 200 within
            ``_OPENAI_MAX_ATTEMPTS`` attempts.
        requests.RequestException: On timeout, connection failure, or a
            non-success status from the Gemini endpoint.
    """
    if call_gpt:
        return _transliterate_with_gpt(text)
    return _transliterate_with_gemini(text)
|
|
|
|
|
# Seconds to wait for the rephraser endpoint before giving up.
_REPHRASE_REQUEST_TIMEOUT = 120


def generate_rephrases_gemini(text, language, problem):
    """Ask the rephraser endpoint for a shorter or longer wording of ``text``.

    Args:
        text: The sentence to rephrase.
        language: ``"English"`` or ``"Hindi"``; selects the instruction
            prompt.
        problem: ``"Gap"`` asks for a wording that takes more time to speak;
            any other value asks for one that takes less.

    Returns:
        A ``(rephrased_text, word_count_summary)`` tuple.  Hyphens are
        removed from the model output and each line is stripped; the summary
        reports the whitespace-split word counts of input and output.

    Raises:
        ValueError: If ``language`` is not one of the supported values.
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
        KeyError, IndexError: If the reply lacks the expected fields.
    """
    speak = "more" if problem == "Gap" else "less"

    # Validate before any network work so a bad argument fails fast instead
    # of surfacing later as an unbound ``prompt``.
    if language == "English":
        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
    elif language == "Hindi":
        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
    else:
        raise ValueError(
            f"Unsupported language: {language!r} (expected 'English' or 'Hindi')"
        )

    api_url = os.environ.get("GEMINI_REPHRASER_API")
    bearer_token = get_google_token()
    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    result = requests.post(
        url=api_url,
        headers=headers,
        json=payload,
        timeout=_REPHRASE_REQUEST_TIMEOUT,
    )
    # Surface HTTP failures directly rather than as a KeyError on the body.
    result.raise_for_status()
    response = result.json()
    output_text = response["candidates"][0]["content"]["parts"][0]["text"]

    # Remove hyphens and surrounding whitespace from every line.
    texts = "\n".join(
        line.replace("-", "").strip() for line in output_text.split("\n")
    )

    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"

    return texts, wc
|
|
|
|
|
# Gradio UI: one Blocks app hosting two tools, the English transliteration
# tool and the rephraser tool.
# NOTE(review): indentation was lost in this copy of the file, so which
# components sit inside which Row/Column cannot be read from the lines below;
# confirm the nesting against the running app before editing the layout.
with gr.Blocks() as demo: |
|
gr.Markdown("English Transliteration Tool") |
|
with gr.Row(): |
|
with gr.Column(): |
|
input_text = gr.Textbox(label="Input text", info="Please enter English text.") |
|
# The checkbox value is passed as the `call_gpt` argument of
# dubpro_english_transliteration (second entry of the click inputs below).
full_transliteration = gr.Checkbox(label="Full transliteration", value=True) |
|
output_text = gr.Textbox(label="Output text") |
|
transliterate = gr.Button("Submit") |
|
# Submit: dubpro_english_transliteration(input_text, full_transliteration),
# with the return value written to output_text.
transliterate.click(dubpro_english_transliteration, [input_text, full_transliteration], output_text) |
|
|
|
gr.Markdown("Rephraser Tool") |
|
with gr.Row(): |
|
rephrase_text = gr.Textbox(label="Input text", info="Please enter text.") |
|
# The choices match the two language values generate_rephrases_gemini
# branches on.
language = gr.Dropdown(["English", "Hindi"], value="Hindi") |
|
# "Gap" requests a wording that takes more time to speak; any other value
# (here "Overflow") requests one that takes less.
solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:") |
|
with gr.Row(): |
|
word_count = gr.Textbox(label="Word count") |
|
rephrased_text = gr.Textbox(label="Output text") |
|
rephrase = gr.Button("Submit") |
|
# generate_rephrases_gemini returns (texts, wc); they are routed to
# rephrased_text and word_count in that order.
rephrase.click(generate_rephrases_gemini, [rephrase_text, language, solving_for], [rephrased_text, word_count]) |
|
|
|
|
|
# Login credentials are read from the USERNAME and PASSWORD environment
# variables. NOTE(review): both are None when unset; confirm how Gradio
# treats auth=(None, None) and whether USERNAME is already set by the host OS.
demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD"))) |