# NOTE: removed scrape residue from a GitHub blame view (file-size banner,
# commit-hash column, and a run of line numbers) that was not part of the code.
import os
import re
import json
import time
import requests
import gradio as gr
import google.auth
from google.auth.transport.requests import Request
import google.generativeai as genai
# Configure the Gemini SDK once at import time with the key from the environment.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
def upload_to_gemini(path, mime_type=None):
    """Upload a local file to the Gemini Files API and return the file handle."""
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded
# Decoding parameters shared by every Gemini generation request.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    # NOTE(review): 1_048_576 is the 1.5-pro *input* context window; the
    # documented output limit for gemini-1.5-pro is 8192 tokens — confirm
    # the API accepts (or silently clamps) this value.
    "max_output_tokens": 1_048_576,
    "response_mime_type": "text/plain",
}
# Disable all four Gemini content filters so transliteration output is never blocked.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
# Gemini model handle used by the transliteration chat session below.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    safety_settings=safety_settings,
    generation_config=generation_config,
    system_instruction="Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script",
)
# Few-shot example file (English/transliteration JSON pairs) uploaded once at
# startup; it is attached to the chat history so every request can learn from it.
transliteration_example_file = upload_to_gemini(
    "ai_exp_json.txt", mime_type="text/plain"
)
# Persistent chat seeded with the task description and the uploaded example
# file, so later send_message() calls only need the sentence to transliterate.
chat_session = model.start_chat(
    history=[
        {
            "role": "user",
            "parts": [
                "Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
                'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
                transliteration_example_file,
            ],
        },
    ]
)
def generate_transliteration_gemini_15_pro(text):
    """Transliterate `text` to mixed Roman/Devanagari via the Gemini chat session.

    Sends the sentence through the pre-seeded `chat_session`, parses the JSON
    reply, and returns the cleaned transliterated string.
    """
    texts = [text]
    # BUG FIX: the original never assigned the send_message() result, so the
    # later `response.text` raised NameError. Capture the response here.
    response = chat_session.send_message(
        'Given an English sentences: \n```' + "\n".join(texts) + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
    )

    def _strip_fences(raw):
        # Remove the markdown code fence Gemini wraps around JSON output.
        return raw.replace("```json", "").replace("```", "").replace("\n", "")

    data = json.loads(_strip_fences(response.text))
    return clean_hindi_transliterated_text(data["transliterate"])
def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Apply word-level corrections from a pipe-delimited dictionary file.

    Each dictionary line has the form ``initial|incorrect|correct``. When
    `initial_lookup` is True, whitespace-separated words equal to an
    ``initial`` form are replaced with ``correct``; otherwise words equal to
    an ``incorrect`` form are replaced. Each mapping is also registered with
    a trailing '.', '?' or ',' so words at sentence boundaries still match.

    Returns the text with matching words substituted; returns `text`
    unchanged when `dictionary_path` is falsy.
    """
    if not dictionary_path:
        return text
    with open(dictionary_path, encoding="utf-8") as f:
        rows = [line.split("|") for line in f.read().splitlines()]
    initial_pass_dict = {}
    final_pass_dict = {}
    # Idiom fix: the original repeated eight near-identical assignments; one
    # loop over the punctuation suffixes covers the same cases.
    for initial, incorrect, correct in rows:
        for suffix in ("", ".", "?", ","):
            initial_pass_dict[initial + suffix] = correct + suffix
            final_pass_dict[incorrect + suffix] = correct + suffix
    lookup = initial_pass_dict if initial_lookup else final_pass_dict
    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(lookup.get(t, t) for t in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text
def get_google_token():
    """Exchange the service-account key in GCP_FINETUNE_KEY for an OAuth access token."""
    scopes = [
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/generative-language.tuning",
    ]
    key_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
    credentials, _project = google.auth.load_credentials_from_dict(key_info, scopes=scopes)
    # Credentials start without a token; refresh() performs the token exchange.
    credentials.refresh(Request())
    return credentials.token
def transliterate_first_word(text):
    """Transliterate only the first word of `text` to Hindi via Google Input Tools.

    Returns `text` unchanged when it is empty/whitespace-only or when its
    first word contains non-alphanumeric characters (punctuation etc.);
    otherwise returns the transliterated first word followed by the rest.
    """
    texts = text.split(maxsplit=1)
    # BUG FIX: empty or whitespace-only input used to raise IndexError on
    # texts[0]; treat it as "nothing to transliterate".
    if not texts:
        return text
    if len(texts) > 1:
        first_word, rest = texts
    else:
        first_word, rest = texts[0], ""
    if not first_word.isalnum():
        return text
    # Google Input Tools transliteration endpoint; itc=hi-t-i0-und selects
    # Latin -> Hindi transliteration, num=1 asks for a single candidate.
    params = {
        "text": first_word,
        "num": 1,
        "itc": "hi-t-i0-und",
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    response = requests.get("https://inputtools.google.com/request", params=params)
    # Response shape: [status, [[input, [candidate, ...], ...]]]
    results = response.json()[1][0][1]
    first_word_transliterated = results[0]
    return f"{first_word_transliterated} {rest}"
def clean(result):
    """Extract and sanitize the assistant message from an OpenAI chat response."""
    content = result["choices"][0]["message"]["content"]
    # Drop parenthesized/bracketed asides, then stray quote characters.
    content = re.sub(r"\(.*?\)|\[.*?\]", "", content)
    content = content.strip("'").replace('"', "").replace('`', "")
    # If the model returned several lines, keep only the final one.
    if "\n" in content.strip("\n"):
        content = content.split("\n")[-1]
    return clean_hindi_transliterated_text(content)
def clean_hindi_transliterated_text(text):
    """Normalize Devanagari vowel signs and strip JSON/markdown artifacts from `text`."""
    text = text.replace('`', '').replace("output:", "")
    # Substitutions applied in order: vowel-sign normalization first, then
    # removal of leftover JSON punctuation the model sometimes emits.
    replacements = (
        ('ऑ', 'औ'),
        ('ॉ', 'ौ'),
        ('ॅ', 'े'),
        ("{", ""),
        ("}", ""),
        ("'text'", ""),
        (":", ""),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    stripped = text.strip().strip("'").strip('"')
    # Final dictionary pass: map known-incorrect transliterations to correct ones.
    return update_text_from_dictionary(stripped, initial_lookup=False)
def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English `text` into mixed Roman/Devanagari script.

    When `call_gpt` is truthy, runs a dictionary pre-pass and asks the OpenAI
    gpt-4 chat endpoint; otherwise delegates to the Gemini chat session.

    Returns the cleaned transliterated text.

    Raises RuntimeError if the OpenAI API keeps failing after retries.
    """
    if not call_gpt:
        return generate_transliteration_gemini_15_pro(text)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
    }
    # First dictionary pass: rewrite known words before prompting the model.
    text = update_text_from_dictionary(text, initial_lookup=True)
    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    messages = [
        {"role": "user", "content": prompt}
    ]
    # BUG FIX: the original `while resp is None` loop always exited after one
    # iteration (resp was assigned even on error), so a failed call fell
    # through and the error payload was parsed as a result. Retry a bounded
    # number of times, then fail loudly.
    for _attempt in range(5):
        resp = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json={
            "model": "gpt-4",
            "messages": messages
        })
        if resp.status_code == 200:
            return clean(resp.json())
        print(resp.text)
        time.sleep(0.5)
    raise RuntimeError(f"OpenAI API request failed after retries: {resp.status_code}")
def generate_rephrases_gemini(text, language, problem):
    """Rephrase `text` for dubbing timing via a fine-tuned Gemini REST endpoint.

    `problem` == "Gap" asks for a rendering that takes "more" time to speak;
    any other value asks for "less". `language` must be "English" or "Hindi"
    and selects the system prompt.

    Returns a (rephrased_text, word_count_summary) tuple.

    Raises ValueError for an unsupported `language`.
    """
    API_URL = os.environ.get("GEMINI_REPHRASER_API")
    BEARER_TOKEN = get_google_token()
    headers = {
        "Authorization": f"Bearer {BEARER_TOKEN}",
        "Content-Type": "application/json",
    }
    speak = "more" if problem == "Gap" else "less"
    if language == "English":
        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
    elif language == "Hindi":
        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
    else:
        # BUG FIX: an unknown language previously raised NameError on `prompt`.
        raise ValueError(f"Unsupported language: {language!r}")
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }
    result = requests.post(url=API_URL, headers=headers, json=payload)
    output_text = result.json()["candidates"][0]["content"]["parts"][0]["text"]
    # Strip list-style dashes the model sometimes prefixes to each line.
    lines = [line.replace("-", "").strip() for line in output_text.split("\n")]
    texts = "\n".join(lines)
    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"
    return texts, wc
# --- Gradio UI --------------------------------------------------------------
# NOTE(review): the original paste lost indentation, so the exact widget
# nesting (which components sit inside which Row/Column) is reconstructed
# conservatively here — confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Translator Assistance Tools")
    # Tab 1: English -> mixed Roman/Devanagari transliteration.
    with gr.Tab("Transliteration"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input text", info="Please enter English text.")
                # Checkbox feeds the `call_gpt` flag of dubpro_english_transliteration.
                full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
            output_text = gr.Textbox(label="Output text")
        transliterate = gr.Button("Submit")
        transliterate.click(dubpro_english_transliteration, [input_text, full_transliteration], output_text)
    # Tab 2: rephrase text to fit a dubbing time slot (Gap = speak longer,
    # Overflow = speak shorter).
    with gr.Tab("Rephraser Tool"):
        with gr.Row():
            rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
            language = gr.Dropdown(["English", "Hindi"], value="Hindi")
            solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:")
        with gr.Row():
            word_count = gr.Textbox(label="Word count")
            rephrased_text = gr.Textbox(label="Output text")
        rephrase = gr.Button("Submit")
        rephrase.click(generate_rephrases_gemini, [rephrase_text, language, solving_for], [rephrased_text, word_count])
# Basic-auth credentials come from the environment; both must be set.
demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD")))