Spaces:

deepsync
/

english-rephraser-transliterator

Running

App Files Files Community

english-rephraser-transliterator / app.py

deepsync

Update app.py

199305f verified 2 months ago

raw

history blame

No virus

12.5 kB

	import os
	import re
	import json
	import time
	import requests
	import gradio as gr

	import google.auth
	from google.auth.transport.requests import Request

	import google.generativeai as genai

	# Hand the Gemini SDK the API key read from the GEMINI_API_KEY environment variable.
	genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

	def upload_to_gemini(path, mime_type=None):
	    """Upload the file at *path* through the Gemini SDK and return the result.

	    The returned object's display name and URI are printed once the upload
	    call has returned.
	    """
	    uploaded = genai.upload_file(path, mime_type=mime_type)
	    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
	    return uploaded

	# Sampling and output settings passed to the GenerativeModel constructor.
	generation_config = dict(
	    temperature=1,
	    top_p=0.95,
	    top_k=64,
	    max_output_tokens=1_048_576,
	    response_mime_type="text/plain",
	)

	# One entry per harm category, each with the threshold set to BLOCK_NONE.
	safety_settings = [
	    {"category": harm_category, "threshold": "BLOCK_NONE"}
	    for harm_category in (
	        "HARM_CATEGORY_HARASSMENT",
	        "HARM_CATEGORY_HATE_SPEECH",
	        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
	        "HARM_CATEGORY_DANGEROUS_CONTENT",
	    )
	]

	# Gemini model handle, configured with the safety and generation settings
	# defined earlier in this file plus a fixed system instruction.
	model = genai.GenerativeModel(
	model_name="gemini-1.5-pro-latest",
	safety_settings=safety_settings,
	generation_config=generation_config,
	system_instruction="Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script",
	)

	# Uploaded when the module is imported; the returned handle is attached to
	# the priming message of the chat session below.
	transliteration_example_file = upload_to_gemini(
	"ai_exp_json.txt", mime_type="text/plain"
	)

	# Chat session seeded with a single user turn: two instruction strings
	# followed by the uploaded examples file.
	# NOTE(review): this one session object is reused by every call to
	# generate_transliteration_gemini_15_pro, so earlier requests presumably
	# remain in its history — confirm that is intended.
	chat_session = model.start_chat(
	history=[
	{
	"role": "user",
	"parts": [
	"Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
	'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
	transliteration_example_file,
	],
	},
	]
	)


	def generate_transliteration_gemini_15_pro(text):
	    """Ask the shared Gemini chat session for a mixed-script transliteration.

	    The sentence is wrapped in a fixed prompt, the reply is stripped of
	    Markdown code fences and newlines, parsed as JSON, and the value under
	    the "transliterate" key is post-processed by
	    clean_hindi_transliterated_text.

	    Args:
	        text: The English sentence to transliterate.

	    Returns:
	        The cleaned transliteration string.

	    Raises:
	        json.JSONDecodeError: If the reply is not valid JSON after cleanup.
	        ValueError: If the reply is an empty JSON list.
	        KeyError: If the parsed object has no "transliterate" key.
	    """
	    prompt = (
	        'Given an English sentences: \n```' + text + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
	    )
	    # NOTE(review): this sends through the module-level chat_session, so
	    # previous requests presumably stay in its history — confirm intended.
	    response = chat_session.send_message(prompt)
	    print(response.text)

	    # The model tends to wrap its JSON in a fenced block; drop the fences and
	    # newlines before parsing.
	    raw_json = response.text.replace("```json", "").replace("```", "").replace("\n", "")
	    data = json.loads(raw_json)

	    # The prompt asks for one object, but a one-element list is accepted too.
	    if isinstance(data, list):
	        if not data:
	            raise ValueError("Gemini returned an empty JSON list; no transliteration available")
	        data = data[0]
	    return clean_hindi_transliterated_text(data["transliterate"])



	def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
	    """Replace whitespace-separated words of *text* using a three-column dictionary file.

	    Each dictionary line holds three fields: initial, incorrect, correct.
	    Fields are separated by a pipe, optionally preceded by a backslash.
	    Lines that do not split into exactly three fields (blank lines, for
	    example) are skipped.

	    Args:
	        text: The text to rewrite. It is split on whitespace and re-joined
	            with single spaces.
	        dictionary_path: Path of the dictionary file. A falsy value disables
	            the lookup and returns *text* unchanged.
	        initial_lookup: When truthy, words are matched against the first
	            column; otherwise against the second column. In both cases the
	            replacement is the third column.

	    Returns:
	        The rewritten text. A word followed by ".", "?" or "," is matched as
	        well and keeps that trailing character.
	    """
	    if not dictionary_path:
	        return text

	    # The file holds Devanagari text, so do not rely on the platform default encoding.
	    with open(dictionary_path, encoding="utf-8") as f:
	        lines = f.read().splitlines()

	    # Only the table that is actually consulted gets built.
	    key_column = 0 if initial_lookup else 1
	    lookup = {}
	    for line in lines:
	        # Accept both `a\|b\|c` and `a|b|c`.
	        fields = re.split(r"\\?\|", line)
	        if len(fields) != 3:
	            continue
	        key, correct = fields[key_column], fields[2]
	        for suffix in ("", ".", "?", ","):
	            lookup[key + suffix] = correct + suffix

	    print(f"Original [{initial_lookup}]: ", text)
	    new_text = " ".join([lookup.get(t, t) for t in text.split()])
	    print(f"New [{initial_lookup}]: ", new_text)
	    return new_text


	def get_google_token():
	    """Return an access token for the credentials stored in GCP_FINETUNE_KEY.

	    The environment variable is parsed as JSON, loaded as Google credentials
	    with the cloud-platform and generative-language tuning scopes, refreshed
	    once, and the resulting token is returned.
	    """
	    credentials_info = json.loads(os.environ.get('GCP_FINETUNE_KEY'))
	    oauth_scopes = [
	        "https://www.googleapis.com/auth/cloud-platform",
	        "https://www.googleapis.com/auth/generative-language.tuning",
	    ]
	    credentials, _project = google.auth.load_credentials_from_dict(
	        credentials_info,
	        scopes=oauth_scopes,
	    )
	    # The token attribute is read only after the refresh call.
	    credentials.refresh(Request())
	    return credentials.token


	def transliterate_first_word(text):
	    """Transliterate only the first word of *text* via the Google Input Tools endpoint.

	    Args:
	        text: Input text. The first whitespace-delimited word is sent for
	            transliteration; the remainder is appended unchanged.

	    Returns:
	        The text with its first word replaced by the first candidate in the
	        response. *text* is returned unchanged when it contains no words or
	        when the first word is not purely alphanumeric.

	    Raises:
	        requests.RequestException: On timeout, connection failure or a
	            non-success HTTP status.
	    """
	    words = text.split(maxsplit=1)
	    if not words:
	        # Empty or whitespace-only input: nothing to transliterate.
	        return text
	    first_word = words[0]
	    rest = words[1] if len(words) > 1 else ""
	    if not first_word.isalnum():
	        return text

	    url = "https://inputtools.google.com/request"
	    params = {
	        "text": first_word,
	        "num": 1,
	        "itc": "hi-t-i0-und",
	        "cp": 0,
	        "cs": 1,
	        "ie": "utf-8",
	        "app": "demopage",
	    }
	    # Bound the wait so a stalled request cannot hang the caller.
	    response = requests.get(url, params=params, timeout=10)
	    response.raise_for_status()
	    # The candidate list is read from index [1][0][1] of the JSON payload.
	    first_word_transliterated = response.json()[1][0][1][0]
	    if not rest:
	        # Avoid the trailing space a bare f-string join would leave behind.
	        return first_word_transliterated
	    return f"{first_word_transliterated} {rest}"


	def clean(result):
	    """Extract and tidy the transliterated text from a chat-completions response.

	    Args:
	        result: Parsed JSON response; the text is read from
	            result["choices"][0]["message"]["content"].

	    Returns:
	        The content with parenthesised and bracketed asides, quotes and
	        backticks removed, reduced to its last line, and then passed through
	        clean_hindi_transliterated_text.
	    """
	    text = result["choices"][0]['message']["content"]
	    # Drop "(...)" and "[...]" asides; non-greedy so separate groups are
	    # removed individually.
	    text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
	    text = text.strip("'").replace('"', "").replace('`', "")
	    # Strip surrounding newlines first, so that a trailing newline does not
	    # make the "last line" an empty string.
	    text = text.strip("\n")
	    if "\n" in text:
	        text = text.split("\n")[-1]
	    return clean_hindi_transliterated_text(text)


	def clean_hindi_transliterated_text(text):
	    """Normalise model output and apply the second-column dictionary corrections.

	    A fixed sequence of substring replacements is applied in order, the
	    result is trimmed of whitespace and surrounding quotes, and the trimmed
	    text is passed to update_text_from_dictionary with initial_lookup=False.
	    """
	    # Order matters: "output:" must be removed before the bare ":" rule runs.
	    substitutions = (
	        ('`', ''),
	        ("output:", ""),
	        ('ऑ', 'औ'),
	        ('ॉ', 'ौ'),
	        ('ॅ', 'े'),
	        ("{", ""),
	        ("}", ""),
	        ("'text'", ""),
	        (":", ""),
	    )
	    for old, new in substitutions:
	        text = text.replace(old, new)
	    trimmed = text.strip().strip("'").strip('"')
	    return update_text_from_dictionary(trimmed, initial_lookup=False)





	def dubpro_english_transliteration(text, call_gpt):
	    """Transliterate English text to Hindi script using OpenAI or Gemini.

	    Args:
	        text: The English text to transliterate.
	        call_gpt: When truthy, the text is first run through the
	            first-column dictionary lookup and then sent to the OpenAI chat
	            completions endpoint. Otherwise the request is delegated to
	            generate_transliteration_gemini_15_pro.

	    Returns:
	        The cleaned transliterated text.

	    Raises:
	        RuntimeError: If the OpenAI request does not succeed within the
	            retry budget.
	    """
	    if not call_gpt:
	        return generate_transliteration_gemini_15_pro(text)

	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
	    }

	    text = update_text_from_dictionary(text, initial_lookup=True)

	    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
	    payload = {
	        "model": "gpt-4o-2024-05-13",
	        "messages": [{"role": "user", "content": prompt}],
	    }

	    # Retry a bounded number of times. A failed attempt must not fall through
	    # to clean(), because an error payload has no "choices" key.
	    max_attempts = 5
	    for attempt in range(1, max_attempts + 1):
	        try:
	            resp = requests.post(
	                "https://api.openai.com/v1/chat/completions",
	                headers=headers,
	                json=payload,
	                timeout=60,
	            )
	        except requests.RequestException as exc:
	            print(f"OpenAI request failed (attempt {attempt}/{max_attempts}): {exc}")
	        else:
	            if resp.status_code == 200:
	                return clean(resp.json())
	            print(resp.text)
	        if attempt < max_attempts:
	            time.sleep(0.5)
	    raise RuntimeError(
	        f"OpenAI transliteration request did not succeed after {max_attempts} attempts"
	    )


	def generate_rephrases_gemini(text, language, problem):
	    """Rephrase *text* so it takes slightly more or less time to speak.

	    Args:
	        text: The sentence to rephrase.
	        language: "English" or "Hindi"; selects the instruction prompt.
	        problem: "Gap" asks for a rephrase that takes more time to speak;
	            any other value asks for one that takes less.

	    Returns:
	        A tuple ``(rephrased_text, word_count_summary)``. Hyphens are removed
	        from the model output and each line is stripped; the summary reports
	        the whitespace-delimited word counts of the input and the output.

	    Raises:
	        ValueError: If *language* is neither "English" nor "Hindi".
	        requests.RequestException: On timeout, connection failure or a
	            non-success HTTP status.
	    """
	    speak = "more" if problem == "Gap" else "less"

	    # Validate before any network or credential work, so an unsupported
	    # language fails fast with a clear message.
	    if language == "English":
	        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
	    elif language == "Hindi":
	        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
	    else:
	        raise ValueError(f"Unsupported language: {language!r} (expected 'English' or 'Hindi')")

	    API_URL = os.environ.get("GEMINI_REPHRASER_API")
	    BEARER_TOKEN = get_google_token()
	    headers = {
	        "Authorization": f"Bearer {BEARER_TOKEN}",
	        "Content-Type": "application/json",
	    }

	    payload = {
	        "contents": [
	            {
	                "parts": [
	                    {"text": prompt},
	                    {"text": f"input: {text}"},
	                    {"text": "output: "},
	                ],
	                "role": "user",
	            }
	        ],
	        "generationConfig": {
	            "maxOutputTokens": 8192,
	            "temperature": 0.85,
	            "candidateCount": 1,
	        },
	        "safetySettings": [
	            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
	            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
	            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
	            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
	        ],
	    }
	    result = requests.post(url=API_URL, headers=headers, json=payload, timeout=60)
	    # Surface HTTP errors directly rather than as a KeyError on the payload.
	    result.raise_for_status()
	    response = result.json()
	    output_text = response["candidates"][0]["content"]["parts"][0]["text"]

	    texts = "\n".join(line.replace("-", "").strip() for line in output_text.split("\n"))

	    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"

	    return texts, wc


	# Gradio UI with two tabs, one per tool.
	# NOTE(review): the indentation of the nested `with` layout contexts is not
	# visible in this copy — confirm which components belong to which
	# Row/Column before relying on the layout.
	with gr.Blocks() as demo:
	gr.Markdown("# Translator Assistance Tools")
	# Tab 1: transliteration. The checkbox value is passed as the `call_gpt`
	# argument of dubpro_english_transliteration.
	with gr.Tab("Transliteration"):
	with gr.Row():
	with gr.Column():
	input_text = gr.Textbox(label="Input text", info="Please enter English text.")
	full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
	output_text = gr.Textbox(label="Output text")
	transliterate = gr.Button("Submit")
	transliterate.click(dubpro_english_transliteration, [input_text, full_transliteration], output_text)

	# Tab 2: rephraser. The inputs map to (text, language, problem) and the
	# outputs receive the (rephrased text, word count summary) tuple.
	with gr.Tab("Rephraser Tool"):
	with gr.Row():
	rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
	language = gr.Dropdown(["English", "Hindi"], value="Hindi")
	solving_for = gr.Dropdown(["Gap", "Overflow"], value="Overflow", label="Solving for:")
	with gr.Row():
	word_count = gr.Textbox(label="Word count")
	rephrased_text = gr.Textbox(label="Output text")
	rephrase = gr.Button("Submit")
	rephrase.click(generate_rephrases_gemini, [rephrase_text, language, solving_for], [rephrased_text, word_count])


	# Launch the app, passing the USERNAME and PASSWORD environment variables as
	# the auth pair.
	demo.launch(auth=(os.environ.get("USERNAME"), os.environ.get("PASSWORD")))