# DL4NLP / language_translation.py
# santanus24's picture
# uploading all .py files
# 9b5fe77 verified
# !pip install googletrans==3.1.0a0
# !pip install transformers sentencepiece
from functools import lru_cache

from googletrans import Translator
from transformers import MarianMTModel, MarianTokenizer  # transformer based pre-trained language translation model
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
def translate_hi2en_gtrans(sentence):
    """
    Translate a Hindi sentence to English with the googletrans client.

    Args:
    - sentence: string in Hindi

    Returns:
    - English translated text string. A value that is not a string (for
      example a missing cell when this is mapped over a DataFrame) or a
      blank string is returned unchanged, without any translation request.
    """
    # Nothing to translate, so skip the request entirely.
    if not isinstance(sentence, str) or not sentence.strip():
        return sentence

    translator = Translator()
    output = translator.translate(sentence, dest='en', src='hi')
    return output.text
def translate_en2hi_gtrans(sentence):
    """
    Translate an English sentence to Hindi with the googletrans client.

    Args:
    - sentence: string in English

    Returns:
    - Hindi translated text string. A value that is not a string (for
      example a missing cell when this is mapped over a DataFrame) or a
      blank string is returned unchanged, without any translation request.
    """
    # Nothing to translate, so skip the request entirely.
    if not isinstance(sentence, str) or not sentence.strip():
        return sentence

    translator = Translator()
    output = translator.translate(sentence, dest='hi', src='en')
    return output.text
@lru_cache(maxsize=None)
def _load_en_hi_marian():
    """Load the English-to-Hindi MarianMT tokenizer and model, once per process."""
    model_name_en_hi = "Helsinki-NLP/opus-mt-en-hi"  # English to Hindi translation model
    tokenizer = MarianTokenizer.from_pretrained(model_name_en_hi)
    model_en_hi = MarianMTModel.from_pretrained(model_name_en_hi)
    return tokenizer, model_en_hi


# Translates text from English to Hindi using the pre-trained model
def translate_en_hi_transformer(text):
    """
    Translate English text to Hindi with the Helsinki-NLP MarianMT model.

    The tokenizer and model are loaded on the first call and reused on
    later calls (they stay in memory for the life of the process), so
    repeated translations do not reload the weights each time.

    Args:
    - text: string in English

    Returns:
    - Hindi translated text string (the first decoded sequence)
    """
    tokenizer, model_en_hi = _load_en_hi_marian()
    encoded = tokenizer(text, return_tensors="pt")
    translated = model_en_hi.generate(**encoded)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
@lru_cache(maxsize=None)
def _load_hi_en_marian():
    """Load the Hindi-to-English MarianMT tokenizer and model, once per process."""
    model_name_hi_en = "Helsinki-NLP/opus-mt-hi-en"  # Hindi to English translation model
    tokenizer_hi = MarianTokenizer.from_pretrained(model_name_hi_en)
    model_hi_en = MarianMTModel.from_pretrained(model_name_hi_en)
    return tokenizer_hi, model_hi_en


# Translates text from Hindi to English using the pre-trained model
def translate_hi_en_transformer(text):
    """
    Translate Hindi text to English with the Helsinki-NLP MarianMT model.

    The tokenizer and model are loaded on the first call and reused on
    later calls (they stay in memory for the life of the process), so
    repeated translations do not reload the weights each time.

    Args:
    - text: string in Hindi

    Returns:
    - English translated text string (the first decoded sequence)
    """
    tokenizer_hi, model_hi_en = _load_hi_en_marian()
    encoded = tokenizer_hi(text, return_tensors="pt")
    translated = model_hi_en.generate(**encoded)
    return tokenizer_hi.batch_decode(translated, skip_special_tokens=True)[0]
@lru_cache(maxsize=None)
def _load_mbart():
    """Load the mBART-50 many-to-many model and tokenizer, once per process."""
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    return model, tokenizer


def translate_mbart(text, source_lang, target_lang):
    """
    Translate text between two languages with the mBART-50 many-to-many model.

    The model and tokenizer are loaded on the first call and reused on
    later calls (they stay in memory for the life of the process) instead
    of being reloaded for every translation.

    Args:
    - text: string to translate
    - source_lang: language code of `text` as used by the tokenizer,
      e.g. "hi_IN" or "en_XX"
    - target_lang: language code to translate into; it is looked up in
      the tokenizer's `lang_code_to_id` table

    Returns:
    - translated text string (the first decoded sequence)
    """
    model, tokenizer = _load_mbart()

    # The tokenizer is shared between calls, so the source language is set
    # on every call before encoding.
    tokenizer.src_lang = source_lang
    encoded_text = tokenizer(text, return_tensors="pt")

    # Force the first generated token to be the target language token.
    forced_bos_token_id = tokenizer.lang_code_to_id[target_lang]
    generated_tokens = model.generate(**encoded_text, forced_bos_token_id=forced_bos_token_id)

    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translation
if __name__ == "__main__":
    # Demo script: exercises each translation helper in turn. It needs network
    # access for googletrans / model downloads and a local 'Data_with_QnA.csv'.

    # --- googletrans round trip -------------------------------------------
    print(translate_hi2en_gtrans("मैं खुश हूँ!!!"))
    print(translate_en2hi_gtrans("I am happy!!!"))

    import pandas as pd

    # Read CSV file into a Pandas DataFrame
    df_en = pd.read_csv('Data_with_QnA.csv', usecols=['Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4'])
    # A bare `df.head()` only displays something in a notebook; print it so
    # the preview is visible when this runs as a script.
    print(df_en.head())

    # convert all the question answers from English to Hindi
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides.
    translate_cells = df_en.map if hasattr(df_en, "map") else df_en.applymap
    df_hi = translate_cells(translate_en2hi_gtrans)
    print(df_hi.head())

    # Save the modified DataFrame to a CSV file
    df_hi.to_csv('Hindi_QnA.csv', index=False)

    # --- MarianMT ----------------------------------------------------------
    # English to Hindi example
    english_text = " What is the material used to create the chess set?"
    hindi_translation = translate_en_hi_transformer(english_text)
    print(f"English: {english_text}")
    print(f"Hindi: {hindi_translation}")

    # Hindi to English example
    hindi_text = "आपका दिन कैसा चल रहा है?"  # How is your day going?
    english_translation = translate_hi_en_transformer(hindi_text)
    print(f"Hindi: {hindi_text}")
    print(f"English: {english_translation}")

    # --- mBART-50 ----------------------------------------------------------
    # Example usage
    hindi_text = "हिन्दी साहित्य पर अगर समुचित परिप्रेक्ष्य में विचार किया जाए तो स्पष्ट होता है कि हिन्दी साहित्य का इतिहास अत्यन्त विस्तृत व प्राचीन है। सुप्रसिद्ध भाषा वैज्ञानिक डॉ० हरदेव बाहरी के शब्दों में, हिन्दी साहित्य का इतिहास वस्तुतः वैदिक काल से आरम्भ होता है। यह कहना ही ठीक होगा कि वैदिक भाषा ही हिन्दी है। इस भाषा का दुर्भाग्य रहा है कि युग-युग में इसका नाम परिवर्तित होता रहा है। कभी 'वैदिक', कभी 'संस्कृत', कभी 'प्राकृत', कभी'अपभ्रंश' और अब - हिन्दी।[1] आलोचक कह सकते हैं कि 'वैदिक संस्कृत' और 'हिन्दी' में तो जमीन-आसमान का अन्तर है। पर ध्यान देने योग्य है कि हिब्रू, रूसी, चीनी, जर्मन और तमिल आदि जिन भाषाओं को 'बहुत पुरानी' बताया जाता है, उनके भी प्राचीन और वर्तमान रूपों में जमीन-आसमान का अन्तर है; पर लोगों ने उन भाषाओं के नाम नहीं बदले और उनके परिवर्तित स्वरूपों को 'प्राचीन', 'मध्यकालीन', 'आधुनिक' आदि कहा गया, जबकि 'हिन्दी' के सन्दर्भ में प्रत्येक युग की भाषा का नया नाम रखा जाता रहा।"
    english_translation = translate_mbart(hindi_text, "hi_IN", "en_XX")
    print(english_translation)

    # The passage is written as two adjacent literals (joined by Python at
    # compile time) because a plain quoted string cannot span a line break.
    english_text = (
        "English literature, the body of written works produced in the English language by inhabitants of the British Isles (including Ireland) from the 7th century to the present day. The major literatures written in English outside the British Isles are treated separately under American literature, Australian literature, Canadian literature, and New Zealand literature. English literature has sometimes been stigmatized as insular. It can be argued that no single English novel attains the universality of the Russian writer Leo Tolstoy’s War and Peace or the French writer Gustave Flaubert’s Madame Bovary. Yet in the Middle Ages the Old English literature of the subjugated Saxons was leavened by the Latin and Anglo-Norman writings, eminently foreign in origin, in which the churchmen and the Norman conquerors expressed themselves. From this combination emerged a flexible and subtle linguistic instrument exploited by Geoffrey Chaucer and brought to supreme application by William Shakespeare. During the Renaissance the renewed interest in Classical learning and values had an important effect on English literature, as on all the arts; and ideas of Augustan literary propriety in the 18th century and reverence in the 19th century for a less specific, though still selectively viewed, Classical antiquity continued to shape the literature. All three of these impulses derived from a foreign source, namely the Mediterranean basin. The Decadents of the late 19th century and the Modernists of the early 20th looked to continental European individuals and movements for inspiration. Nor was attraction toward European intellectualism dead in the late 20th century, for by the mid-1980s the approach known as structuralism, a phenomenon predominantly French and German in origin, infused the very study of English literature itself in a host of published critical studies and university departments. "
        "Additional influence was exercised by deconstructionist analysis, based largely on the work of French philosopher Jacques Derrida."
    )
    hindi_translation = translate_mbart(english_text, "en_XX", "hi_IN")
    print(hindi_translation)