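"""Streamlit app for Hinglish text normalization, translation, and sentiment analysis.

Pipeline (as implemented below): clean the input tweet, normalize English and
Hinglish tokens against pickled vocabularies, correct leftover tokens via
phonetic (RefinedSoundex) and Levenshtein matching, translate/transliterate to
Hindi and back to English, then classify sentiment with a transformers model.
"""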
import pickle
import re

import joblib
import pandas as pd
import streamlit as st
import torch
from bs4 import BeautifulSoup
from googletrans import Translator
from indictrans import Transliterator
from Levenshtein import distance as lev
from pyphonetics import RefinedSoundex
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the sentiment analysis model and tokenizer once at startup
tokenizer = AutoTokenizer.from_pretrained("Seethal/sentiment_analysis_generic_dataset")
model = AutoModelForSequenceClassification.from_pretrained("Seethal/sentiment_analysis_generic_dataset")
# Map a model prediction to a sentiment label
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    sentiment = torch.argmax(outputs.logits, dim=1).item()
    # This checkpoint exposes three classes; the original mapped 1 -> Positive,
    # else Negative, which mislabels the neutral class. Mapping below assumes
    # the model card's label order (0 negative, 1 neutral, 2 positive).
    return {0: 'Negative', 1: 'Neutral', 2: 'Positive'}.get(sentiment, 'Negative')
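# Example usage (hypothetical output; the label depends on the checkpoint's
# id2label mapping):
#   get_sentiment("the product works well")  # -> 'Positive'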
# Return the vocabulary entry with the smallest Levenshtein distance to `word`
def closest_match(word, vocabulary):
    best_match = None
    best_distance = float('inf')
    for vocab_word in vocabulary:
        dist = lev(word, vocab_word)
        if dist < best_distance:
            best_distance = dist
            best_match = vocab_word
    return best_match
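# Example: closest_match("helo", ["hello", "world"]) -> "hello" (distance 1 vs. 4)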
def main():
    st.title('Text Processing App')
    rs = RefinedSoundex()
    normalized_string_final = []
    translator = Translator()
    trn = Transliterator(source='eng', target='hin')

    # The pickled vocabularies map a canonical word to its spelling variants
    with open(r'./english_vocab.pkl', "rb") as fp:
        english_vocab = pickle.load(fp)
    with open(r'./hinglish_vocab.pkl', "rb") as fp:
        hinglish_vocab = pickle.load(fp)
    english_vocab['and'] = ['and']
    english_vocab['is'] = ['is']
    def clean_tweet(tweet):
        text = re.sub(r'@[A-Za-z0-9\']+', '', tweet)         # drop @mentions
        text = BeautifulSoup(text, 'lxml').get_text()        # strip HTML
        text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # drop URLs
        text = re.sub(r'https[A-Za-z0-9/.]*', '', text)      # drop mangled URLs
        text = re.sub("[^a-zA-Z]", " ", text)                # keep letters only
        text = re.sub(r'\bRT\b', ' ', text)                  # drop retweet marker
        text = re.sub(r'\bnan\b', ' ', text)                 # drop stray 'nan's
        return text
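    # Example: clean_tweet("RT @user check https://t.co/x") -> "   check "
    # (roughly; removed spans are replaced by spaces, not collapsed)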
    input_text = st.text_area("Enter the text:")
    total_translated = []
    if st.button('Process'):
        data = {'Text': [input_text]}
        df1 = pd.DataFrame(data)
        df1['Text'] = df1['Text'].apply(clean_tweet)
        cleaned_text = df1['Text'].tolist()[0]
        total_text = [cleaned_text]
        st.write("Input Text:", total_text)
        for text in tqdm(total_text):
            test_text = text.split()
            not_changed_idx = [0] * len(test_text)  # 1 = matched in english_vocab
            changed_text = []
            changed_idx = []
            # Replace tokens that match a known English spelling variant
            for i in range(len(test_text)):
                done = 0
                for key in english_vocab:
                    for val in english_vocab[key]:
                        if test_text[i] == val:
                            changed_text.append(key)
                            changed_idx.append(i)
                            not_changed_idx[i] = 1
                            done = 1
                            break
                    if done == 1:
                        break
            normalized_string = []
            res = dict(zip(changed_idx, changed_text))
            for i in range(len(test_text)):
                normalized_string.append(res.get(i, test_text[i]))
            english_normalized = list(normalized_string)  # snapshot; translated in place later
            print("English Normalized String:", normalized_string)
            # Hinglish word change: normalize the tokens english_vocab did not cover
            test_list = [i for i in range(len(test_text))]
            changed_hing_idx = [i for i in test_list if i not in changed_idx]
            hinglish_text_part = [test_text[i] for i in changed_hing_idx]
            changed_text2 = []
            changed_idx2 = []
            for i in range(len(hinglish_text_part)):
                done = 0
                for key in hinglish_vocab:
                    for val in hinglish_vocab[key]:
                        if hinglish_text_part[i] == val:
                            changed_text2.append(key)
                            changed_idx2.append(i)
                            done = 1
                            break
                    if done == 1:
                        break
            # Map matches back to their positions in test_text so normalized_string2
            # stays aligned with the original word order (the original appended the
            # English words at the end, which broke the index-based correction below)
            res2 = {changed_hing_idx[idx]: word for idx, word in zip(changed_idx2, changed_text2)}
            normalized_string2 = []
            for i in range(len(test_text)):
                if i in res:
                    normalized_string2.append(res[i])
                elif i in res2:
                    normalized_string2.append(res2[i])
                else:
                    normalized_string2.append(test_text[i])
            hinglish_normalized = list(normalized_string2)  # snapshot before corrections
            print("Hinglish Normalized String:", normalized_string2)
            # Phoneme + Levenshtein distance correction for words no vocabulary matched
            for i in range(len(not_changed_idx)):
                try:
                    if not_changed_idx[i] == 0:
                        # English candidates within RefinedSoundex distance 1
                        eng_phoneme_correction = []
                        for j in english_vocab:
                            try:
                                if rs.distance(normalized_string2[i], j) <= 1:
                                    eng_phoneme_correction.append(j)
                            except Exception:
                                continue
                        # Keep candidates within Levenshtein distance 2
                        eng_lev_correction = [k for k in eng_phoneme_correction
                                              if lev(normalized_string2[i], k) <= 2]
                        # Hinglish candidates, filtered the same way (the original
                        # referenced hing_lev_correction without ever defining it)
                        hing_lev_correction = []
                        for j in hinglish_vocab:
                            try:
                                if (rs.distance(normalized_string2[i], j) <= 1
                                        and lev(normalized_string2[i], j) <= 2):
                                    hing_lev_correction.append(j)
                            except Exception:
                                continue
                        new_correction = eng_lev_correction + hing_lev_correction
                        if not new_correction:
                            continue
                        # Choose the candidate closest by Levenshtein distance,
                        # then snap it to the nearest English vocabulary key
                        distances = [lev(normalized_string2[i], cand) for cand in new_correction]
                        min_idx = distances.index(min(distances))
                        suggestion = closest_match(new_correction[min_idx], english_vocab.keys())
                        normalized_string2[i] = suggestion
                except Exception:
                    pass
            normalized_string_final = normalized_string2
            print("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
            # Sentence tagging: per-word language ID (the joblib file is assumed
            # to hold a callable pipeline returning [{'label': ..., 'score': ...}])
            classifier = joblib.load(r"./classifer.joblib")
            classify = []
            for word in normalized_string:
                test_classify = classifier(word)
                classify.append(test_classify[0].get("label"))
            # Translate the words tagged as English into Hindi, in place
            for i in range(len(classify)):
                if classify[i] == 'en':
                    try:
                        normalized_string[i] = translator.translate(normalized_string[i], src='en', dest='hi').text
                    except Exception:
                        normalized_string[i] = "delete"
            print("English -> Hindi Translated String:", normalized_string)
            # Transliterate the remaining romanized (Hinglish) words to Devanagari
            conversion_list = [trn.transform(word) for word in normalized_string]
            print("Hinglish -> Hindi Transliterated String:", conversion_list)
            # Back-translate the full Hindi sentence to English
            sentence = [" ".join(conversion_list)]
            translated = []
            for s in sentence:
                try:
                    translated_text = translator.translate(s, src='hi', dest='en')
                    translated.append(translated_text.text)
                except Exception:
                    translated.append("delete")
            print("Hindi -> English Translated String:", translated)
            total_translated.append(translated[0])
st.write("English Normalized String:", normalized_string) | |
st.write("Hinglish Normalized String:", normalized_string) | |
st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final) | |
st.write("English -> Hindi Translated String:", normalized_string) | |
st.write("Hinglish -> Hindi Transliterated String:", conversion_list) | |
st.write("Hindi -> English Translated String:", translated) | |
# Get the sentiment of the translated text | |
sentiment = get_sentiment(translated[0]) | |
st.write("Sentiment of Translated Text:", sentiment) | |
if __name__ == '__main__':
    main()
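# Launch locally with Streamlit's CLI (file name assumed):
#   streamlit run app.py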