"""Streamlit app: Punjabi sentence summarization and sentiment analysis.

Loads a Keras RNN classifier, a Hugging Face sequence-classification model
and a Hugging Face seq2seq summarizer at import time, then renders a small
UI that classifies (optionally summarized) user text as Positive/Negative.
"""

import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pickle
import joblib
import string

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Characters whose presence makes the input "not Punjabi" for this app:
# ASCII letters, ASCII digits and ASCII punctuation.
_DISALLOWED_CHARS = frozenset(
    string.ascii_letters + string.digits + string.punctuation
)

# Sequence length the RNN input is padded/truncated to.
_RNN_MAX_LEN = 100

# Target-language tag used for both the summarizer input and decoder start.
_SUMM_LANG_TOKEN = "<2pa>"

# ---------------------------------------------------------------------------
# Load models and tokenizers
# ---------------------------------------------------------------------------
# NOTE(review): Streamlit re-executes this script on every interaction, so
# everything below is reloaded each time. Consider wrapping the loading in
# `st.cache_resource` if the deployed Streamlit version provides it.

model = load_model('rnn_lstm_final.h5')

# NOTE(review): `loaded_model` is not referenced in the visible part of this
# script; it is kept so module-level behaviour stays unchanged.
loaded_model = joblib.load("my_rnn_model.joblib")

# SECURITY: pickle can execute arbitrary code while loading. Only ever point
# this at a trusted, locally produced file.
with open("tokenizer_and_sequences.pkl", "rb") as f:
    tokenizer, data = pickle.load(f)

model1 = AutoModelForSequenceClassification.from_pretrained('punjabiSentimentAnalysis')
tokenizer1 = AutoTokenizer.from_pretrained('punjabiSentimentAnalysis')

model_summ = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/MultiIndicSentenceSummarizationSS")
tokenizer_summ = AutoTokenizer.from_pretrained(
    "ai4bharat/MultiIndicSentenceSummarizationSS",
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

# The three special tokens must be looked up by their actual token strings;
# looking up "" for all of them yields one and the same id for BOS/EOS/PAD.
bos_id = tokenizer_summ._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer_summ._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer_summ._convert_token_to_id_with_added_voc("<pad>")

# Looked up once instead of on every summarize() call.
_SUMM_LANG_TOKEN_ID = tokenizer_summ._convert_token_to_id_with_added_voc(_SUMM_LANG_TOKEN)


# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------

def is_valid_punjabi_text(text: str) -> bool:
    """Return True if *text* is non-blank and free of ASCII letters/digits/punctuation.

    The check is a blacklist: any ASCII letter, ASCII digit or ASCII
    punctuation character makes the text invalid. Empty or whitespace-only
    text is also rejected so the models are never run on blank input.
    """
    if not text or not text.strip():
        return False
    return not any(char in _DISALLOWED_CHARS for char in text)


def predict_sentiment(text: str, model, tokenizer) -> str:
    """Classify *text* with a Hugging Face sequence-classification model.

    Args:
        text: Text to classify.
        model: Model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching *model*.

    Returns:
        ``"Negative"`` when the arg-max class index is 0, otherwise
        ``"Positive"``.
    """
    # truncation=True keeps over-long input within the tokenizer's configured
    # maximum length instead of failing inside the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return "Negative" if predicted_class == 0 else "Positive"


def summarize(text: str) -> str:
    """Generate a short Punjabi summary of *text* with the seq2seq model.

    Returns:
        The decoded first beam-search result with special tokens removed.
    """
    # Input format follows the model's published usage:
    # "<sentence> </s> <2xx>" with the language tag at the end.
    input_ids = tokenizer_summ(
        f"{text} </s> {_SUMM_LANG_TOKEN}",
        add_special_tokens=False,
        return_tensors="pt",
        padding=True,
    ).input_ids
    model_output = model_summ.generate(
        input_ids,
        use_cache=True,
        no_repeat_ngram_size=3,
        num_beams=5,
        length_penalty=0.8,
        max_length=20,
        min_length=1,
        early_stopping=True,
        pad_token_id=pad_id,
        bos_token_id=bos_id,
        eos_token_id=eos_id,
        decoder_start_token_id=_SUMM_LANG_TOKEN_ID,
    )
    decoded_output = tokenizer_summ.decode(
        model_output[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded_output


def process_input(text: str) -> str:
    """Classify *text* with the Keras RNN model.

    The text is converted to a token-id sequence, post-padded to
    ``_RNN_MAX_LEN`` and reshaped to ``(batch, length, 1)`` before prediction.

    Returns:
        ``"Negative"`` when the first output score exceeds the second,
        otherwise ``"Positive"``.
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, padding='post', maxlen=_RNN_MAX_LEN)
    batch = padded.reshape((padded.shape[0], padded.shape[1], 1))
    prediction = model.predict(batch)
    # Exactly one text was submitted, so only the first row is meaningful.
    first_score, second_score = prediction[0][0], prediction[0][1]
    return "Negative" if first_score > second_score else "Positive"


# ---------------------------------------------------------------------------
# Streamlit app
# ---------------------------------------------------------------------------

st.title("Indic Sentence Summarization & Sentiment Analysis")
st.header("Insightful Echoes: Crafting Summaries with Sentiments (for ਪੰਜਾਬੀ Text)")

model_choice = st.selectbox("Select the Model", ["Indic-Bert", "RNN"])
summarize_before_sentiment = st.checkbox("Summarize before analyzing sentiment")
user_input = st.text_area("Enter some text here")

if st.button("Analyze Sentiment"):
    if not is_valid_punjabi_text(user_input):
        st.warning("Please enter valid Punjabi text.")
    else:
        # Summarize at most once and reuse the result everywhere below.
        summarized_text = summarize(user_input) if summarize_before_sentiment else None
        # Both classifiers score the same text so the checkbox applies to
        # whichever model is shown.
        text_to_analyze = summarized_text if summarize_before_sentiment else user_input

        sentiment_bert = predict_sentiment(text_to_analyze, model1, tokenizer1)
        output_lines = [f"Sentiment (Indic-BERT): {sentiment_bert}"]
        if summarize_before_sentiment:
            output_lines.append(f"Summary: {summarized_text}")

        if model_choice == "RNN":
            sentiment_rnn = process_input(text_to_analyze)
            output_lines.append(f"Sentiment (Bidirectional LSTM): {sentiment_rnn}")
            if summarize_before_sentiment:
                output_lines.append(f"Summary (Bidirectional LSTM): {summarized_text}")

        st.text_area("Sentiment Output", "\n".join(output_lines), height=200)