# Abinaya Mahendiran
# Updated app
# 36338f2
""" Script for streamlit demo
@author: AbinayaM02
"""
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline
import streamlit as st
from pprint import pprint
import json
# Read the config.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default; JSON files are expected to be UTF-8.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)
# Set page layout: browser-tab title, wide content area, sidebar open on load.
# NOTE: per the Streamlit docs, set_page_config has to be the first Streamlit
# command executed by the script, so keep this ahead of any other st.* call.
_page_options = dict(
    page_title="Tamil Language Models",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_page_options)
# Load the model
@st.cache(allow_output_mutation=True)
def load_model(model_name):
    """Fetch the pretrained language model and its matching tokenizer.

    The result is cached by Streamlit (see the decorator), and a spinner is
    shown while the weights and tokenizer are being loaded.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    with st.spinner('Waiting for the model to load.....'):
        # Tuple elements are evaluated left to right: model first, then tokenizer.
        loaded = (
            AutoModelWithLMHead.from_pretrained(model_name),
            AutoTokenizer.from_pretrained(model_name),
        )
    st.success('Model loaded!!')
    return loaded
# Side bar
img = st.sidebar.image("images/tamil_logo.jpg", width=380)

# Choose the model based on selection.
# `page` is one of config["models"]; `data` is picked from the options the
# config lists under the selected page.
page = st.sidebar.selectbox("Model", config["models"])
data = st.sidebar.selectbox("Data", config[page])

# Main page
st.header("Tamil Language Demos")
# Markdown links need the "(url)" to follow "]" immediately; a space between
# them makes the link render as plain text.
st.markdown(
    "This demo uses [GPT2 trained on Oscar dataset](https://huggingface.co/flax-community/gpt-2-tamil) "
    "and [GPT2 trained on Oscar & Indic Corpus dataset](https://huggingface.co/abinayam/gpt-2-tamil) "
    "to show language generation"
)
# Description shown under the title for each text-generation dataset choice.
_TEXT_GENERATION_DESCRIPTIONS = {
    'Oscar': 'A simple demo using gpt-2-tamil model trained on Oscar data',
    'Oscar + Indic Corpus': (
        'A simple demo using gpt-2-tamil model trained on Oscar + Indic Corpus data'
    ),
}


def _render_text_generation(dataset, description):
    """Render the text-generation page for one dataset choice.

    Loads the model registered in ``config`` under ``dataset``, shows the
    prompt and length inputs, and writes the generated sequences to the page
    once the 'Generate' button is pressed.

    Args:
        dataset: Key into ``config`` whose value is the model name to load.
        description: Markdown text displayed under the page title.
    """
    st.title('Tamil text generation with GPT2')
    st.markdown(description)
    model, tokenizer = load_model(config[dataset])

    # Set default options
    seed = st.text_input('Starting text', 'அகர முதல எழுதெல்லம்')
    # NOTE: returning several sequences (num_return_sequences) is not wired up yet.
    max_len = st.number_input('Length of the sentence', 5, 300, 100)

    if not st.button('Generate'):
        return

    try:
        with st.spinner('Generating...'):
            generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
            seqs = generator(seed, max_length=max_len)
            st.write(seqs)
    except Exception as e:
        # Top-level UI boundary: surface any failure on the page. Pass the
        # exception object itself (not a formatted string) so Streamlit can
        # render its type and traceback.
        st.exception(e)


if page == 'Text Generation' and data in _TEXT_GENERATION_DESCRIPTIONS:
    _render_text_generation(data, _TEXT_GENERATION_DESCRIPTIONS[data])
else:
    # Every other selection falls through to the placeholder page.
    st.title('Tamil News classification with Finetuned GPT2')
    st.markdown('In progress')