File size: 3,200 Bytes
40e9898
 
 
 
 
36338f2
40e9898
 
 
 
 
 
36338f2
40e9898
 
36338f2
 
 
 
 
40e9898
 
 
36338f2
 
 
 
 
 
40e9898
36338f2
 
40e9898
36338f2
 
 
 
 
40e9898
 
 
36338f2
 
40e9898
36338f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
""" Script for streamlit demo
    @author: AbinayaM02
"""

# Import necessary libraries
import json
from pprint import pprint

import streamlit as st
from transformers import (
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    pipeline,
)

# Read the config. The encoding is given explicitly so the JSON is decoded as
# UTF-8 instead of whatever the platform's locale default happens to be.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# Set page layout (kept ahead of every other Streamlit call in this script)
st.set_page_config(
    page_title="Tamil Language Models",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Load the model
@st.cache(allow_output_mutation=True)
def load_model(model_name):
    """Load a causal language model together with its tokenizer.

    The function is wrapped in ``st.cache`` so that script reruns with the
    same ``model_name`` reuse the already loaded objects. A spinner is shown
    while loading and a success message once both objects are available.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    with st.spinner('Waiting for the model to load.....'):
        # AutoModelWithLMHead is deprecated in transformers; for causal
        # (GPT-2 style) language models AutoModelForCausalLM is the
        # documented replacement.
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    st.success('Model loaded!!')
    return model, tokenizer

# Side bar: logo on top (the return value is not needed, so it is not bound)
st.sidebar.image("images/tamil_logo.jpg", width=380)

# Choose the model based on selection. The options offered in the second
# select box are looked up in the config under the first selection.
page = st.sidebar.selectbox("Model", config["models"])
data = st.sidebar.selectbox("Data", config[page])

# Main page
st.header("Tamil Language Demos")
# NOTE: Markdown links must have no whitespace between "]" and "(",
# otherwise the text is rendered literally instead of as a link.
st.markdown(
    "This demo uses [GPT2 trained on Oscar dataset](https://huggingface.co/flax-community/gpt-2-tamil) "
    "and [GPT2 trained on Oscar & Indic Corpus dataset](https://huggingface.co/abinayam/gpt-2-tamil) "
    "to show language generation"
)

# Description shown for each dataset that has a text-generation demo.
_TEXT_GENERATION_DESCRIPTIONS = {
    'Oscar': 'A simple demo using gpt-2-tamil model trained on Oscar data',
    'Oscar + Indic Corpus': (
        'A simple demo using gpt-2-tamil model trained on Oscar + Indic Corpus data'
    ),
}


def _render_text_generation(model_name, description):
    """Render the GPT-2 text-generation demo for a single model.

    Args:
        model_name: Model identifier passed on to ``load_model``.
        description: Markdown text displayed below the page title.
    """
    st.title('Tamil text generation with GPT2')
    st.markdown(description)
    model, tokenizer = load_model(model_name)

    # Set default options
    seed = st.text_input('Starting text', 'அகர முதல எழுதெல்லம்')
    # TODO: optionally expose the number of generated sequences
    # (num_return_sequences) as a user input.
    max_len = st.number_input('Length of the sentence', 5, 300, 100)

    # Nothing to generate until the user presses the button.
    if not st.button('Generate'):
        return

    try:
        with st.spinner('Generating...'):
            generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
            seqs = generator(seed, max_length=max_len)
        st.write(seqs)
    except Exception as e:
        # Hand the exception object itself to st.exception so that its type
        # and traceback are displayed; a pre-formatted string loses both.
        st.exception(e)


if page == 'Text Generation' and data in _TEXT_GENERATION_DESCRIPTIONS:
    # config[data] holds the model identifier for the selected dataset.
    _render_text_generation(config[data], _TEXT_GENERATION_DESCRIPTIONS[data])
else:
    st.title('Tamil News classification with Finetuned GPT2')
    st.markdown('In progress')