import pickle
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import lightgbm  # kept in case the pickled classifier depends on it at load time

# Readability classifier trained on FinBERT sentence embeddings
with open("lr_clf_finread_new.pkl", "rb") as f:
    lr_clf_finbert = pickle.load(f)
model_read = SentenceTransformer('ProsusAI/finbert')

def get_readability(text):
  """Return the classifier's probability (rounded to 4 decimals) that the text is readable."""
  emd = model_read.encode([text])
  score = round(lr_clf_finbert.predict_proba(emd)[0, 1], 4)
  return score

# Reference : https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

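# Diverse beam search: the beams are split into num_beam_groups groups and
# diversity_penalty discourages the groups from repeating each other, so the
# num_return_sequences candidates differ from one another.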
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

def get_most_readable_paraphrase(text):
  """Paraphrase the text and return the candidate with the highest readability score."""
  li_paraphrases = paraphrase(text)
  li_paraphrases.append(text)
  best = li_paraphrases[0]
  score_max = get_readability(best)
  for curr in li_paraphrases[1:]:
    score = get_readability(curr)
    if score > score_max:
      best = curr
      score_max = score
  if best != text and score_max > .6:
    ans = "The most readable version of the text that I can think of is:\n" + best
  else:
    ans = "Sorry! I am not confident. As per my best knowledge, you already have the most readable version of the text!"
  return ans

def set_example_text(example_text):
    return gr.Textbox.update(value=example_text[0])

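# Gradio UI: an input text box, a button that triggers the readability-ranked
# paraphrase, an output box, and clickable example sentences.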
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # FinLanSer
    Financial Language Simplifier
    """)
    text = gr.Textbox(label="Enter text you want to simplify (make more readable)")
    greet_btn = gr.Button("Simplify/Make Readable")
    output = gr.Textbox(label="Output Box")
    greet_btn.click(fn=get_most_readable_paraphrase, inputs=text, outputs=output, api_name="get_most_readable_paraphrase")
    example_text = gr.Dataset(components=[text], samples=[['Inflation is the rate of increase in prices over a given period of time. Inflation is typically a broad measure, such as the overall increase in prices or the increase in the cost of living in a country.'], ['Legally assured line of credit with a bank'], ['A mutual fund is a type of financial vehicle made up of a pool of money collected from many investors to invest in securities like stocks, bonds, money market instruments']])
    example_text.click(fn=set_example_text, inputs=example_text, outputs=example_text.components)

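# Quick check without the UI (for local debugging), e.g.:
# print(get_most_readable_paraphrase("Legally assured line of credit with a bank"))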
demo.launch()