xtwigs committed on
Commit
e38de8a
1 Parent(s): 2b20905

update app

Browse files
Files changed (1) hide show
  1. app.py +43 -32
app.py CHANGED
@@ -2,23 +2,26 @@ import streamlit as st
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, BartTokenizer, BartForConditionalGeneration, pipeline
3
  import numpy as np
4
  import torch
 
5
  from textstat import textstat
6
 
7
 
8
-
9
  MAX_LEN = 256
10
  NUM_BEAMS = 4
11
  EARLY_STOPPING = True
12
  N_OUT = 4
13
 
14
 
15
-
16
  cwi_tok = AutoTokenizer.from_pretrained('twigs/cwi-regressor')
17
- cwi_model = AutoModelForSequenceClassification.from_pretrained('twigs/cwi-regressor')
 
18
  simpl_tok = BartTokenizer.from_pretrained('twigs/bart-text2text-simplifier')
19
- simpl_model = BartForConditionalGeneration.from_pretrained('twigs/bart-text2text-simplifier')
20
- cwi_pipe = pipeline('text-classification', model=cwi_model, tokenizer=cwi_tok, function_to_apply='none', device=0)
21
- fill_pipe = pipeline('fill-mask', model=simpl_model, tokenizer=simpl_tok, top_k=1, device=0)
 
 
 
22
 
23
 
24
  def id_replace_complex(s, threshold=0.4):
@@ -43,7 +46,8 @@ def id_replace_complex(s, threshold=0.4):
43
 
44
  def generate_candidate_text(s, model, tokenizer, tokenized=False):
45
 
46
- out = simpl_tok([s], max_length=256, padding="max_length", truncation=True, return_tensors='pt').to('cuda') if not tokenized else s
 
47
 
48
  generated_ids = model.generate(
49
  input_ids=out['input_ids'],
@@ -56,39 +60,38 @@ def generate_candidate_text(s, model, tokenizer, tokenized=False):
56
  num_return_sequences=N_OUT
57
  )
58
 
59
- return [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[
60
  1:] for ids in generated_ids]
61
 
62
 
63
  def rank_candidate_text(sentences):
64
- """ Currently being done with simple FKGL """
65
  fkgl_scores = [textstat.flesch_kincaid_grade(s) for s in sentences]
66
  return sentences[np.argmin(fkgl_scores)]
67
-
68
 
69
  def full_pipeline(source, simpl_model, simpl_tok, tokens, lexical=False):
70
-
71
- modified, complex_words = id_replace_complex(source, threshold=0.2) if lexical else source, None
 
72
  cands = generate_candidate_text(tokens+modified, simpl_model, simpl_tok)
73
  output = rank_candidate_text(cands)
74
  return output, complex_words
75
 
 
76
 
77
- aug_tok = ['c_', 'lev_', 'dep_', 'rank_', 'rat_', 'n_syl_']
78
- tokens = ['CharRatio', 'LevSim', 'DependencyTreeDepth',
79
- 'WordComplexity', 'WordRatio']
80
-
81
- default_values = [0.8, 0.6, 0.9, 0.8, 0.9, 1.9]
82
- user_values = default_values
83
- tok_values = dict((t, default_values[idx]) for idx, t in enumerate(tokens))
84
 
85
- example_sentences = ["A matchbook is a small cardboard folder (matchcover) enclosing a quantity of matches and having a coarse striking surface on the exterior.",
86
- "If there are no strong land use controls, buildings are built along a bypass, converting it into an ordinary town road, and the bypass may eventually become as congested as the local streets it was intended to avoid.",
87
- "Plot Captain Caleb Holt (Kirk Cameron) is a firefighter in Albany, Georgia and firmly keeps the cardinal rule of all firemen, \"Never leave your partner behind\".",
88
- "Britpop emerged from the British independent music scene of the early 1990s and was characterised by bands influenced by British guitar pop music of the 1960s and 1970s."]
89
 
 
 
 
 
90
 
91
- def main():
92
 
93
  st.title("Make it Simple")
94
 
@@ -96,7 +99,8 @@ def main():
96
  for s in example_sentences:
97
  st.code(body=s)
98
 
99
- with st.form(key="form"):
 
100
  input_sentence = st.text_area("Original sentence")
101
  tok = st.multiselect(
102
  label="Tokens to augment the sentence", options=tokens, default=tokens)
@@ -110,13 +114,20 @@ def main():
110
  if (submit):
111
 
112
  tokens = [t+str(v) for t, v in zip(aug_tok, user_values)]
113
- output, words = full_pipeline(input_sentence, simpl_model, simpl_tok, tokens)
114
-
115
- with st.container():
116
- st.write("Original sentence:")
117
- st.write(input_sentence)
118
- st.write("Output sentence:")
119
- st.write(output)
 
 
 
 
 
 
 
120
 
121
 
122
  if __name__ == '__main__':
 
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, BartTokenizer, BartForConditionalGeneration, pipeline
3
  import numpy as np
4
  import torch
5
+ import re
6
  from textstat import textstat
7
 
8
 
 
9
  MAX_LEN = 256
10
  NUM_BEAMS = 4
11
  EARLY_STOPPING = True
12
  N_OUT = 4
13
 
14
 
 
15
  cwi_tok = AutoTokenizer.from_pretrained('twigs/cwi-regressor')
16
+ cwi_model = AutoModelForSequenceClassification.from_pretrained(
17
+ 'twigs/cwi-regressor')
18
  simpl_tok = BartTokenizer.from_pretrained('twigs/bart-text2text-simplifier')
19
+ simpl_model = BartForConditionalGeneration.from_pretrained(
20
+ 'twigs/bart-text2text-simplifier')
21
+ cwi_pipe = pipeline('text-classification', model=cwi_model,
22
+ tokenizer=cwi_tok, function_to_apply='none', device=0)
23
+ fill_pipe = pipeline('fill-mask', model=simpl_model,
24
+ tokenizer=simpl_tok, top_k=1, device=0)
25
 
26
 
27
  def id_replace_complex(s, threshold=0.4):
 
46
 
47
  def generate_candidate_text(s, model, tokenizer, tokenized=False):
48
 
49
+ out = simpl_tok([s], max_length=256, padding="max_length", truncation=True,
50
+ return_tensors='pt').to('cuda') if not tokenized else s
51
 
52
  generated_ids = model.generate(
53
  input_ids=out['input_ids'],
 
60
  num_return_sequences=N_OUT
61
  )
62
 
63
+ return [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[
64
  1:] for ids in generated_ids]
65
 
66
 
67
  def rank_candidate_text(sentences):
 
68
  fkgl_scores = [textstat.flesch_kincaid_grade(s) for s in sentences]
69
  return sentences[np.argmin(fkgl_scores)]
70
+
71
 
72
  def full_pipeline(source, simpl_model, simpl_tok, tokens, lexical=False):
73
+
74
+ modified, complex_words = id_replace_complex(
75
+ source, threshold=0.2) if lexical else source, None
76
  cands = generate_candidate_text(tokens+modified, simpl_model, simpl_tok)
77
  output = rank_candidate_text(cands)
78
  return output, complex_words
79
 
80
+ def main():
81
 
82
+ aug_tok = ['c_', 'lev_', 'dep_', 'rank_', 'rat_', 'n_syl_']
83
+ tokens = ['CharRatio', 'LevSim', 'DependencyTreeDepth',
84
+ 'WordComplexity', 'WordRatio', 'NumberOfSyllables']
 
 
 
 
85
 
86
+ default_values = [0.8, 0.6, 0.9, 0.8, 0.9, 1.9]
87
+ user_values = default_values
88
+ tok_values = dict((t, default_values[idx]) for idx, t in enumerate(tokens))
 
89
 
90
+ example_sentences = ["A matchbook is a small cardboard folder (matchcover) enclosing a quantity of matches and having a coarse striking surface on the exterior.",
91
+ "If there are no strong land use controls, buildings are built along a bypass, converting it into an ordinary town road, and the bypass may eventually become as congested as the local streets it was intended to avoid.",
92
+ "Plot Captain Caleb Holt (Kirk Cameron) is a firefighter in Albany, Georgia and firmly keeps the cardinal rule of all firemen, \"Never leave your partner behind\".",
93
+ "Britpop emerged from the British independent music scene of the early 1990s and was characterised by bands influenced by British guitar pop music of the 1960s and 1970s."]
94
 
 
95
 
96
  st.title("Make it Simple")
97
 
 
99
  for s in example_sentences:
100
  st.code(body=s)
101
 
102
+
103
+ with st.form(key="simplify"):
104
  input_sentence = st.text_area("Original sentence")
105
  tok = st.multiselect(
106
  label="Tokens to augment the sentence", options=tokens, default=tokens)
 
114
  if (submit):
115
 
116
  tokens = [t+str(v) for t, v in zip(aug_tok, user_values)]
117
+ #output, words = full_pipeline(input_sentence, simpl_model, simpl_tok, tokens)
118
+ output, words = full_pipeline(input_sentence)
119
+
120
+
121
+ c1, c2 = st.columns([1,2])
122
+
123
+ with c1:
124
+ st.markdown("#### Words identified as complex")
125
+ for w in words:
126
+ st.markdown(f"* {w}")
127
+
128
+ with c2:
129
+ st.markdown(f"#### Original Sentence:\n > {input_sentence}")
130
+ st.markdown(f"#### Output Sentence:\n > {output}")
131
 
132
 
133
  if __name__ == '__main__':