ULMER Louis (T0240644) committed on
Commit
51636fd
1 Parent(s): 8746267

updating paraphraser

Browse files
Files changed (6) hide show
  1. .gitignore +1 -0
  2. README.md +3 -3
  3. app.py +10 -4
  4. backend/data_augmenter.py +13 -4
  5. footer.py +76 -0
  6. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pyc
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Paraphraser.ai
3
- emoji: 💻
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.10.0
8
  app_file: app.py
 
1
  ---
2
  title: Paraphraser.ai
3
+ emoji: ✍️
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.10.0
8
  app_file: app.py
app.py CHANGED
@@ -1,19 +1,22 @@
1
  import os
2
  import streamlit as st
3
  from backend.data_augmenter import BackTranslatorAugmenter
4
-
5
  os.environ['NO_PROXY'] = '127.0.0.1'
6
- st.set_page_config(layout="wide", page_title="Paraphraser.AI", page_icon="🤖")
7
- st.title('Paraphraser.AI 🤖')
8
  st.header("An intelligent sentence paraphraser")
 
9
 
10
  model_selection = st.sidebar.selectbox(
11
  'Select a paraphraser:',
12
- ['Vladimir 🧑🏼','Maria 👩🏽'],
13
  )
14
 
15
  input_text = st.text_area('Please type the text to paraphrase')
16
 
 
 
17
  class DummyAugmenter:
18
  def __init__(self, in_lang="en", out_lang="ru") -> None:
19
  pass
@@ -25,8 +28,11 @@ if model_selection == 'Vladimir 🧑🏼':
25
  model = BackTranslatorAugmenter(in_lang="en", out_lang="ru")
26
  if model_selection == 'Maria 👩🏽':
27
  model = BackTranslatorAugmenter(in_lang="en", out_lang="es")
 
 
28
 
29
  if input_text:
30
  st.header(f"Paraphrased text :")
31
  st.write("".join(model.back_translate(input_text)))
32
 
 
 
1
  import os
2
  import streamlit as st
3
  from backend.data_augmenter import BackTranslatorAugmenter
4
+ from footer import footer
5
  os.environ['NO_PROXY'] = '127.0.0.1'
6
+ st.set_page_config(layout="wide", page_title="Paraphraser.AI", page_icon="🤖✍️")
7
+ st.title('Paraphraser.AI 🤖✍️')
8
  st.header("An intelligent sentence paraphraser")
9
+ st.markdown('''This is a demo of a system that can rewrite some given paragraphs with slight differences.''')
10
 
11
  model_selection = st.sidebar.selectbox(
12
  'Select a paraphraser:',
13
+ ['Vladimir 🧑🏼','Maria 👩🏽','Jacques 👨'],
14
  )
15
 
16
  input_text = st.text_area('Please type the text to paraphrase')
17
 
18
+
19
+
20
  class DummyAugmenter:
21
  def __init__(self, in_lang="en", out_lang="ru") -> None:
22
  pass
 
28
  model = BackTranslatorAugmenter(in_lang="en", out_lang="ru")
29
  if model_selection == 'Maria 👩🏽':
30
  model = BackTranslatorAugmenter(in_lang="en", out_lang="es")
31
+ if model_selection == 'Jacques 👨':
32
+ model = BackTranslatorAugmenter(in_lang="en", out_lang="fr")
33
 
34
  if input_text:
35
  st.header(f"Paraphrased text :")
36
  st.write("".join(model.back_translate(input_text)))
37
 
38
+ footer()
backend/data_augmenter.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import argparse
2
  import time
3
  from tqdm import tqdm
@@ -7,6 +8,8 @@ import os
7
  import json
8
  import torch
9
  from dotenv import load_dotenv
 
 
10
 
11
  load_dotenv()
12
  from nltk.tokenize import sent_tokenize
@@ -63,9 +66,14 @@ class BackTranslatorAugmenter:
63
  if verbose:
64
  tic = time.time()
65
  encoded_text = self.in_tokenizer(
66
- text, return_tensors="pt", padding=True, truncation=True
67
  ).to(self.device)
68
- in_generated_ids = self.in_model.generate(**encoded_text)
 
 
 
 
 
69
 
70
  in_preds = [
71
  self.in_tokenizer.decode(
@@ -76,9 +84,10 @@ class BackTranslatorAugmenter:
76
  if verbose:
77
  print("in_pred : ", in_preds)
78
  encoded_text = self.out_tokenizer(
79
- in_preds, return_tensors="pt", padding=True, truncation=True
80
  ).to(self.device)
81
- out_generated_ids = self.out_model.generate(**encoded_text)
 
82
  out_preds = [
83
  self.out_tokenizer.decode(
84
  gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
 
1
+ #%%
2
  import argparse
3
  import time
4
  from tqdm import tqdm
 
8
  import json
9
  import torch
10
  from dotenv import load_dotenv
11
+ #%%
12
+
13
 
14
  load_dotenv()
15
  from nltk.tokenize import sent_tokenize
 
66
  if verbose:
67
  tic = time.time()
68
  encoded_text = self.in_tokenizer(
69
+ text, return_tensors="pt", padding=True, truncation=True, return_overflowing_tokens=True
70
  ).to(self.device)
71
+ if encoded_text['num_truncated_tokens'][0] > 0:
72
+ print('Text is too long ')
73
+ return self.back_translate_long(text,verbose=verbose)
74
+
75
+ in_generated_ids = self.in_model.generate(inputs=encoded_text['input_ids'],
76
+ attention_mask=encoded_text["attention_mask"])
77
 
78
  in_preds = [
79
  self.in_tokenizer.decode(
 
84
  if verbose:
85
  print("in_pred : ", in_preds)
86
  encoded_text = self.out_tokenizer(
87
+ in_preds, return_tensors="pt", padding=True, truncation=True,return_overflowing_tokens=True
88
  ).to(self.device)
89
+ out_generated_ids = self.out_model.generate(inputs=encoded_text['input_ids'],
90
+ attention_mask=encoded_text["attention_mask"])
91
  out_preds = [
92
  self.out_tokenizer.decode(
93
  gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
footer.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from htbuilder import HtmlElement, div, ul, li, br, hr, a, p, img, styles, classes, fonts
3
+ from htbuilder.units import percent, px
4
+ from htbuilder.funcs import rgba, rgb
5
+
6
+
7
+ def image(src_as_string, **style):
8
+ return img(src=src_as_string, style=styles(**style))
9
+
10
+
11
+ def link(link, text, **style):
12
+ return a(_href=link, _target="_blank", style=styles(**style))(text)
13
+
14
+
15
+ def layout(*args):
16
+
17
+ style = """
18
+ <style>
19
+ # MainMenu {visibility: hidden;}
20
+ footer {visibility: hidden;}
21
+ .stApp { bottom: 105px; }
22
+ </style>
23
+ """
24
+
25
+ style_div = styles(
26
+ position="fixed",
27
+ left=0,
28
+ bottom=0,
29
+ margin=px(0, 0, 0, 0),
30
+ width=percent(100),
31
+ color="black",
32
+ text_align="center",
33
+ height="auto",
34
+ opacity=1
35
+ )
36
+
37
+ style_hr = styles(
38
+ display="block",
39
+ margin=px(8, 8, "auto", "auto"),
40
+ border_style="inset",
41
+ border_width=px(2)
42
+ )
43
+
44
+ body = p()
45
+ foot = div(
46
+ style=style_div
47
+ )(
48
+ hr(
49
+ style=style_hr
50
+ ),
51
+ body
52
+ )
53
+
54
+ st.markdown(style, unsafe_allow_html=True)
55
+
56
+ for arg in args:
57
+ if isinstance(arg, str):
58
+ body(arg)
59
+
60
+ elif isinstance(arg, HtmlElement):
61
+ body(arg)
62
+
63
+ st.markdown(str(foot), unsafe_allow_html=True)
64
+
65
+
66
+ def footer():
67
+ myargs = [
68
+ "Made in ",
69
+ image('https://avatars3.githubusercontent.com/u/45109972?s=400&v=4',
70
+ width=px(25), height=px(25)),
71
+ br(),
72
+ "with ❤️ by Louis Ulmer ",
73
+ br(),
74
+ link("https://www.linkedin.com/in/louisulmer/", image('https://logospng.org/download/linkedin/logo-linkedin-icon-4096.png',width=px(25), height=px(25))),
75
+ ]
76
+ layout(*myargs)
requirements.txt CHANGED
@@ -5,4 +5,5 @@ transformers[sentencepiece]
5
  pandas
6
  scikit-learn
7
  nltk
8
- python-dotenv
 
 
5
  pandas
6
  scikit-learn
7
  nltk
8
+ python-dotenv
9
+ htbuilder