Qifan Zhang committed on
Commit
d654474
1 Parent(s): 3f6f474

add flexibility pipeline, update something

Browse files
Files changed (6) hide show
  1. app.py +46 -23
  2. description.txt +4 -0
  3. output.csv +5 -13
  4. utils/models.py +6 -0
  5. utils/pipeline.py +35 -0
  6. utils/similarity.py +0 -25
app.py CHANGED
@@ -4,7 +4,8 @@ from typing import Optional
4
  import gradio as gr
5
  import pandas as pd
6
 
7
- from utils.similarity import batch_cos_sim
 
8
 
9
 
10
  def read_data(filepath: str) -> Optional[pd.DataFrame]:
@@ -17,22 +18,45 @@ def read_data(filepath: str) -> Optional[pd.DataFrame]:
17
  return df
18
 
19
 
20
- def process(model_name: str,
 
21
  text: str,
22
  file=None,
23
  ):
24
- if file:
25
- df = read_data(file.name)
26
- elif text:
27
- string_io = StringIO(text)
28
- df = pd.read_csv(string_io)
29
- else:
30
- raise Exception('No input provided')
31
- df = batch_cos_sim(df, model_name)
32
- path = 'output.csv'
33
- df.to_csv(path, index=False, encoding='utf-8-sig')
34
- return str(df), path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
 
 
 
 
 
 
36
 
37
  model_name_input = gr.components.Textbox(
38
  value='paraphrase-multilingual-MiniLM-L12-v2',
@@ -40,18 +64,14 @@ model_name_input = gr.components.Textbox(
40
  type='text'
41
  )
42
 
43
- model_name_option = gr.components.Dropdown(
44
  label='Model Name',
45
- value='paraphrase-multilingual-MiniLM-L12-v2',
46
- choices=[
47
- 'paraphrase-multilingual-MiniLM-L12-v2',
48
- 'paraphrase-multilingual-mpnet-base-v2',
49
- 'cyclone/simcse-chinese-roberta-wwm-ext'
50
- ]
51
  )
52
 
53
  text_input = gr.components.Textbox(
54
- value='prompt,response\n',
55
  lines=10,
56
  type='text'
57
  )
@@ -61,13 +81,16 @@ text_output = gr.components.Textbox(
61
  type='text'
62
  )
63
 
 
 
64
  file_output = gr.components.File(label='Output File',
65
  file_count='single',
66
  file_types=['', '.', '.csv', '.xls', '.xlsx'])
67
 
68
  app = gr.Interface(
69
  fn=process,
70
- inputs=[model_name_option, text_input, 'file'],
71
- outputs=[text_output, file_output]
 
72
  )
73
  app.launch()
 
4
  import gradio as gr
5
  import pandas as pd
6
 
7
+ from utils import pipeline
8
+ from utils.models import list_models
9
 
10
 
11
  def read_data(filepath: str) -> Optional[pd.DataFrame]:
 
18
  return df
19
 
20
 
21
def process(task_name: str,
            model_name: str,
            text: str,
            file=None,
            ):
    """Score uploaded or pasted CSV data with the selected task pipeline.

    Args:
        task_name: 'Originality' or 'Flexibility'; anything else is rejected.
        model_name: Model identifier forwarded unchanged to the pipeline.
        text: CSV text including its header row; used only when no file is given.
        file: Optional uploaded file object; its ``name`` attribute is passed
            to ``read_data``.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the string rendering
        of the result frame and ``path`` is the CSV written to disk. On any
        failure ``text`` is an ``'Error: ...'`` message and ``path`` is None.
    """
    try:
        # load file
        if file:
            df = read_data(file.name)
        elif text:
            df = pd.read_csv(StringIO(text))
        else:
            raise ValueError('No input data')

        # read_data is annotated Optional, and a header-only paste parses to
        # an empty frame; reject both before any model work is started.
        if df is None or len(df) < 1:
            raise ValueError('No input data')

        # process
        if task_name == 'Originality':
            df = pipeline.p0_originality(df, model_name)
        elif task_name == 'Flexibility':
            df = pipeline.p1_flexibility(df, model_name)
        else:
            raise ValueError('Task not supported')

        # save
        path = 'output.csv'
        df.to_csv(path, index=False, encoding='utf-8-sig')
        return str(df), path
    except Exception as e:
        # UI boundary: report the failure as plain text for the text output
        # instead of handing the component a dict that wraps an exception.
        # The type name is included because some errors carry no message.
        return f'Error: {type(e).__name__}: {e}', None
51
+
52
 
53
+ instructions = 'Please upload a file or paste the text below. '
54
+
55
+ task_name_dropdown = gr.components.Dropdown(
56
+ label='Task Name',
57
+ value='Originality',
58
+ choices=['Originality', 'Flexibility']
59
+ )
60
 
61
  model_name_input = gr.components.Textbox(
62
  value='paraphrase-multilingual-MiniLM-L12-v2',
 
64
  type='text'
65
  )
66
 
67
+ model_name_dropdown = gr.components.Dropdown(
68
  label='Model Name',
69
+ value=list_models[0],
70
+ choices=list_models
 
 
 
 
71
  )
72
 
73
  text_input = gr.components.Textbox(
74
+ value='id,prompt,response\n',
75
  lines=10,
76
  type='text'
77
  )
 
81
  type='text'
82
  )
83
 
84
# Read the UI description text once at start-up; the context manager closes
# the handle, and the explicit encoding avoids depending on the locale default.
with open('description.txt', 'r', encoding='utf-8') as description_file:
    description = description_file.read()
85
+
86
  file_output = gr.components.File(label='Output File',
87
  file_count='single',
88
  file_types=['', '.', '.csv', '.xls', '.xlsx'])
89
 
90
  app = gr.Interface(
91
  fn=process,
92
+ inputs=[task_name_dropdown, model_name_dropdown, text_input, 'file'],
93
+ outputs=[text_output, file_output],
94
+ description=description
95
  )
96
  app.launch()
description.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Enter your participant (id) + prompt + response data, one per line, with a COMMA between each variable
2
+ If an error occurred, try simplifying your data - does it work with fewer rows? If not, the input format may be wrong.
3
+ Please note that Sbert_mpnet and Sbert_minilm are applicable to multiple languages, whereas SimCSE is only applicable to Chinese.
4
+ For more help, or to report possible bugs in our system, contact ydd409@163.com
output.csv CHANGED
@@ -1,13 +1,5 @@
1
- prompt,response,originality
2
- 床单,当空调被,0.6427325010299683
3
- 床单,保暖,0.5928247570991516
4
- 床单,绑在树上做成吊床,0.5714011490345001
5
- 床单,当燃料烧,0.7625655382871628
6
- 床单,包裹东西,0.41448450088500977
7
- 床单,裁剪成衣服,0.5791812241077423
8
- 牙刷,用来刷首饰,0.5138461589813232
9
- 牙刷,刷鞋,0.5954866111278534
10
- 牙刷,洗水果,0.6339634656906128
11
- 牙刷,捅人,0.5337955951690674
12
- 牙刷,洗马桶,0.5022678673267365
13
- 牙刷,刷桃子的毛,0.6439318358898163
 
1
+ id,prompt,flexibility
2
+ 1,床单,0.60231946905454
3
+ 1,牙刷,0.5810987452665964
4
+ 2,床单,0.585410421093305
5
+ 2,牙刷,0.5599984327952067
 
 
 
 
 
 
 
 
utils/models.py CHANGED
@@ -6,6 +6,12 @@ from sentence_transformers import SentenceTransformer
6
 
7
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
8
 
 
 
 
 
 
 
9
 
10
  class SBert:
11
  def __init__(self, path):
 
6
 
7
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
8
 
9
+ list_models = [
10
+ 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
11
+ 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
12
+ 'cyclone/simcse-chinese-roberta-wwm-ext'
13
+ ]
14
+
15
 
16
  class SBert:
17
  def __init__(self, path):
utils/pipeline.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sentence_transformers.util import cos_sim
3
+
4
+ from utils.models import SBert
5
+
6
+
7
def p0_originality(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Add an ``originality`` column to ``df`` and return it.

    Each row's originality is ``1 - cos_sim(model(prompt), model(response))``.
    The column is written into the frame that was passed in, and that same
    frame is returned.

    Args:
        df: Frame with at least ``prompt`` and ``response`` columns.
        model_name: Identifier handed to ``SBert`` to build the encoder.

    Raises:
        ValueError: If ``prompt`` or ``response`` is missing from ``df``.
    """
    # Validate before constructing the model so bad input fails fast,
    # with a message, and independently of the interpreter's -O flag.
    missing = [col for col in ('prompt', 'response') if col not in df.columns]
    if missing:
        raise ValueError(f'Missing required column(s): {missing}')
    model = SBert(model_name)

    def get_cos_sim(prompt: str, response: str) -> float:
        prompt_vec = model(prompt)
        response_vec = model(response)
        return cos_sim(prompt_vec, response_vec).item()

    # A plain list keeps the assignment well-defined for an empty frame,
    # where a row-wise DataFrame.apply does not produce a Series.
    df['originality'] = [
        1 - get_cos_sim(prompt, response)
        for prompt, response in zip(df['prompt'], df['response'])
    ]
    return df
20
+
21
+
22
def p1_flexibility(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Return the mean originality per ``(id, prompt)`` pair as ``flexibility``.

    Args:
        df: Frame with ``id``, ``prompt`` and ``response`` columns.
        model_name: Identifier forwarded to ``p0_originality``.

    Returns:
        A new frame with columns ``id``, ``prompt`` and ``flexibility``,
        one row per distinct ``(id, prompt)`` group.

    Raises:
        ValueError: If the ``id`` column is missing from ``df``.
    """
    # Check the grouping key first: scoring runs the model on every row,
    # so a missing column should be reported before that work is done.
    if 'id' not in df.columns:
        raise ValueError("Missing required column: 'id'")
    scored = p0_originality(df, model_name)
    # as_index=False keeps the group keys as ordinary columns, so they do not
    # need to be re-aggregated and the index does not need to be dropped.
    return (
        scored.groupby(by=['id', 'prompt'], as_index=False)['originality']
        .mean()
        .rename(columns={'originality': 'flexibility'})
    )
30
+
31
+
32
+ if __name__ == '__main__':
33
+ _df_input = pd.read_csv('data/example_3.csv')
34
+ _df_0 = p0_originality(_df_input, 'paraphrase-multilingual-MiniLM-L12-v2')
35
+ _df_1 = p1_flexibility(_df_input, 'paraphrase-multilingual-MiniLM-L12-v2')
utils/similarity.py DELETED
@@ -1,25 +0,0 @@
1
- import pandas as pd
2
- from sentence_transformers.util import cos_sim
3
-
4
- from utils.models import SBert
5
-
6
-
7
- def get_cos_sim(model, prompt: str, response: str) -> float:
8
- prompt_vec = model(prompt)
9
- response_vec = model(response)
10
- score = cos_sim(prompt_vec, response_vec).item()
11
- return score
12
-
13
-
14
- def batch_cos_sim(df: pd.DataFrame, model_name) -> pd.DataFrame:
15
- # df.columns = ['prompt', 'response']
16
- assert 'prompt' in df.columns
17
- assert 'response' in df.columns
18
- model = SBert(model_name)
19
- df['originality'] = df.apply(lambda x: 1 - get_cos_sim(model, x['prompt'], x['response']), axis=1)
20
- return df
21
-
22
-
23
- if __name__ == '__main__':
24
- _df = pd.read_csv('data/example_1.csv')
25
- _df_o = batch_cos_sim(_df, 'paraphrase-multilingual-MiniLM-L12-v2')