Qifan Zhang committed on
Commit
cf575f8
1 Parent(s): f32101e

add feature 1

Browse files
Files changed (5) hide show
  1. .gitignore +3 -0
  2. app.py +53 -0
  3. output.csv +13 -0
  4. utils/models.py +16 -0
  5. utils/similarity.py +25 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ data
2
+ .idea
3
+ *.csv
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+
6
+ from utils.similarity import batch_cos_sim
7
+
8
+
9
def read_data(filepath: str) -> Optional[pd.DataFrame]:
    """Load a spreadsheet-like file into a DataFrame.

    The file type is chosen from the (case-insensitive) filename suffix:
    ``.xls`` / ``.xlsx`` are read with ``pd.read_excel`` and ``.csv`` with
    ``pd.read_csv``.

    Args:
        filepath: Path of the file to load. An empty path yields ``None``.

    Returns:
        The parsed DataFrame, or ``None`` when ``filepath`` is empty.

    Raises:
        ValueError: If the suffix is not one of the supported types.
    """
    if not filepath:
        return None

    # Compare on a lowered copy so 'DATA.CSV' is treated like 'data.csv';
    # the original path is still what gets opened.
    lowered = filepath.lower()
    if lowered.endswith(('.xls', '.xlsx')):
        # Excel workbooks are not delimited text, so read_csv cannot parse
        # them; they need the Excel reader.
        return pd.read_excel(filepath)
    if lowered.endswith('.csv'):
        return pd.read_csv(filepath)
    # ValueError is still an Exception, so existing broad handlers keep working.
    raise ValueError('File type not supported')
19
+
20
+
21
def process(model_name: str,
            prompt: str,
            file=None,
            ):
    """Score an uploaded prompt/response table and export the result.

    Args:
        model_name: Model name forwarded to ``batch_cos_sim``.
        prompt: Text of the prompt box. It is not used by this function; it
            is kept so the signature lines up with the Gradio inputs.
        file: The uploaded file, given either as a path string or as an
            object exposing the path through a ``name`` attribute.

    Returns:
        A ``(markdown_table, csv_path)`` tuple: the scored table rendered
        with ``DataFrame.to_markdown`` and the path of the CSV written.

    Raises:
        ValueError: If no file (or an empty path) was supplied.
    """
    # Accept both a plain path and a file-like wrapper; a missing upload
    # resolves to None here instead of failing on ``None.name``.
    filepath = file if isinstance(file, str) else getattr(file, 'name', None)
    if not filepath:
        raise ValueError('No input file was provided')

    df = read_data(filepath)
    df = batch_cos_sim(df, model_name)
    path = 'output.csv'
    df.to_csv(path, index=False, encoding='utf-8-sig')
    return df.to_markdown(), path
30
+
31
+
32
# Single-line textbox holding the model name passed to `process`.
model_name_input = gr.components.Textbox(
    type="text",
    lines=1,
    value='paraphrase-multilingual-MiniLM-L12-v2',
)

# Ten-line textbox pre-filled with 'prompt,response'.
prompt_input = gr.components.Textbox(
    type="text",
    lines=10,
    value='prompt,response',
)

# File component used as the second output of the interface.
file_output = gr.components.File(
    file_types=["", ".", ".csv", ".xls", ".xlsx"],
    file_count="single",
    label="Output File",
)

# Wire the three inputs to `process`; its two return values feed the
# text output and the file output, in that order.
app = gr.Interface(
    outputs=["text", file_output],
    inputs=[model_name_input, prompt_input, "file"],
    fn=process,
)
app.launch()
output.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt,response,originality
2
+ 床单,当空调被,0.6427325010299683
3
+ 床单,保暖,0.5928247570991516
4
+ 床单,绑在树上做成吊床,0.5714011490345001
5
+ 床单,当燃料烧,0.7625655382871628
6
+ 床单,包裹东西,0.41448450088500977
7
+ 床单,裁剪成衣服,0.5791812241077423
8
+ 牙刷,用来刷首饰,0.5138461589813232
9
+ 牙刷,刷鞋,0.5954866111278534
10
+ 牙刷,洗水果,0.6339634656906128
11
+ 牙刷,捅人,0.5337955951690674
12
+ 牙刷,洗马桶,0.5022678673267365
13
+ 牙刷,刷桃子的毛,0.6439318358898163
utils/models.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+
3
+ import torch
4
+ from sentence_transformers import SentenceTransformer
5
+ import numpy as np
6
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
7
+
8
+
9
class SBert:
    """Callable wrapper around a SentenceTransformer with a per-instance cache.

    Calling the instance encodes ``x`` with the wrapped model and memoizes
    the result, so repeated inputs are encoded only once per instance.
    """

    def __init__(self, path):
        """Load the sentence-transformer identified by ``path`` onto DEVICE."""
        self.model = SentenceTransformer(path, device=DEVICE)

    def __call__(self, x) -> np.ndarray:
        """Return the model encoding of ``x``, reusing a cached result.

        Args:
            x: The input to encode. It must be hashable because it is used
                as the cache key.

        Returns:
            Whatever ``self.model.encode(x)`` returned for this input. A
            cache hit returns the same object again, so callers should not
            mutate it in place.
        """
        # The cache lives on the instance instead of decorating the method:
        # a method-level lru_cache is shared by the whole class and stores
        # `self` in every key, which keeps each instance (and its model)
        # alive for as long as the class exists.
        cached_encode = self.__dict__.get('_cached_encode')
        if cached_encode is None:
            cached_encode = lru_cache(maxsize=10000)(self.model.encode)
            self._cached_encode = cached_encode
        return cached_encode(x)
utils/similarity.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sentence_transformers.util import cos_sim
3
+
4
+ from utils.models import SBert
5
+
6
+
7
def get_cos_sim(model, prompt: str, response: str) -> float:
    """Return the cosine similarity between two encoded texts.

    Args:
        model: Callable that maps a text to its embedding.
        prompt: First text; encoded first.
        response: Second text; encoded second.

    Returns:
        The similarity reported by ``cos_sim`` for the two embeddings,
        extracted with ``.item()``.
    """
    embeddings = [model(text) for text in (prompt, response)]
    return cos_sim(*embeddings).item()
12
+
13
+
14
def batch_cos_sim(df: pd.DataFrame, model_name) -> pd.DataFrame:
    """Add an ``originality`` column computed from prompt/response similarity.

    For every row, ``originality`` is ``1 - get_cos_sim(model, prompt,
    response)``, where ``model`` is an ``SBert`` built from ``model_name``.

    Args:
        df: Table containing at least the columns ``prompt`` and
            ``response``. It is modified in place.
        model_name: Model identifier passed to ``SBert``.

    Returns:
        The same DataFrame object, with the ``originality`` column added.

    Raises:
        ValueError: If ``prompt`` or ``response`` is missing from ``df``.
    """
    # Validate with a real exception: `assert` statements are removed when
    # Python runs with -O, which would let bad input through.
    missing = [name for name in ('prompt', 'response') if name not in df.columns]
    if missing:
        raise ValueError(f'Input data is missing required column(s): {missing}')

    model = SBert(model_name)
    # Build the column from a plain list so an empty table simply gets an
    # empty column; a row-wise apply on an empty frame does not return a
    # Series that can be assigned.
    df['originality'] = [
        1 - get_cos_sim(model, prompt, response)
        for prompt, response in zip(df['prompt'], df['response'])
    ]
    return df
21
+
22
+
23
if __name__ == '__main__':
    # Manual run: score the example CSV with the multilingual MiniLM model.
    _df = pd.read_csv('data/example_1.csv')
    _df_o = batch_cos_sim(
        df=_df,
        model_name='paraphrase-multilingual-MiniLM-L12-v2',
    )