AidenYan commited on
Commit
118f25c
1 Parent(s): b404444

Create app_.py

Browse files
Files changed (1) hide show
  1. app_.py +36 -0
app_.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import pandas as pd
5
+
6
+ # Function to perform mean pooling
7
def mean_pooling(model_output, attention_mask):
    """Average token embeddings into one sentence vector, weighted by the attention mask.

    Args:
        model_output: Model forward output; index 0 holds the token
            embeddings with shape (batch, seq_len, hidden).
        attention_mask: Tensor of shape (batch, seq_len) with 1 for real
            tokens and 0 for padding.

    Returns:
        Tensor of shape (batch, hidden): the masked mean over the
        sequence dimension.
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so padding tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    # Clamp avoids division by zero for all-padding rows.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
# Path to the input CSV — placeholder; point this at the real data file.
CSV_PATH = '/path/to/your/csvfile.csv'
# Sentence-embedding checkpoint used for both tokenizer and model.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'


def main(csv_path=CSV_PATH):
    """Encode the 'review' column of a CSV into L2-normalized sentence embeddings.

    Args:
        csv_path: Path of the CSV file to read (defaults to CSV_PATH).

    Returns:
        Tensor of shape (num_rows, hidden) with unit-norm sentence embeddings.
    """
    # Load CSV data; assumes a 'review' column holds the text — TODO confirm schema.
    df = pd.read_csv(csv_path)
    sentences = df['review'].tolist()

    # Load model and tokenizer from the HuggingFace Hub.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.eval()  # explicit inference mode (disables dropout)

    # Tokenize all sentences as one padded, truncated batch of tensors.
    # NOTE(review): a very large CSV would be better processed in batches.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Mean-pool over tokens, then L2-normalize so dot products equal cosine similarity.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)
    return sentence_embeddings


if __name__ == "__main__":
    main()