xiaowenbin committed on
Commit
8fa4182
1 Parent(s): 04a6dc7

Delete mteb_eval_openai.py

Browse files
Files changed (1) hide show
  1. mteb_eval_openai.py +0 -165
mteb_eval_openai.py DELETED
@@ -1,165 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import hashlib
5
- from tqdm import tqdm
6
- import numpy as np
7
- import requests
8
-
9
- import logging
10
- import functools
11
- import tiktoken
12
- from mteb import MTEB
13
- #from sentence_transformers import SentenceTransformer
14
# Root logging at INFO; the script-level logger is named "main".
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

# Task types this script knows how to select.
all_task_list = ['Classification', 'Clustering', 'Reranking', 'Retrieval', 'STS', 'PairClassification']

# Optional first CLI argument: a comma-separated list of task types.
# Names that are not in ``all_task_list`` are dropped; with no argument,
# every known task type is selected.
task_list = (
    [name for name in sys.argv[1].split(',') if name in all_task_list]
    if len(sys.argv) > 1
    else all_task_list
)

# Endpoint, credentials and cache location come from the environment.
OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL', '')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
EMB_CACHE_DIR = os.getenv('EMB_CACHE_DIR', '.cache/embs')

# Make sure the embedding cache directory exists before anything uses it.
os.makedirs(EMB_CACHE_DIR, exist_ok=True)
27
-
28
def uuid_for_text(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest is a stable, filesystem-safe identifier for a piece of text;
    it is a content fingerprint, not a security measure.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf8'))
    return digest.hexdigest()
30
-
31
def count_openai_tokens(text, model="text-embedding-3-large"):
    """Return the number of tokens ``text`` occupies for ``model``.

    Args:
        text: The string to tokenize.
        model: OpenAI model name used to pick the tokenizer. If tiktoken
            has no mapping for the name, ``cl100k_base`` is used.

    Returns:
        The token count as an ``int``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for names it cannot map to a tokenizer
        # (custom deployments, older tiktoken releases); fall back to the
        # encoding this script always used.
        encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes literals such as "<|endoftext|>" in the
    # input count as ordinary text instead of raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))
36
-
37
def request_openai_emb(texts, model="text-embedding-3-large",
                       base_url='https://api.openai.com', prefix_url='/v1/embeddings',
                       timeout=120, retry=3, interval=60, caching=True):
    """Fetch embeddings for ``texts`` from an OpenAI-compatible endpoint.

    Embeddings found in the on-disk cache are reused; only the texts that
    are not cached are sent to the API, and the result is returned in the
    same order as ``texts``.

    Args:
        texts: A single string or a sequence of strings.
        model: Model name sent in the request payload.
        base_url: Endpoint base used when ``OPENAI_BASE_URL`` is empty.
        prefix_url: Path appended to the base URL.
        timeout: Per-request timeout in seconds.
        retry: Maximum number of request attempts.
        interval: Seconds to sleep between failed attempts.
        caching: Read from and write to ``EMB_CACHE_DIR`` when true.

    Returns:
        A list with one ``numpy`` array per input text, or an empty list
        when the request failed on every attempt.
    """
    if isinstance(texts, str):
        texts = [texts]
    if len(texts) == 0:
        return []

    # One slot per input, so cached and freshly fetched vectors stay
    # aligned with ``texts`` no matter which ones were already on disk.
    # NOTE(review): the cache key is derived from the text only, so
    # embeddings from different models share cache entries — confirm that
    # EMB_CACHE_DIR is kept per model.
    embs = [None] * len(texts)
    if caching:
        for pos, text in enumerate(texts):
            emb_file = f"{EMB_CACHE_DIR}/{uuid_for_text(text)}"
            if os.path.isfile(emb_file) and os.path.getsize(emb_file) > 0:
                embs[pos] = np.loadtxt(emb_file)

    missing = [pos for pos, emb in enumerate(embs) if emb is None]
    if not missing:
        return embs

    url = f"{OPENAI_BASE_URL}{prefix_url}" if OPENAI_BASE_URL else f"{base_url}{prefix_url}"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    # Only the uncached texts are requested.
    payload = {"input": [texts[pos] for pos in missing], "model": model}

    fetched = None
    while retry > 0 and fetched is None:
        try:
            r = requests.post(url, headers=headers, json=payload,
                              timeout=timeout)
            r.raise_for_status()
            items = r.json()["data"]
            if len(items) != len(missing):
                raise ValueError(
                    f"expected {len(missing)} embeddings, got {len(items)}")
            # Order by the response's own index when present so vectors
            # line up with the inputs; the sort is stable otherwise.
            items = sorted(items, key=lambda item: item.get("index", 0))
            # Built in one step, so a failure never leaves partial results.
            fetched = [np.array(item["embedding"]) for item in items]
        except Exception as e:
            # Best-effort boundary: any failure counts as one used attempt.
            print(f"request openai, retry {retry}, error: {e}", file=sys.stderr)
            retry -= 1
            if retry > 0:
                # No point in sleeping after the final attempt.
                time.sleep(interval)

    if fetched is None:
        return []

    for pos, emb in zip(missing, fetched):
        embs[pos] = emb
        if caching:
            emb_file = f"{EMB_CACHE_DIR}/{uuid_for_text(texts[pos])}"
            # Write to a temp file and rename, so an interrupted run cannot
            # leave a truncated cache entry behind.
            tmp_file = f"{emb_file}.{os.getpid()}.tmp"
            np.savetxt(tmp_file, emb)
            os.replace(tmp_file, emb_file)

    return embs
80
-
81
-
82
class OpenaiEmbModel:
    """Embedding model wrapper exposing ``encode`` over the OpenAI API.

    Attributes are plain values supplied by the caller:
    ``model_name`` is sent with every request and ``model_dim`` is the
    length of the zero vectors substituted for texts whose request failed.
    """

    def __init__(self, model_name, model_dim, *args, **kwargs):
        """Store the model name and the embedding dimensionality."""
        super().__init__(*args, **kwargs)
        self.model_name = model_name
        self.model_dim = model_dim

    @staticmethod
    def _split_into_batches(sentences, count_tokens, max_tokens=8000,
                            max_items=2048, max_chars=2048):
        """Group ``sentences`` into request-sized batches, preserving order.

        Args:
            sentences: Sequence of strings.
            count_tokens: Callable returning the token count of a string.
            max_tokens: Token budget per batch.
            max_items: Maximum number of texts per batch.
            max_chars: A sentence whose token count exceeds ``max_tokens``
                is cut to this many characters.

        Returns:
            A list of batches (lists of strings) whose concatenation has
            the same length and order as ``sentences``.
        """
        batch_list = []
        batch = []
        batch_tokens = 0
        for sentence in sentences:
            num_tokens = count_tokens(sentence)
            if num_tokens > max_tokens:
                # Too long for a single request: keep only a prefix.
                sentence = sentence[:max_chars]
                num_tokens = count_tokens(sentence)
            over_budget = batch_tokens + num_tokens > max_tokens
            if batch and (over_budget or len(batch) >= max_items):
                batch_list.append(batch)
                batch = []
                batch_tokens = 0
            batch.append(sentence)
            batch_tokens += num_tokens
        if batch:
            batch_list.append(batch)
        return batch_list

    def encode(self, sentences, batch_size=32, **kwargs):
        """Embed ``sentences`` and return one vector per sentence.

        Args:
            sentences: Sequence of strings to embed.
            batch_size: Accepted but unused; batches are sized by token
                count and item count instead.
            **kwargs: Accepted and ignored.

        Returns:
            A list of ``numpy`` arrays, always ``len(sentences)`` long.
            Texts whose request failed get a zero vector of length
            ``model_dim``.
        """
        count_tokens = functools.partial(count_openai_tokens,
                                         model=self.model_name)
        batch_list = self._split_into_batches(sentences, count_tokens)

        embs = []
        waiting = 0  # back-off in seconds: grows on failure, halves on success
        for batch_i, batch_texts in enumerate(tqdm(batch_list)):
            batch_embs = request_openai_emb(batch_texts, model=self.model_name,
                                            caching=True, timeout=120, retry=3, interval=60)

            if len(batch_texts) == len(batch_embs):
                embs.extend(batch_embs)
                waiting = waiting // 2
            else:
                print(f"The batch-{batch_i} of texts and embs DONT match! {len(batch_texts)}:{len(batch_embs)}", file=sys.stderr)
                # Keep output aligned with the input by using placeholders.
                embs.extend(np.zeros(self.model_dim) for _ in batch_texts)
                waiting = 120 if waiting <= 0 else waiting + 120

            if waiting > 3600:
                print("Frequently failed, break down!", file=sys.stderr)
                break
            # Nothing follows the final batch, so do not sleep after it.
            if waiting > 0 and batch_i + 1 < len(batch_list):
                time.sleep(waiting)

        # After an early break the remaining texts were never requested;
        # pad so the result still has one vector per sentence.
        remaining = len(sentences) - len(embs)
        if remaining > 0:
            embs.extend(np.zeros(self.model_dim) for _ in range(remaining))

        print(f'Encoded texts:embs={len(sentences)}:{len(embs)}')
        return embs
148
-
149
-
150
# Model under evaluation and the length of its embedding vectors.
model_name = "text-embedding-3-large"
model_dim = 3072
model = OpenaiEmbModel(model_name, model_dim)

# Quick manual check (disabled): encode a couple of short strings with
# ``model.encode``, print the result, and exit before the evaluation.

# Languages passed to the benchmark.
task_langs = ["zh", "zh-CN"]

# Results are written under a folder named after the last path component
# of the model name.
output_folder = f"results/zh/{model_name.rsplit('/', 1)[-1]}"

evaluation = MTEB(task_types=task_list, task_langs=task_langs)
evaluation.run(model, output_folder=output_folder)