abnerguzman committed on
Commit
90ce60b
1 Parent(s): 7b3fd2d

Delete util.py

Files changed (1)
  1. util.py +0 -338
util.py DELETED
@@ -1,338 +0,0 @@
- from dotenv import load_dotenv
- import os
- load_dotenv()
-
- import concurrent.futures
- from collections import defaultdict
- import pandas as pd
- import numpy as np
- import json
- import pickle
- import pprint
- from io import StringIO
- import textwrap
- import time
- import re
-
- from openai import OpenAI
- openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
-
- import octoai
- octoai_client = octoai.client.Client(token=os.getenv('OCTOML_KEY'))
-
- from pinecone import Pinecone, ServerlessSpec
- pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
- pc_256 = pc.Index('prorata-postman-ds-256-v2')
- pc_128 = pc.Index('prorata-postman-ds-128-v2')
-
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter
-
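- # Sentence-level splitter: ~128-character chunks with no overlap, cut on
- # paragraph, line, or sentence boundaries.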
- sentence_splitter = RecursiveCharacterTextSplitter(
-     chunk_size=128,
-     chunk_overlap=0,
-     separators=["\n\n", "\n", "."],
-     keep_separator=False
- )
-
-
- from functools import cache
-
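- # Embedding helpers: a cached single-text call and a batched variant, both
- # defaulting to OpenAI's text-embedding-3-small model.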
- @cache
- def get_embedding(text, model="text-embedding-3-small"):
-     text = text.replace("\n", " ")
-     return openai_client.embeddings.create(input=[text], model=model).data[0].embedding
-
- def get_embedding_l(text_l, model="text-embedding-3-small"):
-     text_l = [text.replace("\n", " ") for text in text_l]
-     res = openai_client.embeddings.create(input=text_l, model=model)
-     embeds = [record.embedding for record in res.data]
-     return embeds
-
-
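- # Best-effort repair of near-JSON LLM output: patch the two most common
- # json.loads() failures (missing ',' delimiter, unterminated object) and retry.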
- def parse_json_string(content):
-     fixed_content = content
-     for _ in range(20):
-         try:
-             result = json.loads(fixed_content)
-             break
-         except Exception as e:
-             print(e)
-             if "Expecting ',' delimiter" in str(e):
-                 # "Expecting ',' delimiter: line x column y (char d)"
-                 idx = int(re.findall(r'\(char (\d+)\)', str(e))[0])
-                 fixed_content = fixed_content[:idx] + ',' + fixed_content[idx:]
-                 print(fixed_content)
-                 print()
-             elif "Expecting property name enclosed in double quotes" in str(e):
-                 # "Expecting property name enclosed in double quotes: line x column y (char d)"
-                 idx = int(re.findall(r'\(char (\d+)\)', str(e))[0])
-                 fixed_content = fixed_content[:idx-1] + '}' + fixed_content[idx:]
-                 print(fixed_content)
-                 print()
-             else:
-                 raise ValueError(str(e))
-     else:
-         # Loop exhausted without a successful parse; don't return an unbound result.
-         raise ValueError("could not repair JSON after 20 attempts")
-     return result
-
- # prompt_af_template_llama3 = "Please break down the following paragraph into independent and atomic facts. Format your response as a single JSON object, a list of facts:\n\n{}"
- prompt_af_template_llama3 = "Please break down the following paragraph into independent and atomic facts. Format your response in JSON as a list of 'fact' objects:\n\n{}"
-
- # prompt_tf_template = "Given the context below, answer the question that follows. Please format your answer in JSON with a yes or no determination and rationale for the determination. \n\nContext: {}\n\nQuestion: {} Is this claim true or false?"
- # prompt_tf_template = "Given the context below, answer the question that follows. Please format your answer in JSON with a yes or no determination and rationale for the determination. \n\nContext: {}\n\nQuestion: <{}> Is the previous claim (in between <> braces) true or false?"
- prompt_tf_template = "Given the context below, answer the question that follows. Please format your answer in JSON with a yes or no determination and rationale for the determination. \n\nContext: {}\n\nQuestion: <{}> Does the context explicitly support the previous claim (in between <> braces), true or false?"
-
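- # Decompose an answer into atomic facts via the LLM, then parse the JSON list
- # out of the ```-fenced block in the completion; retries up to 5 times.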
- def get_atoms_list(answer, file=None):
-     prompt_af = prompt_af_template_llama3.format(answer)
-     response, content, atoms_l = None, "", []
-     idx1 = idx2 = 0
-     for _ in range(5):
-         try:
-             response = octoai_client.chat.completions.create(
-                 model="meta-llama-3-70b-instruct",
-                 messages=[
-                     {"role": "system", "content": "You are a helpful assistant."},
-                     {"role": "user", "content": prompt_af}
-                 ],
-                 # response_format={"type": "json_object"},
-                 max_tokens=512,
-                 presence_penalty=0,
-                 temperature=0.1,
-                 top_p=0.9,
-             )
-             content = response.choices[0].message.content
-             # Extract the payload between the first pair of ``` fences.
-             idx1 = content.find('```')
-             idx2 = idx1+3 + content[idx1+3:].find('```')
-             atoms_l = parse_json_string(content[idx1+3:idx2])
-             atoms_l = [a['fact'] for a in atoms_l]
-             break
-         except Exception as error:
-             print(error, file=file)
-             print(response, file=file)
-             print(content[idx1+3:idx2], file=file)
-             time.sleep(2)
-     return atoms_l
-
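- # Retrieve the top-k nearest chunks for an atom from the Pinecone index.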
- def get_topk_matches(atom, k=5, pc_index=pc_256):
-     embed_atom = get_embedding(atom)
-     res = pc_index.query(vector=embed_atom, top_k=k, include_metadata=True)
-     return res['matches']
-
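- # Ask the LLM whether a retrieved chunk explicitly supports an atom; returns a
- # determination dict carrying the chunk id and a boolean 'true' field.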
- def get_match_atom_entailment_determination(_match, atom, file=None):
-     prompt_tf = prompt_tf_template.format(_match['metadata']['text'], atom)
-     response = None
-     chunk_determination = {}
-     chunk_determination['chunk_id'] = _match['id']
-     chunk_determination['true'] = False
-     for _ in range(5):
-         try:
-             response = octoai_client.chat.completions.create(
-                 model="meta-llama-3-70b-instruct",
-                 messages=[
-                     {"role": "system", "content": "You are a helpful assistant."},
-                     {"role": "user", "content": prompt_tf}
-                 ],
-                 # response_format={"type": "json_object"},
-                 max_tokens=512,
-                 # presence_penalty=0,
-                 temperature=0.1,
-                 # top_p=0.9,
-             )
-             content = response.choices[0].message.content
-             idx1 = content.find('{')
-             idx2 = content.find('}')
-             chunk_determination.update(json.loads(content[idx1:idx2+1]))
-             _det_lower = chunk_determination['determination'].lower()
-             chunk_determination['true'] = "true" in _det_lower or "yes" in _det_lower
-             break
-         except Exception as error:
-             print(error, file=file)
-             print(prompt_tf, file=file)
-             print(response, file=file)
-             time.sleep(2)
-     return chunk_determination
-
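- # Check an atom against its top-k matches, keeping one determination per source
- # URL and skipping re-checks once a URL already has a supporting chunk.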
- def get_atom_support(atom, file=None):
-     topk_matches = get_topk_matches(atom)
-     atom_support = {}
-     for _match in topk_matches:
-         chunk_determination = atom_support.get(_match['metadata']['url'], {})
-         if not chunk_determination or not chunk_determination['true']:
-             atom_support[_match['metadata']['url']] = get_match_atom_entailment_determination(_match, atom, file=file)
-     return atom_support
-
- def get_atom_support_list(atoms_l, file=None):
-     return [get_atom_support(a, file=file) for a in atoms_l]
-
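- # Turn per-atom support into a normalized credit distribution over source URLs.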
- def credit_atom_support_list(atom_support_l):
-     num_atoms = len(atom_support_l)
-     credit_d = defaultdict(float)
-
-     for atom_support in atom_support_l:
-         # Number of URLs whose chunk supports this atom.
-         atom_support_size = 0.0
-         for url_determination_d in atom_support.values():
-             if url_determination_d['true']:
-                 atom_support_size += 1.0
-
-         # Split this atom's unit of credit evenly across its supporting URLs.
-         for url, url_determination_d in atom_support.items():
-             if url_determination_d['true']:
-                 credit_d[url] += 1.0 / atom_support_size
-
-     # Normalize so credit shares are fractions of the total number of atoms.
-     for url in credit_d.keys():
-         credit_d[url] = credit_d[url] / num_atoms
-
-     return credit_d
-
- def print_atom_support(atom_support, prefix='', file=None):
-     for url, chunk_determination in atom_support.items():
-         print(f"{prefix}{url}:", file=file)
-         print(f"{prefix}  Determination: {'YES' if chunk_determination['true'] else 'NO'}", file=file)
-         print(f"{prefix}  Rationale: {chunk_determination.get('rationale', '')}", file=file)
-
- def print_credit_dist(credit_dist, prefix='', url_to_id=None, file=None):
-     credit_l = [(url, w) for url, w in credit_dist.items()]
-     credit_l = sorted(credit_l, key=lambda x: x[1], reverse=True)
-
-     for url, w in credit_l:
-         if url_to_id is None:
-             print(f"{prefix}{url}: {100*w:.2f}%", file=file)
-         else:
-             print(f"{prefix}{url_to_id[url]} {url}: {100*w:.2f}%", file=file)
-
-
- # Concurrent variants: fan out retrieval and LLM entailment calls via threads.
- def get_atom_topk_matches_l_concurrent(atoms_l, max_workers=4):
-     atom_topkmatches_l = []
-     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-         futures = []
-         for atom in atoms_l:
-             futures.append(executor.submit(get_topk_matches, atom))
-
-         for f in futures:
-             r = f.result()
-             atom_topkmatches_l.append(r)
-     return atom_topkmatches_l
-
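- # Merge each atom's top-k matches by URL: chunks from the same source are
- # concatenated into one aggregate match (ids and text offsets preserved), then
- # flattened to (atom_index, aggregate_match) pairs for concurrent checking.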
- def aggregate_atom_topkmatches_l(atom_topkmatches_l):
-     atom_url_to_aggmatch_maps_l = []
-     for atom_topkmatches in atom_topkmatches_l:
-
-         atom_url_to_aggmatch_map = {}
-         atom_url_to_aggmatch_maps_l.append(atom_url_to_aggmatch_map)
-
-         for _match in atom_topkmatches:
-
-             if _match['metadata']['url'] not in atom_url_to_aggmatch_map:
-                 match_copy = {}
-                 match_copy['id'] = _match['id']
-                 match_copy['id_l'] = [_match['id']]
-                 match_copy['offset_l'] = [0]
-                 match_copy['score'] = _match['score']
-                 match_copy['values'] = _match['values']
-                 # TODO: change to list of chunks and then append at query time
-                 match_copy['metadata'] = {}
-                 match_copy['metadata']['url'] = _match['metadata']['url']
-                 match_copy['metadata']['chunk'] = _match['metadata']['chunk']
-                 match_copy['metadata']['text'] = _match['metadata']['text']
-                 match_copy['metadata']['title'] = _match['metadata']['title']
-
-                 atom_url_to_aggmatch_map[_match['metadata']['url']] = match_copy
-             else:
-                 prev_match = atom_url_to_aggmatch_map[_match['metadata']['url']]
-
-                 prev_match['id_l'].append(_match['id'])
-                 prev_match['offset_l'].append(len(prev_match['metadata']['text']))
-                 prev_match['metadata']['text'] += f"\n\n{_match['metadata']['text']}"
-
-     atomidx_w_single_url_aggmatch_l = []
-     for idx, atom_url_to_aggmatch_map in enumerate(atom_url_to_aggmatch_maps_l):
-         for agg_match in atom_url_to_aggmatch_map.values():
-             atomidx_w_single_url_aggmatch_l.append((idx, agg_match))
-
-     return atomidx_w_single_url_aggmatch_l
-
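- # Concurrent counterpart of get_atom_support_list, operating on the
- # (atom_index, aggregate_match) pairs produced above.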
- def get_atmom_support_l_from_atomidx_w_single_url_aggmatch_l_concurrent(atoms_l, atomidx_w_single_url_aggmatch_l, max_workers=4):
-     atom_support_l = [{} for _ in atoms_l]
-
-     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-         futures = []
-
-         for atomidx_w_single_url_aggmatch in atomidx_w_single_url_aggmatch_l:
-             futures.append(executor.submit(
-                 get_match_atom_entailment_determination,
-                 atomidx_w_single_url_aggmatch[1],
-                 atoms_l[atomidx_w_single_url_aggmatch[0]],
-             ))
-
-         for f, atomidx_w_single_url_aggmatch in zip(futures, atomidx_w_single_url_aggmatch_l):
-             aggmatch_determination = f.result()
-             atom_support = atom_support_l[atomidx_w_single_url_aggmatch[0]]
-             atom_support[atomidx_w_single_url_aggmatch[1]['metadata']['url']] = aggmatch_determination
-
-     return atom_support_l
-
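- # Inline CSS for rendering source documents (title, URL, text, chunk
- # separators) and their credit scores.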
- style_str = """
- <style>
-   .doc-title {
-     /* font-family: cursive, sans-serif; */
-     font-family: Optima, sans-serif;
-     width: 100%;
-     display: inline-block;
-     font-size: 2em;
-     font-weight: bolder;
-     padding-top: 20px;
-     /* font-style: italic; */
-   }
-   .doc-url {
-     /* font-family: cursive, sans-serif; */
-     font-size: 1em;
-     padding-left: 40px;
-     padding-bottom: 10px;
-     /* font-weight: bolder; */
-     /* font-style: italic; */
-   }
-   .doc-text {
-     /* font-family: cursive, sans-serif; */
-     font-family: Optima, sans-serif;
-     font-size: 1.5em;
-     white-space: pre-wrap;
-     padding-left: 40px;
-     padding-bottom: 20px;
-     /* font-weight: bolder; */
-     /* font-style: italic; */
-   }
-   .doc-text .chunk-separator {
-     /* font-style: italic; */
-     color: #0000FF;
-   }
-   .doc-title > img {
-     width: 22px;
-     height: 22px;
-     border-radius: 50%;
-     overflow: hidden;
-     background-color: transparent;
-     display: inline-block;
-     vertical-align: middle;
-   }
-   .doc-title > score {
-     font-family: Optima, sans-serif;
-     font-weight: normal;
-     float: right;
-   }
- </style>
- """