Spaces:
Runtime error
Runtime error
Roland Ding
committed on
Commit
•
b3ed092
1
Parent(s):
13543e6
8.8.21.59: excluded terminal display (the @terminal_print decorator) for all data transformation functions.
Browse files- utility.py +25 -10
utility.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
import json
|
2 |
import regex as re
|
|
|
3 |
|
4 |
from application import *
|
5 |
from pdfminer.high_level import extract_text
|
6 |
from pdfminer.pdfparser import PDFParser
|
7 |
from pdfminer.pdfdocument import PDFDocument
|
8 |
|
|
|
|
|
9 |
'''
|
10 |
universal system functions
|
11 |
'''
|
@@ -205,13 +208,13 @@ def replace_symbols(s):
|
|
205 |
s = s.replace(";","")
|
206 |
s = s.replace("'","")
|
207 |
s = s.replace('"',"")
|
208 |
-
return
|
209 |
|
210 |
'''
|
211 |
following functions are for dynamodb data manipulation
|
212 |
'''
|
213 |
|
214 |
-
@terminal_print
|
215 |
def db_map_to_py_dict(db_map):
|
216 |
'''
|
217 |
this function converts a DynamoDB map data structure to a Python dictionary
|
@@ -248,7 +251,7 @@ def db_map_to_py_dict(db_map):
|
|
248 |
|
249 |
return py_dict
|
250 |
|
251 |
-
@terminal_print
|
252 |
def py_dict_to_db_map(py_dict):
|
253 |
'''
|
254 |
this function converts a Python dictionary to a DynamoDB map data structure
|
@@ -269,20 +272,20 @@ def py_dict_to_db_map(py_dict):
|
|
269 |
if type(value) is str:
|
270 |
db_map[key] = {"S":value}
|
271 |
elif type(value) is int or type(value) is float:
|
272 |
-
db_map[key] = {"N":value}
|
273 |
elif type(value) is dict:
|
274 |
db_map[key] = {"M":py_dict_to_db_map(value)}
|
275 |
elif type(value) is list:
|
276 |
db_map[key] = {"L":py_list_to_db_list(value)}
|
277 |
elif type(value) is bytes:
|
278 |
-
db_map[key] = {"
|
279 |
elif type(value) is bool:
|
280 |
db_map[key] = {"BOOL":value}
|
281 |
elif value is None:
|
282 |
db_map[key] = {"NULL":True}
|
283 |
return db_map
|
284 |
|
285 |
-
@terminal_print
|
286 |
def db_list_to_py_list(db_list):
|
287 |
'''
|
288 |
this function converts a DynamoDB list data structure to a Python list
|
@@ -304,14 +307,25 @@ def db_list_to_py_list(db_list):
|
|
304 |
py_list.append(db_map_to_py_dict(v))
|
305 |
elif t == "L":
|
306 |
py_list.append(db_list_to_py_list(v))
|
307 |
-
elif t =="N"
|
|
|
|
|
|
|
|
|
|
|
308 |
py_list.append(v)
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
else:
|
310 |
py_list.append(db_map_to_py_dict(v))
|
311 |
|
312 |
return py_list
|
313 |
|
314 |
-
@terminal_print
|
315 |
def py_list_to_db_list(py_list):
|
316 |
'''
|
317 |
this function converts a Python list to a DynamoDB list data structure
|
@@ -389,7 +403,7 @@ following functions are used for business logic. (to be moved to business logic
|
|
389 |
'''
|
390 |
|
391 |
@terminal_print
|
392 |
-
def est_cost(
|
393 |
'''
|
394 |
this function calculates the estimated cost of the translation
|
395 |
please note that the rate is per 1000 tokens.
|
@@ -397,7 +411,7 @@ def est_cost(n_tokens,rate):
|
|
397 |
|
398 |
Parameters
|
399 |
----------
|
400 |
-
|
401 |
number of tokens in the text
|
402 |
rate : float
|
403 |
rate per 1000 tokens
|
@@ -406,4 +420,5 @@ def est_cost(n_tokens,rate):
|
|
406 |
-------
|
407 |
float
|
408 |
estimated cost of the translation'''
|
|
|
409 |
return round(rate*n_tokens/1000,4)
|
|
|
1 |
import json
|
2 |
import regex as re
|
3 |
+
import tiktoken
|
4 |
|
5 |
from application import *
|
6 |
from pdfminer.high_level import extract_text
|
7 |
from pdfminer.pdfparser import PDFParser
|
8 |
from pdfminer.pdfdocument import PDFDocument
|
9 |
|
10 |
+
encoding = tiktoken.get_encoding("cl100k_base")
|
11 |
+
|
12 |
'''
|
13 |
universal system functions
|
14 |
'''
|
|
|
208 |
s = s.replace(";","")
|
209 |
s = s.replace("'","")
|
210 |
s = s.replace('"',"")
|
211 |
+
return s
|
212 |
|
213 |
'''
|
214 |
following functions are for dynamodb data manipulation
|
215 |
'''
|
216 |
|
217 |
+
# @terminal_print
|
218 |
def db_map_to_py_dict(db_map):
|
219 |
'''
|
220 |
this function converts a DynamoDB map data structure to a Python dictionary
|
|
|
251 |
|
252 |
return py_dict
|
253 |
|
254 |
+
# @terminal_print
|
255 |
def py_dict_to_db_map(py_dict):
|
256 |
'''
|
257 |
this function converts a Python dictionary to a DynamoDB map data structure
|
|
|
272 |
if type(value) is str:
|
273 |
db_map[key] = {"S":value}
|
274 |
elif type(value) is int or type(value) is float:
|
275 |
+
db_map[key] = {"N":str(value)}
|
276 |
elif type(value) is dict:
|
277 |
db_map[key] = {"M":py_dict_to_db_map(value)}
|
278 |
elif type(value) is list:
|
279 |
db_map[key] = {"L":py_list_to_db_list(value)}
|
280 |
elif type(value) is bytes:
|
281 |
+
db_map[key] = {"B":value}
|
282 |
elif type(value) is bool:
|
283 |
db_map[key] = {"BOOL":value}
|
284 |
elif value is None:
|
285 |
db_map[key] = {"NULL":True}
|
286 |
return db_map
|
287 |
|
288 |
+
# @terminal_print
|
289 |
def db_list_to_py_list(db_list):
|
290 |
'''
|
291 |
this function converts a DynamoDB list data structure to a Python list
|
|
|
307 |
py_list.append(db_map_to_py_dict(v))
|
308 |
elif t == "L":
|
309 |
py_list.append(db_list_to_py_list(v))
|
310 |
+
elif t =="N":
|
311 |
+
if "." in v:
|
312 |
+
py_list.append(float(v))
|
313 |
+
else:
|
314 |
+
py_list.append(int(v))
|
315 |
+
elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
|
316 |
py_list.append(v)
|
317 |
+
elif t =="B" or t =="BS":
|
318 |
+
py_list.append(bytes(v,"utf-8"))
|
319 |
+
elif t =="NULL":
|
320 |
+
py_list.append(None)
|
321 |
+
elif t =="BOOL":
|
322 |
+
py_list.append(bool(v))
|
323 |
else:
|
324 |
py_list.append(db_map_to_py_dict(v))
|
325 |
|
326 |
return py_list
|
327 |
|
328 |
+
# @terminal_print
|
329 |
def py_list_to_db_list(py_list):
|
330 |
'''
|
331 |
this function converts a Python list to a DynamoDB list data structure
|
|
|
403 |
'''
|
404 |
|
405 |
@terminal_print
|
406 |
+
def est_cost(text,rate):
|
407 |
'''
|
408 |
this function calculates the estimated cost of the translation
|
409 |
please note that the rate is per 1000 tokens.
|
|
|
411 |
|
412 |
Parameters
|
413 |
----------
|
414 |
+
text : str
|
415 |
text whose tokens are counted to estimate the cost
|
416 |
rate : float
|
417 |
rate per 1000 tokens
|
|
|
420 |
-------
|
421 |
float
|
422 |
estimated cost of the translation'''
|
423 |
+
n_tokens = len(encoding.encode(text))
|
424 |
return round(rate*n_tokens/1000,4)
|