'''
shared utility functions: terminal logging decorator, pdf text extraction,
standard response formatting, string clean-up, dynamodb data conversion,
markdown table creation and cost estimation
'''
import json
import regex as re
import tiktoken

from application import *
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# module-level tokenizer (tiktoken "cl100k_base" encoding); est_cost uses it to count tokens
encoding = tiktoken.get_encoding("cl100k_base")

'''
universal system functions
'''

def terminal_print(func):
    '''
    decorator that prints a timestamped line to the terminal before and after
    the wrapped function runs, including the elapsed runtime in seconds

    Parameters
    ----------
    func : callable
        function to be wrapped

    Returns
    -------
    callable
        wrapper that forwards all arguments to func and returns its result
    '''
    from datetime import datetime
    from functools import wraps

    # wraps keeps func.__name__ / __doc__ visible on the decorated function
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = datetime.now()
        print(f"{start.strftime('%y-%m-%d %H:%M:%S')} - executing function: {func.__name__}")

        result = func(*args, **kwargs)

        end = datetime.now()
        # total_seconds() so that the printed number really is in seconds
        runtime = (end - start).total_seconds()
        print(f"{end.strftime('%y-%m-%d %H:%M:%S')} - completed function: {func.__name__}, runtime: {runtime} seconds")

        return result

    return wrapper


'''
following functions are for file manipulation
'''

@terminal_print
def read_pdf(file_path):
    '''
    this function reads the pdf file and returns its cleaned text together
    with the document information

    Parameters
    ----------
    file_path : str, os.PathLike or object with a ``name`` attribute
        path to the pdf file, or an object (e.g. a temporary file wrapper)
        whose ``name`` attribute holds the path

    Returns
    -------
    text : str
        text extracted from the pdf file, after remove_symbols and
        remove_citation have been applied
    meta
        the ``info`` attribute of the parsed PDFDocument
    '''
    import os

    # anything that is not itself a path is expected to expose the path as `.name`
    if isinstance(file_path, (str, os.PathLike)):
        pdf_path = file_path
    else:
        pdf_path = file_path.name

    # the context manager closes the file even when extraction or parsing raises
    with open(pdf_path, 'rb') as file_obj:
        text = extract_text(file_obj)
        text = remove_symbols(text)
        text = remove_citation(text)

        parser = PDFParser(file_obj)
        doc = PDFDocument(parser)
        meta = doc.info

    return text, meta

'''
following functions are for formatting standard responses
'''

def format_response(code,data):
    '''
    build the response dictionary handed back to the client.
    the layout (statusCode / headers / body / isBase64Encoded) is the one
    used with the lambda serverless framework.

    Parameters
    ----------
    code : int
        status code
    data : dict
        payload; it is serialised with json.dumps into the body

    Returns
    -------
    dict
        formatted response
    '''
    # permissive CORS header plus JSON content type
    response_headers = {
        "Access-Control-Allow-Origin": "*",
        "Content-Type": "application/json"
    }
    serialised_body = json.dumps(data)

    response = {}
    response["statusCode"] = code
    response["headers"] = response_headers
    response["body"] = serialised_body
    response["isBase64Encoded"] = False
    return response

'''
following functions are for string manipulation
'''

@terminal_print
def format_text(text,remove_char_ls = ("\\n--\\n","\\n\\n","\n")):
    '''
    this function formats the text output by removing excessive characters

    Parameters
    ----------
    text : str
        text to be processed
    remove_char_ls : iterable of str
        substrings that are removed from the text, in the given order.
        the default covers the literal (escaped) sequences "\\n--\\n" and
        "\\n\\n" as well as the real newline character.

    Returns
    -------
    str
        processed text
    '''
    # the default is an immutable tuple so it cannot be altered between calls
    for c in remove_char_ls:
        text = text.replace(c,"")

    return text

@terminal_print
def remove_symbols(text):
    '''
    this function drops every character that cannot be encoded as ascii and
    removes each hyphen that is directly followed by a newline (together
    with that newline)

    Parameters
    ----------
    text : str
        text to be processed

    Returns
    -------
    str
        processed text
    '''
    ascii_only = text.encode("ascii", "ignore").decode()
    # splitting on "-\n" and gluing the pieces back together deletes every occurrence
    pieces = ascii_only.split('-\n')
    return ''.join(pieces)

@terminal_print
def remove_citation(text):
    '''
    this function removes every "(cid:<digits>)" marker from the text

    Parameters
    ----------
    text : str
        text to be processed

    Returns
    -------
    str
        processed text
    '''
    cid_marker = re.compile(r'\(cid:\d+\)')
    return cid_marker.sub('', text)

@terminal_print
def str_to_tuple(s):
    '''
    this function converts a string such as "(a,b)" to a tuple of strings.
    all parentheses are removed and the remainder is split on commas; the
    pieces are not stripped or converted.

    Parameters
    ----------
    s : str
        string to be converted

    Returns
    -------
    tuple
        converted tuple
    '''
    # one pass that deletes both kinds of parenthesis
    without_parens = s.translate(str.maketrans("", "", "()"))
    parts = without_parens.split(",")
    return tuple(parts)

@terminal_print
def replace_symbols(s):
    '''
    this function replaces symbols in the string to comply with file names:
    space, "-" and "/" become "_", while , . ( ) : ; ' " are removed

    Parameters
    ----------
    s : str
        string to be replaced

    Returns
    -------
    str
        replaced string
    '''
    # single translation table: first two arguments map char -> "_",
    # the third lists the characters that are deleted
    table = str.maketrans(" -/", "___", ",.():;'\"")
    return s.translate(table)

'''
following functions are for dynamodb data manipulation
'''

# @terminal_print
def db_map_to_py_dict(db_map):
    '''
    this function converts dynamodb map data structure to python dictionary

    Parameters
    ----------
    db_map : dict
        dynamodb map, i.e. {attribute name: {type tag: value}}

    Returns
    -------
    dict
        python dictionary. "M" and "L" values are converted recursively,
        "N" becomes int for whole numbers and float otherwise, "NULL"
        becomes None and every other type tag keeps its value unchanged.
    '''
    py_dict = {}
    for name, typed_value in db_map.items():
        for type_tag, value in typed_value.items():
            if type_tag == "M":
                py_dict[name] = db_map_to_py_dict(value)
            elif type_tag == "L":
                py_dict[name] = db_list_to_py_list(value)
            elif type_tag == "N":
                number = float(value)
                if number.is_integer():
                    try:
                        # exact conversion, keeps precision for large integers
                        py_dict[name] = int(value)
                    except ValueError:
                        # whole numbers written as "1.0" or "1e2" are rejected by int()
                        py_dict[name] = int(number)
                else:
                    py_dict[name] = number
            elif type_tag == "NULL":
                py_dict[name] = None
            else:
                # "S", "BOOL", "BS" and any other type tag are passed through as-is
                py_dict[name] = value

    return py_dict

# @terminal_print
def py_dict_to_db_map(py_dict):
    '''
    this function converts python dictionary to dynamodb map data structure

    Parameters
    ----------
    py_dict : dict
        python dictionary; keys are converted with str()

    Returns
    -------
    dict
        dynamodb map, i.e. {attribute name: {type tag: value}}.
        values of an unsupported type are left out of the result.
    '''
    db_map = {}
    for key,value in py_dict.items():
        key = str(key)
        # bool is tested first because it is a subclass of int
        if isinstance(value, bool):
            db_map[key] = {"BOOL":value}
        elif isinstance(value, str):
            db_map[key] = {"S":value}
        elif isinstance(value, (int, float)):
            db_map[key] = {"N":str(value)}
        elif isinstance(value, dict):
            db_map[key] = {"M":py_dict_to_db_map(value)}
        elif isinstance(value, (list, tuple)):
            # tuples are stored as lists, same as py_list_to_db_list does
            db_map[key] = {"L":py_list_to_db_list(value)}
        elif isinstance(value, bytes):
            db_map[key] = {"B":value}
        elif value is None:
            db_map[key] = {"NULL":True}
    return db_map

# @terminal_print
def db_list_to_py_list(db_list):
    '''
    this function converts dynamodb list data structure to python list

    Parameters
    ----------
    db_list : list
        dynamodb list, i.e. a list of {type tag: value} dictionaries

    Returns
    -------
    list
        python list. "M" and "L" are converted recursively, "N" becomes int
        when the text is a plain integer and float otherwise, "NULL" becomes
        None, "B"/"BS" text is utf-8 encoded, "S"/"BOOL"/"SS"/"NS" keep
        their value unchanged.
    '''
    py_list = []
    for typed_value in db_list:
        for type_tag, value in typed_value.items():
            if type_tag == "M":
                py_list.append(db_map_to_py_dict(value))
            elif type_tag == "L":
                py_list.append(db_list_to_py_list(value))
            elif type_tag == "N":
                number_text = str(value)
                try:
                    py_list.append(int(number_text))
                except ValueError:
                    # decimal or exponent notation, e.g. "2.5" or "1e2"
                    py_list.append(float(number_text))
            elif type_tag in ("S", "BOOL", "SS", "NS"):
                py_list.append(value)
            elif type_tag in ("B", "BS"):
                # only text needs encoding; bytes (or a collection of bytes)
                # are already in their final form and bytes(v, "utf-8") would raise
                if isinstance(value, str):
                    py_list.append(value.encode("utf-8"))
                else:
                    py_list.append(value)
            elif type_tag == "NULL":
                py_list.append(None)
            else:
                # unknown type tag: the value is treated as a dynamodb map
                py_list.append(db_map_to_py_dict(value))

    return py_list

# @terminal_print
def py_list_to_db_list(py_list):
    '''
    this function converts python list to dynamodb list data structure

    Parameters
    ----------
    py_list : list
        python list (a tuple is accepted as well)

    Returns
    -------
    list
        dynamodb list, i.e. a list of {type tag: value} dictionaries

    Raises
    ------
    TypeError
        if an element has a type that cannot be converted
    '''
    db_list = []
    for value in py_list:
        # bool is tested first because it is a subclass of int
        if isinstance(value, bool):
            item = {"BOOL":value}
        elif isinstance(value, str):
            item = {"S":value}
        elif isinstance(value, (int, float)):
            item = {"N":str(value)}
        elif isinstance(value, dict):
            item = {"M":py_dict_to_db_map(value)}
        elif isinstance(value, (list, tuple)):
            item = {"L":py_list_to_db_list(value)}
        elif isinstance(value, bytes):
            item = {"B":value}
        elif value is None:
            item = {"NULL":True}
        else:
            # without this branch `item` would be unbound, or still hold the
            # previous element and be appended a second time
            raise TypeError(f"unsupported type for dynamodb list conversion: {type(value).__name__}")

        db_list.append(item)

    return db_list

def list_dict_to_dict(ls,key):
    '''
    this function indexes a list of dictionaries by the value stored under key

    Parameters
    ----------
    ls : list
        list of dictionaries
    key : hashable
        key whose value becomes the index; dictionaries without it are skipped

    Returns
    -------
    dict
        {entry[key]: entry}; when several entries share the same value the
        last one in the list wins
    '''
    return {entry[key]: entry for entry in ls if key in entry}

'''
following functions are for markdown table creation
'''

@terminal_print
def create_md_table(array):
    '''
    build a markdown table from an array; the first row is treated as the
    header and is followed by a "---" separator row.

    Parameters
    ----------
    array: list
        a table in the form of a list of lists

    Returns
    -------
        md_table: str
    '''
    lines = []

    for index, row in enumerate(array):
        # every cell is rendered as "| value " and the row is closed with "|"
        cells = "".join(f"| {item} " for item in row)
        lines.append(cells + "|\n")

        if index == 0:
            # one "---" per header cell
            separator = ' | '.join(['---'] * len(row))
            lines.append(f"| {separator} |\n")

    return "".join(lines)

'''
following functions are used for business logic. (to be moved to business logic layer)
'''

@terminal_print
def est_cost(text,rate):
    '''
    this function estimates the cost of a text from its token count.
    the text is tokenised with the module-level encoding and the rate is
    charged per 1000 tokens.

    Parameters
    ----------
    text : str
        text whose tokens are counted
    rate : float
        rate per 1000 tokens

    Returns
    -------
    float
        estimated cost, rounded to 4 decimal places
    '''
    token_ids = encoding.encode(text)
    cost = rate * len(token_ids) / 1000
    return round(cost, 4)