| import json |
| import os |
| import PyPDF2 |
|
|
| from application import * |
|
|
| ''' |
| following functions are for file manipulation |
| ''' |
|
|
| |
def read_pdf(file_path):
    """Extract the text of a PDF, save it as a ``.txt`` file, and return it.

    Args:
        file_path: A path accepted by ``open()``, or an object exposing the
            path as ``.name`` (used when ``open(file_path)`` raises
            ``TypeError``).

    Returns:
        A ``(text, metadata)`` tuple: the extracted text after
        ``remove_symbols`` and the reader's ``metadata`` attribute.

    Side effects:
        Writes the cleaned text to a file named after the PDF with its final
        extension replaced by ``.txt``.
    """
    try:
        filename = file_path
        pdf_file = open(file_path, 'rb')
    except TypeError:
        # Not path-like: fall back to the object's ``name`` attribute.
        filename = file_path.name
        pdf_file = open(filename, 'rb')

    # ``with`` guarantees the handle is closed even when parsing fails.
    with pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        text = ''.join(page.extract_text() for page in reader.pages)
        # NOTE(review): metadata is read while the stream is still open in
        # case the reader resolves it lazily -- confirm against PyPDF2 docs.
        metadata = reader.metadata

    text = remove_symbols(text)

    # splitext strips only the final extension, so dots in directory names
    # (e.g. "./docs/a.pdf") or in the stem no longer truncate the output path.
    txt_path = f"{os.path.splitext(os.fsdecode(filename))[0]}.txt"
    with open(txt_path, "w") as f:
        f.write(text)

    return text, metadata
|
|
| ''' |
| following functions are for formatting standard responses |
| ''' |
|
|
| |
def format_response(code, data):
    """Build the standard response dictionary.

    Args:
        code: Value stored under ``statusCode``.
        data: JSON-serializable payload; stored under ``body`` as a JSON
            string.

    Returns:
        A dict with ``statusCode``, ``headers`` (wildcard
        ``Access-Control-Allow-Origin`` and a JSON content type), ``body``
        and ``isBase64Encoded`` set to ``False``.
    """
    headers = {}
    headers["Access-Control-Allow-Origin"] = "*"
    headers["Content-Type"] = "application/json"

    response = dict(
        statusCode=code,
        headers=headers,
        body=json.dumps(data),
        isBase64Encoded=False,
    )
    return response
|
|
| ''' |
| following functions are for string manipulation |
| ''' |
|
|
| |
def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
    """Return *text* with every occurrence of the given substrings removed.

    Substrings are removed one after another in the order given, so earlier
    entries are stripped before later ones are searched for.

    Args:
        text: The string to clean.
        remove_char_ls: Iterable of substrings to delete. The default removes
            two escaped (literal backslash + "n") newline patterns and then
            real newline characters. It is a tuple rather than a list so the
            default cannot be mutated between calls.

    Returns:
        The cleaned string.
    """
    for fragment in remove_char_ls:
        text = text.replace(fragment, "")
    return text
|
|
| |
def remove_symbols(text):
    """Drop non-ASCII characters, then remove hyphen + newline pairs.

    Args:
        text: The string to clean.

    Returns:
        The string with every non-ASCII character discarded and each
        ``"-\\n"`` sequence removed.
    """
    # Non-ASCII characters are discarded first, so a hyphen and a newline
    # that were separated only by such characters are removed as well.
    ascii_bytes = text.encode("ascii", errors="ignore")
    ascii_text = ascii_bytes.decode()

    pieces = ascii_text.split('-\n')
    return ''.join(pieces)
|
|
def str_to_tuple(s):
    """Split a parenthesized, comma-separated string into a tuple of strings.

    All ``(`` and ``)`` characters are removed before splitting on commas.
    Items are not stripped or converted, so surrounding whitespace is kept.

    Args:
        s: Text such as ``"(a,b)"``.

    Returns:
        A tuple of the comma-separated pieces.
    """
    without_parens = s.translate({ord("("): None, ord(")"): None})
    parts = without_parens.split(",")
    return tuple(parts)
|
|
| ''' |
| following functions are for dynamodb data manipulation |
| ''' |
| |
def _parse_db_number(raw):
    """Convert an ``"N"`` value (string or number) to ``int`` or ``float``."""
    as_float = float(raw)
    if as_float % 1 != 0:
        return as_float
    try:
        # Parsing the raw value keeps full precision for large integer strings.
        return int(raw)
    except ValueError:
        # Integral value written in a non-int form such as "1.0" or "1e2".
        return int(as_float)


def _db_attr_to_py(type_tag, value):
    """Decode one ``{type_tag: value}`` attribute into a Python value."""
    if type_tag == "M":
        return db_map_to_py_dict(value)
    if type_tag == "L":
        return db_list_to_py_list(value)
    if type_tag == "N":
        return _parse_db_number(value)
    # "S" and any other tag: the stored value is passed through unchanged.
    return value


def _py_value_to_db_attr(value):
    """Encode one Python value as ``{type_tag: value}``; ``None`` if unsupported."""
    if isinstance(value, bool):
        # bool is an int subclass, but it has never been mapped to "N".
        return None
    if isinstance(value, str):
        return {"S": value}
    if isinstance(value, (int, float)):
        # NOTE(review): the number is stored as-is, not as a string; some
        # DynamoDB clients require "N" values to be strings -- confirm what
        # consumes this output.
        return {"N": value}
    if isinstance(value, dict):
        return {"M": py_dict_to_db_map(value)}
    if isinstance(value, list):
        return {"L": py_list_to_db_list(value)}
    return None


def db_map_to_py_dict(db_map):
    """Convert a typed attribute map into a plain Python dict.

    Args:
        db_map: Mapping of attribute name to ``{type_tag: value}``.

    Returns:
        A dict with ``"M"`` and ``"L"`` values decoded recursively, ``"N"``
        values turned into ``int`` (when integral) or ``float``, and every
        other value passed through unchanged.
    """
    py_dict = {}
    for key, attr in db_map.items():
        for type_tag, value in attr.items():
            py_dict[key] = _db_attr_to_py(type_tag, value)
    return py_dict


def py_dict_to_db_map(py_dict):
    """Convert a Python dict into a typed attribute map.

    Keys are converted with ``str()``. Strings, ints/floats, dicts and lists
    are encoded as ``"S"``, ``"N"``, ``"M"`` and ``"L"``; values of any other
    type are omitted.

    Args:
        py_dict: The dict to encode.

    Returns:
        Mapping of ``str(key)`` to ``{type_tag: value}``.
    """
    db_map = {}
    for key, value in py_dict.items():
        attr = _py_value_to_db_attr(value)
        if attr is not None:
            db_map[str(key)] = attr
    return db_map


def db_list_to_py_list(db_list):
    """Convert a list of typed attributes into a plain Python list.

    Uses the same decoding rules as ``db_map_to_py_dict``, so ``"N"`` values
    inside lists are converted to numbers as well.

    Args:
        db_list: List of ``{type_tag: value}`` dicts.

    Returns:
        A list holding one decoded value per type tag found.
    """
    return [
        _db_attr_to_py(type_tag, value)
        for attr in db_list
        for type_tag, value in attr.items()
    ]


def py_list_to_db_list(py_list):
    """Convert a Python list into a list of typed attributes.

    Uses the same encoding rules as ``py_dict_to_db_map``; elements of
    unsupported types are skipped.

    Args:
        py_list: The list to encode.

    Returns:
        A list of ``{type_tag: value}`` dicts.
    """
    db_list = []
    for value in py_list:
        attr = _py_value_to_db_attr(value)
        if attr is not None:
            db_list.append(attr)
    return db_list
|
|
| ''' |
| following functions are used for business logic. (to be moved to business logic layer) |
| ''' |
|
|
| |
def est_cost(n_tokens, rate):
    """Estimate the cost of *n_tokens*, treating *rate* as the price per 1000 tokens.

    Args:
        n_tokens: Number of tokens.
        rate: Price per 1000 tokens.

    Returns:
        ``rate * n_tokens / 1000`` rounded to 4 decimal places.
    """
    unrounded = rate * n_tokens / 1000
    return round(unrounded, 4)
|
|