Spaces:

amra-ai
/

profile

Runtime error

profile / utility.py

Roland Ding

1.0.1.1 ready for public release

617260a almost 3 years ago

4.24 kB

	import json
	import os
	import PyPDF2

	from application import *

	'''
	following functions are for file manipulation
	'''

	# read pdf file and return text
	def read_pdf(file_path):
	    """Extract the text of a PDF and save it to a ``.txt`` file next to it.

	    Args:
	        file_path: Path of the PDF, or an object that ``open`` rejects with
	            ``TypeError`` and that exposes the real path as ``.name``.

	    Returns:
	        A ``(text, metadata)`` tuple. ``text`` is the text of every page,
	        concatenated in page order and cleaned by ``remove_symbols``;
	        ``metadata`` is the reader's ``metadata`` attribute.
	    """
	    # open() raises TypeError for objects that are not path-like; in that
	    # case the path is taken from the object's ``name`` attribute.
	    try:
	        filename = file_path
	        pdf_file = open(file_path, 'rb')
	    except TypeError:
	        filename = file_path.name
	        pdf_file = open(filename, 'rb')

	    # The context manager closes the handle even if parsing raises.
	    with pdf_file:
	        reader = PyPDF2.PdfReader(pdf_file)
	        text = ''.join(page.extract_text() for page in reader.pages)
	        # NOTE(review): metadata may be resolved lazily from the stream, so
	        # it is read here, before the file is closed.
	        metadata = reader.metadata

	    text = remove_symbols(text)

	    # splitext strips only the final extension, so dots elsewhere in the
	    # path (e.g. "./docs/report.v2.pdf") no longer truncate the file name.
	    with open(f"{os.path.splitext(filename)[0]}.txt", "w") as f:
	        f.write(text)

	    return text, metadata

	'''
	following functions are for formatting the standard response
	'''

	# format standard response for status code and data
	def format_response(code,data):
	    """Wrap a status code and payload in the standard response dictionary.

	    Args:
	        code: Value stored under ``"statusCode"``.
	        data: JSON-serialisable payload; stored under ``"body"`` as a
	            JSON string.

	    Returns:
	        A dict with ``statusCode``, ``headers``, ``body`` and
	        ``isBase64Encoded`` (always ``False``).
	    """
	    response_headers = {
	        "Access-Control-Allow-Origin": "*",
	        "Content-Type": "application/json",
	    }
	    serialized_body = json.dumps(data)

	    response = {"statusCode": code}
	    response["headers"] = response_headers
	    response["body"] = serialized_body
	    response["isBase64Encoded"] = False
	    return response

	'''
	following functions are for string manipulation
	'''

	# format text output by removing excessive characters
	def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
	    """Strip unwanted character sequences from ``text``.

	    Args:
	        text: The string to clean.
	        remove_char_ls: Substrings to delete, applied in order. The default
	            removes the literal backslash sequences ``\\n--\\n`` and
	            ``\\n\\n`` and then real newline characters. The default is a
	            tuple so it cannot be mutated between calls; any iterable of
	            strings (including a list) is still accepted.

	    Returns:
	        ``text`` with every listed substring removed.
	    """
	    for fragment in remove_char_ls:
	        text = text.replace(fragment, "")

	    return text

	# function to remove non-ASCII characters and rejoin words hyphenated across line breaks
	def remove_symbols(text):
	    """Drop non-ASCII characters and rejoin words hyphenated across lines.

	    Args:
	        text: The string to clean.

	    Returns:
	        ``text`` with every character outside ASCII removed and every
	        ``"-\\n"`` sequence (hyphen followed by a newline) deleted.
	    """
	    # Encoding to ASCII with errors ignored discards anything non-ASCII.
	    ascii_only = text.encode("ascii", errors="ignore").decode()
	    # A hyphen directly before a newline marks a word broken across lines.
	    return ascii_only.replace('-\n', '')

	def str_to_tuple(s):
	    """Split a parenthesised, comma-separated string into a tuple of strings.

	    All ``(`` and ``)`` characters are removed, then the remainder is
	    split on commas. Elements are not stripped or converted.
	    """
	    without_parens = s.translate(str.maketrans("", "", "()"))
	    return tuple(without_parens.split(","))

	'''
	following functions are for dynamodb data manipulation
	'''
	# convert dynamodb map to python dictionary
	def _db_number_to_py(value):
	    """Convert a DynamoDB ``N`` value to ``int`` when integral, else ``float``."""
	    number = float(value)
	    if number % 1 == 0:
	        try:
	            # Converting the raw value keeps large integer strings exact.
	            return int(value)
	        except ValueError:
	            # Integral value written with a fraction or exponent, such as
	            # "1.0" or "1e2", which int() cannot parse directly.
	            return int(number)
	    return number


	def db_map_to_py_dict(db_map):
	    """Convert a DynamoDB-typed map into a plain Python dictionary.

	    Args:
	        db_map: Mapping of attribute name to a ``{type_tag: value}`` dict.

	    Returns:
	        A dict with the same keys. ``"M"`` values are converted recursively,
	        ``"L"`` values go through ``db_list_to_py_list``, ``"N"`` values
	        become ``int`` (when integral) or ``float``, and ``"S"`` or any
	        other type tag is passed through unchanged.
	    """
	    py_dict = {}
	    for key, typed_value in db_map.items():
	        for type_tag, value in typed_value.items():
	            if type_tag == "M":
	                py_dict[key] = db_map_to_py_dict(value)
	            elif type_tag == "N":
	                py_dict[key] = _db_number_to_py(value)
	            elif type_tag == "L":
	                py_dict[key] = db_list_to_py_list(value)
	            else:
	                # "S" and unrecognised tags keep their raw value.
	                py_dict[key] = value

	    return py_dict

	# convert python dictionary to dynamodb map
	def py_dict_to_db_map(py_dict):
	    """Convert a Python dictionary into a DynamoDB-typed map.

	    Keys are converted with ``str``. Values are tagged by their exact type:
	    ``str`` -> ``"S"``, ``int``/``float`` -> ``"N"`` (stored as the number
	    itself), ``dict`` -> ``"M"`` (recursively), ``list`` -> ``"L"`` (via
	    ``py_list_to_db_list``). Values of any other type, including ``bool``
	    and ``None``, are left out of the result.
	    """
	    db_map = {}
	    for raw_key, value in py_dict.items():
	        name = str(raw_key)
	        value_type = type(value)
	        if value_type is str:
	            typed = {"S": value}
	        elif value_type in (int, float):
	            typed = {"N": value}
	        elif value_type is dict:
	            typed = {"M": py_dict_to_db_map(value)}
	        elif value_type is list:
	            typed = {"L": py_list_to_db_list(value)}
	        else:
	            # Unsupported type: the key is omitted.
	            continue
	        db_map[name] = typed

	    return db_map

	# convert dynamodb list to python list
	def db_list_to_py_list(db_list):
	    """Convert a DynamoDB-typed list into a plain Python list.

	    Each element of ``db_list`` is a ``{type_tag: value}`` dict. ``"M"``
	    values go through ``db_map_to_py_dict``, ``"L"`` values are converted
	    recursively, and values under any other tag are appended unchanged.
	    """
	    converted = []
	    for entry in db_list:
	        for type_tag, payload in entry.items():
	            if type_tag == "M":
	                payload = db_map_to_py_dict(payload)
	            elif type_tag == "L":
	                payload = db_list_to_py_list(payload)
	            converted.append(payload)

	    return converted

	# convert python list to dynamodb list
	def py_list_to_db_list(py_list):
	    """Convert a Python list into a DynamoDB-typed list.

	    Elements are tagged by their exact type: ``str`` -> ``"S"``,
	    ``int``/``float`` -> ``"N"`` (stored as the number itself), ``dict`` ->
	    ``"M"`` (via ``py_dict_to_db_map``), ``list`` -> ``"L"`` (recursively).
	    Elements of any other type, including ``bool`` and ``None``, are
	    skipped, matching how ``py_dict_to_db_map`` treats unsupported values.

	    Args:
	        py_list: The list to convert.

	    Returns:
	        A list of single-key ``{type_tag: value}`` dicts.
	    """
	    db_list = []
	    for value in py_list:
	        value_type = type(value)
	        if value_type is str:
	            item = {"S": value}
	        # The earlier test `type(value) is int or float` was always true,
	        # so dicts and lists were wrongly tagged as numbers.
	        elif value_type in (int, float):
	            item = {"N": value}
	        elif value_type is dict:
	            item = {"M": py_dict_to_db_map(value)}
	        elif value_type is list:
	            item = {"L": py_list_to_db_list(value)}
	        else:
	            # Unsupported type: skip it so no stale or unbound item is added.
	            continue

	        db_list.append(item)

	    return db_list

	'''
	following functions are used for business logic. (to be moved to business logic layer)
	'''

	# function to calculate the estimated cost of the translation
	def est_cost(n_tokens,rate):
	    """Return the estimated cost for ``n_tokens`` at ``rate`` per 1000 tokens.

	    The result is rounded to four decimal places.
	    """
	    cost = rate * n_tokens / 1000
	    return round(cost, 4)