Spaces:

darthPanda
/

table_detector

Runtime error

App Files Files Community

table_detector / app.py

darthPanda

uploader enabled on credentials only

45cfcf7 about 2 years ago

raw

history blame contribute delete

5.76 kB

	import streamlit as st
	import os
	import shutil
	import fitz
	import pandas as pd
	import easyocr
	from openai import OpenAI
	from dotenv import load_dotenv
	import ast

	# Pull settings from a local .env file (if one is present) into the process
	# environment via python-dotenv before the rest of the script runs.
	load_dotenv()

	@st.cache_data
	def convert_df(df):
	    """Serialize *df* to CSV text and return it as UTF-8 encoded bytes.

	    The function is wrapped in ``st.cache_data`` so the conversion is not
	    recomputed on every rerun.
	    """
	    csv_text = df.to_csv()
	    return csv_text.encode('utf-8')

	def list_files(directory):
	    """Yield the path of every file found beneath *directory*.

	    The tree is walked with ``os.walk``; each yielded path is the containing
	    folder joined with the file name.
	    """
	    for parent, _subdirs, filenames in os.walk(directory):
	        yield from (os.path.join(parent, filename) for filename in filenames)

	def correct_list(client, list_str):
	    """Ask the chat model to repair the syntax of a malformed Python list.

	    Args:
	        client: Object exposing ``chat.completions.create`` (the OpenAI client
	            in this file).
	        list_str: Text of the list that failed to parse.

	    Returns:
	        The content of the first choice in the model's reply.
	    """
	    repair_instruction = '''Above python list has syntax error.
	Correct the syntax without changing the values. Output should only be the corrected list.
	'''
	    # The broken list goes first so the instruction can refer to it as "Above".
	    user_message = {
	        "role": "user",
	        "content": list_str + repair_instruction,
	    }
	    response = client.chat.completions.create(
	        messages=[user_message],
	        model="gpt-3.5-turbo",
	    )
	    return response.choices[0].message.content

	# Start each script run from a clean slate: remove directories left behind by
	# a previous run. os.path.isdir is already False for paths that do not exist.
	for stale_dir in ('prediction', 'temp_pdf'):
	    if os.path.isdir(stale_dir):
	        shutil.rmtree(stale_dir)

	# Recreate the directory that receives the rendered page images.
	if not os.path.exists('temp_pdf'):
	    os.makedirs('temp_pdf')
	    print('not_found')
	else:
	    # Only reachable when 'temp_pdf' exists but is not a directory.
	    print('found')

	# Where the uploaded PDF is written before it is opened for OCR.
	temp_file_path = 'temp//temp.pdf'
	# Opening temp_file_path for writing raises FileNotFoundError when its parent
	# directory is missing, and nothing else in this file creates it.
	os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)


	@st.cache_resource
	def _load_ocr_reader():
	    """Build the English EasyOCR reader once and reuse it on later reruns.

	    Without caching, the reader would be constructed again every time this
	    script is executed from the top.
	    """
	    return easyocr.Reader(['en'])


	reader = _load_ocr_reader()

	def _ocr_pdf(pdf_path):
	    """Render the leading pages of *pdf_path* to PNG and return their OCR text.

	    At most four pages are processed. Each page is rasterised at 4x zoom into
	    ``temp_pdf/image_<i>.png`` and read with the module-level ``reader``.
	    The text of all processed pages is joined with single spaces.
	    """
	    page_limit = 4
	    mat = fitz.Matrix(4, 4)
	    page_texts = []
	    doc = fitz.open(pdf_path)
	    try:
	        page_total = len(doc)
	        if page_total > page_limit:
	            page_total = page_limit
	            st.error('Page limit exceeded. processing first 4 images')
	        for i in range(page_total):
	            st.markdown(f"Processing page {i+1}...")
	            image_path = os.path.join('temp_pdf', f"image_{i}.png")
	            doc.load_page(i).get_pixmap(matrix=mat).save(image_path)
	            words = reader.readtext(image_path, detail=0)
	            page_texts.append(' '.join(words))
	    finally:
	        # Release the document even when rendering or OCR raises.
	        doc.close()
	    # Join pages with a space so the last word of one page does not fuse with
	    # the first word of the next.
	    return ' '.join(page_texts)


	def _parse_model_reply(client, list_str):
	    """Parse the model reply as a Python literal, retrying once after a repair.

	    If the first ``ast.literal_eval`` fails, the text is sent to
	    ``correct_list`` and the repaired text is parsed instead. A failure of the
	    second parse propagates to the caller.
	    """
	    try:
	        return ast.literal_eval(list_str)
	    except (ValueError, SyntaxError, TypeError):
	        return ast.literal_eval(correct_list(client, list_str))


	def _records_to_frame(records):
	    """Convert a non-empty list of dicts into the two-column result table.

	    The first two keys of the first record select the course and score fields;
	    records lacking one of those keys contribute an empty cell.

	    Raises:
	        ValueError: If *records* is not a non-empty list of dicts, or the
	            first record has fewer than two keys.
	    """
	    if not isinstance(records, list) or not records:
	        raise ValueError('expected a non-empty list of records')
	    if not all(isinstance(record, dict) for record in records):
	        raise ValueError('every record must be a dict')

	    keys_list = list(records[0].keys())
	    print(keys_list)
	    if len(keys_list) < 2:
	        raise ValueError('records need a course key and a score key')

	    course_key, score_key = keys_list[0], keys_list[1]
	    rows = [[record.get(course_key), record.get(score_key)] for record in records]
	    return pd.DataFrame(rows, columns=['Courses', 'Grade'])


	def main():
	    """Run the Streamlit page: upload a transcript PDF, OCR it, and tabulate scores.

	    Flow:
	        1. Ask for the OpenAI credential; nothing else is shown until it is set.
	        2. Save the uploaded PDF to ``temp_file_path`` and OCR its first pages.
	        3. Ask the chat model for a list of course/score dicts and parse it.
	        4. Show the result as a table and offer it as a CSV download.
	    """
	    st.title("Transcript parser")

	    # Mask the key while typing and never echo it back onto the page.
	    credential = st.text_input('Credential', type='password')
	    if not credential:
	        return

	    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
	    if uploaded_file is None:
	        return

	    # Make sure the upload location exists before writing to it.
	    upload_dir = os.path.dirname(temp_file_path)
	    if upload_dir:
	        os.makedirs(upload_dir, exist_ok=True)
	    with open(temp_file_path, 'wb') as f:
	        f.write(uploaded_file.getbuffer())

	    with st.spinner('Performing OCR...'):
	        context = _ocr_pdf(temp_file_path)

	    print(context)

	    st.success('OCR completed')

	    with st.spinner('Parsing extracted text...'):
	        st.markdown('### Extracted data from transcripts')
	        base_prompt = '''Above is the OCR extracted transcript.
	Extract student's points/scores along with subject. Output should only be a lists of dict with course and points/scores as its keys.
	'''
	        client = OpenAI(
	            api_key=credential,
	        )

	        # Put the instruction on its own line so it does not run into the
	        # last OCR word.
	        prompt = f"{context}\n{base_prompt}"

	        chat_completion = client.chat.completions.create(
	            messages=[
	                {
	                    "role": "user",
	                    "content": prompt,
	                }
	            ],
	            model="gpt-3.5-turbo",
	        )

	        # The reply content may be missing; treat that as empty text.
	        list_str = chat_completion.choices[0].message.content or ''

	        print(list_str)

	        try:
	            actual_list = _parse_model_reply(client, list_str)
	            df = _records_to_frame(actual_list)
	        except (ValueError, SyntaxError, TypeError):
	            st.error('Could not parse the model response into a list of course/score records.')
	            return

	        st.dataframe(df)

	        csv = convert_df(df)

	        st.download_button(
	            label="Download Parsed transcript",
	            data=csv,
	            file_name='transcript.csv',
	            mime='text/csv',
	        )

	    st.success('Transcript Processing Completed!')


	# Run the app
	if __name__ == "__main__":
	    main()