Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import os | |
| import shutil | |
| import fitz | |
| import pandas as pd | |
| import easyocr | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import ast | |
| load_dotenv() | |
def convert_df(df):
    """Serialise *df* to CSV text and return it encoded as UTF-8 bytes.

    The CSV is produced by ``df.to_csv()`` with its default options.
    Note: nothing is cached here; the conversion runs on every call.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
def list_files(directory):
    """Lazily yield the path of every file found beneath *directory*.

    The tree is traversed with ``os.walk``; each yielded value is the walked
    folder joined with one of the file names found in it.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        yield from (os.path.join(folder, filename) for filename in filenames)
def _strip_code_fence(text):
    """Return *text* trimmed and without a surrounding Markdown ``` fence."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    lines = cleaned.splitlines()
    if len(lines) == 1:
        # Fence opened and closed on a single line, e.g. ```[1, 2]```.
        return cleaned.strip("`").strip()
    # Drop the opening fence line (it may carry a language tag such as "python").
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def correct_list(client, list_str, model="gpt-3.5-turbo"):
    """Ask the chat model to repair the syntax of a malformed Python list.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        list_str: Text of the list literal that failed to parse.
        model: Chat model name; the default is the model this function has
            always used.

    Returns:
        The model's reply as a string, with surrounding whitespace and any
        Markdown code fence removed so it can be fed to a literal parser.
        An empty string is returned when the reply carries no content.
    """
    instruction = (
        "Above python list has syntax error.\n"
        "Correct the syntax without changing the values. "
        "Output should only be the corrected list.\n"
    )
    # Keep the payload and the instruction on separate lines; plain
    # concatenation glued the instruction onto the last character of the list.
    prompt = f"{list_str}\n{instruction}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )
    reply = chat_completion.choices[0].message.content or ""
    return _strip_code_fence(reply)
# Start from a clean slate: remove working folders left by an earlier run.
for stale_dir in ('prediction', 'temp_pdf'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Folder that receives the page images rendered from the uploaded PDF.
if not os.path.exists('temp_pdf'):
    os.makedirs('temp_pdf')
    print('not_found')
else:
    print('found')

# The uploaded PDF is saved here. Its parent folder is a different directory
# from 'temp_pdf' and must exist before main() opens the path for writing;
# without it the first upload fails with FileNotFoundError.
temp_file_path = 'temp//temp.pdf'
os.makedirs(os.path.dirname(temp_file_path), exist_ok=True)

# English OCR reader shared by main(); created at module level.
reader = easyocr.Reader(['en'])
# Only this many leading pages of an uploaded PDF are OCR'd.
_MAX_PAGES = 4

# Exceptions ast.literal_eval is documented to raise on malformed input.
_LITERAL_EVAL_ERRORS = (
    ValueError,
    TypeError,
    SyntaxError,
    MemoryError,
    RecursionError,
)


def _ocr_pdf(pdf_path, max_pages=_MAX_PAGES):
    """Render the leading pages of *pdf_path* to PNG files and OCR them.

    Returns the recognised text of the processed pages joined by single
    spaces, so words at page boundaries do not run together.
    """
    os.makedirs('temp_pdf', exist_ok=True)
    render_matrix = fitz.Matrix(4, 4)  # 4x zoom on both axes
    page_texts = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        page_total = len(doc)
        if page_total > max_pages:
            st.error(f'Page limit exceeded. processing first {max_pages} images')
        for index in range(min(page_total, max_pages)):
            st.markdown(f"Processing page {index + 1}...")
            image_path = os.path.join('temp_pdf', f"image_{index}.png")
            doc.load_page(index).get_pixmap(matrix=render_matrix).save(image_path)
            page_texts.append(' '.join(reader.readtext(image_path, detail=0)))
    return ' '.join(page_texts)


def _parse_records(client, list_str):
    """Parse the model reply, asking the model once to repair bad syntax.

    Raises ValueError when the repaired text still cannot be parsed.
    """
    try:
        return ast.literal_eval(list_str)
    except _LITERAL_EVAL_ERRORS:
        repaired = correct_list(client, list_str)
    try:
        return ast.literal_eval(repaired)
    except _LITERAL_EVAL_ERRORS as err:
        raise ValueError('reply is not a valid Python literal') from err


def _records_to_frame(records):
    """Build the two-column result table from the parsed list of dicts.

    The first two keys of the first record select the course and score
    fields; a later record that lacks one of them gets an empty cell.
    Raises ValueError when *records* is not a non-empty list of dicts whose
    first entry has at least two keys.
    """
    if not isinstance(records, (list, tuple)) or not records:
        raise ValueError('expected a non-empty list of records')
    if not all(isinstance(record, dict) for record in records):
        raise ValueError('every record must be a dict')
    keys_list = list(records[0])
    print(keys_list)
    if len(keys_list) < 2:
        raise ValueError('records need a course key and a score key')
    course_key, score_key = keys_list[:2]
    rows = [[record.get(course_key), record.get(score_key)] for record in records]
    return pd.DataFrame(rows, columns=['Courses', 'Grade'])


def main():
    """Render the Streamlit page: upload a transcript PDF, OCR it, list scores.

    Flow: read the API key, save the uploaded PDF, OCR its leading pages,
    ask the chat model for a list of course/score dicts, then show the table
    and offer it as a CSV download. Problems with the key or with the model
    reply are reported on the page instead of raising.
    """
    st.title("Transcript parser")

    # Mask the key while it is typed and never render it back onto the page.
    credential = st.text_input('Credential', type='password')
    if not credential:
        # Nothing typed: fall back to a key supplied through the environment.
        credential = os.environ.get("OPENAI_API_KEY", "")

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return
    if not credential:
        st.error('Enter an OpenAI API key before processing a transcript.')
        return

    # The target folder is not guaranteed to exist; create it before writing.
    pdf_dir = os.path.dirname(temp_file_path)
    if pdf_dir:
        os.makedirs(pdf_dir, exist_ok=True)
    with open(temp_file_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    with st.spinner('Performing OCR...'):
        context = _ocr_pdf(temp_file_path)
    print(context)
    st.success('OCR completed')

    with st.spinner('Parsing extracted text...'):
        st.markdown('### Extracted data from transcripts')
        base_prompt = (
            "Above is the OCR extracted transcript.\n"
            "Extract student's points/scores along with subject. "
            "Output should only be a lists of dict with course and "
            "points/scores as its keys.\n"
        )
        client = OpenAI(api_key=credential)
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    # Newline keeps the instruction off the last OCR word.
                    "content": f"{context}\n{base_prompt}",
                }
            ],
            model="gpt-3.5-turbo",
        )
        list_str = chat_completion.choices[0].message.content or ""
        print(list_str)

        try:
            df = _records_to_frame(_parse_records(client, list_str))
        except ValueError as exc:
            st.error(f'Could not read course data from the model reply: {exc}')
            return

    st.dataframe(df)
    csv = convert_df(df)
    st.download_button(
        label="Download Parsed transcript",
        data=csv,
        file_name='transcript.csv',
        mime='text/csv',
    )
    st.success('Transcript Processing Completed!')
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()