import os

import gradio as gr
import pandas as pd
from lexoid.api import parse

parser_options = ["LLM_PARSE", "STATIC_PARSE", "AUTO"]


# Set the API-key environment variables and parse the uploaded document
def run_parser(
    file,
    parser_type,
    model,
    pages_per_split,
    max_processes,
    as_pdf,
    x_tolerance,
    y_tolerance,
    save_dir,
    page_nums,
    router_priority,
    framework,
    temperature,
    depth,
    google_api_key,
    openai_api_key,
    huggingfacehub_api_token,
    together_api_key,
    openrouter_api_key,
):
    # Set API keys as environment variables (skip empty fields so keys
    # already present in the environment are not overwritten)
    for name, value in {
        "GOOGLE_API_KEY": google_api_key,
        "OPENAI_API_KEY": openai_api_key,
        "HUGGINGFACEHUB_API_TOKEN": huggingfacehub_api_token,
        "TOGETHER_API_KEY": together_api_key,
        "OPENROUTER_API_KEY": openrouter_api_key,
    }.items():
        if value:
            os.environ[name] = value

    if file is None:
        return "Please upload a file to parse."

    kwargs = {
        "model": model,
        "pages_per_split": pages_per_split,
        "max_processes": max_processes,
        "as_pdf": as_pdf,
        "x_tolerance": x_tolerance,
        "y_tolerance": y_tolerance,
        "save_dir": save_dir,
        "page_nums": (
            [int(num.strip()) for num in page_nums.split(",")] if page_nums else None
        ),
        "router_priority": router_priority,
        "framework": framework,
        "temperature": temperature,
        "depth": depth,
    }
    # Drop unset options so parse() falls back to its own defaults
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # gr.File(type="filepath") passes a path string; fall back to .name if a
    # file-like object is received instead
    path = file if isinstance(file, str) else file.name
    result = parse(path=path, parser_type=parser_type, **kwargs)

    if "raw" in result:
        return result["raw"]
    elif "segments" in result:
        return "\n\n".join([seg.get("content", "") for seg in result["segments"]])
    else:
        return str(result)


with gr.Blocks(title="Lexoid Document Parser") as app:
    gr.Markdown(
        "## 📄 Lexoid Document Parser\n"
        "Upload a document and customize how you'd like to parse it."
    )

    with gr.Row():
        file_input = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".docx", ".html", ".txt"],
            type="filepath",
        )
        parser_type = gr.Dropdown(
            choices=parser_options, value="AUTO", label="Parser Type"
        )
        model_input = gr.Textbox(value="gemini-2.0-flash", label="LLM ID")
        framework = gr.Textbox(
            value="pdfplumber",
            label="Static Framework",
            placeholder="e.g., pdfplumber, slate",
        )

    with gr.Accordion("Advanced Options", open=False):
        pages_per_split = gr.Slider(
            minimum=1, maximum=20, value=4, step=1, label="Pages per Split"
        )
        max_processes = gr.Slider(
            minimum=1, maximum=16, value=4, step=1, label="Max Parallel Processes"
        )
        as_pdf = gr.Checkbox(label="Convert to PDF before parsing")
        x_tolerance = gr.Number(label="X-axis Tolerance", value=None)
        y_tolerance = gr.Number(label="Y-axis Tolerance", value=None)
        save_dir = gr.Textbox(
            label="Save Directory",
            placeholder="Path to save intermediate files (optional)",
        )
        page_nums = gr.Textbox(
            label="Page Numbers",
            placeholder="Comma-separated page numbers (e.g., 1,3,5)",
        )
        router_priority = gr.Dropdown(
            choices=["speed", "accuracy"], value="accuracy", label="Router Priority"
        )
        temperature = gr.Number(label="LLM Temperature", value=None)
        depth = gr.Number(label="Recursive Depth", value=None)

    # Text boxes for the API-key environment variables
    with gr.Row():
        google_api_key = gr.Textbox(
            label="Google API Key", placeholder="Enter Google API Key"
        )
        openai_api_key = gr.Textbox(
            label="OpenAI API Key", placeholder="Enter OpenAI API Key"
        )
        huggingfacehub_api_token = gr.Textbox(
            label="HuggingFaceHub API Token",
            placeholder="Enter HuggingFaceHub API Token",
        )
        together_api_key = gr.Textbox(
            label="Together API Key", placeholder="Enter Together API Key"
        )
        openrouter_api_key = gr.Textbox(
            label="OpenRouter API Key", placeholder="Enter OpenRouter API Key"
        )

    output = gr.Markdown(label="Parsed Output")
    parse_button = gr.Button("Parse Document")

    parse_button.click(
        fn=run_parser,
        inputs=[
            file_input,
            parser_type,
            model_input,
            pages_per_split,
            max_processes,
            as_pdf,
            x_tolerance,
            y_tolerance,
            save_dir,
            page_nums,
            router_priority,
            framework,
            temperature,
            depth,
            google_api_key,
            openai_api_key,
            huggingfacehub_api_token,
            together_api_key,
            openrouter_api_key,
        ],
        outputs=output,
    )

    # Leaderboard loaded from leaderboard.csv
    df = pd.read_csv("leaderboard.csv")
    leaderboard = gr.Dataframe(
        value=df,
        label="Leaderboard",
    )

app.launch()