import gradio as gr import os import tempfile import dwani import logging import re # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Configure dwani API settings from environment variables dwani.api_key = os.getenv("DWANI_API_KEY") dwani.api_base = os.getenv("DWANI_API_BASE_URL") if not dwani.api_key or not dwani.api_base: logger.error("API key or base URL not set. Please set DWANI_API_KEY and DWANI_API_BASE_URL environment variables.") raise RuntimeError("Please set DWANI_API_KEY and DWANI_API_BASE_URL environment variables.") # Language options for dropdowns (display name and code) language_options = [ ("English", "eng_Latn"), ("Kannada", "kan_Knda"), ("Hindi", "hin_Deva") ] language_names = [lang[0] for lang in language_options] lang_code_map = {lang[0]: lang[1] for lang in language_options} def parse_page_numbers(pages_str): """ Parse a string of comma-separated page numbers/ranges into a sorted list of unique integers. Example inputs: "1,3,5" "1-3,5" """ pages = set() for part in pages_str.split(","): part = part.strip() if "-" in part: try: start, end = map(int, part.split("-")) if start > end or start < 1: continue pages.update(range(start, end + 1)) except ValueError: continue else: try: page = int(part) if page >= 1: pages.add(page) except ValueError: continue return sorted(pages) def simple_format_resume(text): """ Basic formatting for resume text: - Convert '** text **' to bold text - Replace multiple dots or underscores with horizontal lines - Preserve line breaks with
""" # Convert ** bold ** to bold text = re.sub(r"\*\*\s*(.*?)\s*\*\*", r"\1", text) # Replace long sequences of dots or underscores with
text = re.sub(r"([.\-_])\1{5,}", "
", text) # Preserve line breaks text = text.replace('\r\n', '\n').replace('\r', '\n') text = text.replace('\n', '
') return text def results_to_html(results): """ Convert the results dictionary into an HTML formatted string, applying basic formatting to preserve resume style. """ html_lines = [] for page, content in results.items(): html_lines.append(f"

{page}

") if "error" in content: html_lines.append(f"

Error: {content['error']}

") else: html_lines.append("

Original Text:

") original_html = simple_format_resume(content.get('Original Text', '')) html_lines.append(f"
{original_html}
") response_text = content.get('Response', '') if response_text: html_lines.append(f"

Response:
{response_text}

") html_lines.append(f"

Processed Page: {content.get('Processed Page', '')}

") translated_html = simple_format_resume(content.get('Translated Response', '')) html_lines.append("

Translated Resume:

") html_lines.append(f"
{translated_html}
") html_lines.append("
") return "\n".join(html_lines) def results_to_markdown(results): """ Convert the results dictionary into a Markdown formatted string, preserving the translated resume content in code blocks. """ md_lines = [] for page, content in results.items(): md_lines.append(f"## {page}\n") if "error" in content: md_lines.append(f"**Error:** {content['error']}\n") else: md_lines.append("**Original Text:**\n\n```") md_lines.append(content.get('Original Text', '') + "\n") md_lines.append("```\n") response_text = content.get('Response', '') if response_text: md_lines.append("Response:\n\n" + response_text + "\n") md_lines.append("**Processed Page:** " + str(content.get('Processed Page', '')) + "\n") translated = content.get('Translated Response', '') md_lines.append("**Translated Resume:**\n\n```") md_lines.append(translated + "\n") md_lines.append("```\n") md_lines.append("\n---\n") return "\n".join(md_lines) def process_pdf(pdf_file, pages_str, prompt, src_lang, tgt_lang): logger.info(f"Processing PDF: {pdf_file}, Pages: {pages_str}, Prompt: {prompt}, Source: {src_lang}, Target: {tgt_lang}") if not pdf_file: return "Error: Please upload a PDF file", None if not prompt.strip(): return "Error: Please provide a non-empty prompt", None pages = parse_page_numbers(pages_str) if not pages: return "Error: Please provide valid page numbers (e.g., 1,3,5 or 1-3)", None src_lang_code = lang_code_map.get(src_lang) tgt_lang_code = lang_code_map.get(tgt_lang) if not src_lang_code or not tgt_lang_code: return "Error: Invalid source or target language selection", None file_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file results = {} for page_number in pages: try: # Call Dwani API (without 'prompt' arg as it's unsupported) result = dwani.Documents.run_extract( file_path=file_path, page_number=page_number, src_lang=src_lang_code, tgt_lang=tgt_lang_code ) logger.debug(f"API response for page {page_number}: {result}") page_data = None for p in result.get('pages', []): if p.get('processed_page') == page_number: page_data = p break if page_data is None: results[f"Page {page_number}"] = {"error": "No data returned for this page"} continue results[f"Page {page_number}"] = { "Processed Page": page_data.get("processed_page", "N/A"), "Original Text": page_data.get("page_content", "N/A"), "Translated Response": page_data.get("translated_content", "N/A"), "Response": "" } except dwani.exceptions.DwaniAPIError as e: logger.error(f"Dwani API error on page {page_number}: {e}") results[f"Page {page_number}"] = {"error": f"API error: {str(e)}"} except Exception as e: logger.error(f"Unexpected error on page {page_number}: {e}") results[f"Page {page_number}"] = {"error": f"Unexpected error: {str(e)}"} # Convert results to HTML for display html_text = results_to_html(results) # Save markdown for download markdown_text = results_to_markdown(results) temp_md_file = tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode='w', encoding='utf-8') temp_md_file.write(markdown_text) temp_md_file.close() return html_text, temp_md_file.name # Gradio UI with gr.Blocks(title="Resume Translator") as demo: gr.Markdown("# Resume Translator") gr.Markdown( "Upload your resume PDF, specify pages to translate, enter a prompt (e.g., 'Translate the resume'), " "and select source and target languages." ) with gr.Row(): with gr.Column(): pdf_input = gr.File(label="Upload Resume PDF", file_types=[".pdf"]) pages_input = gr.Textbox( label="Page Numbers", placeholder="e.g., 1,3,5 or 1-3", value="1", lines=1 ) prompt_input = gr.Textbox( label="Custom Prompt", placeholder="e.g., Translate the resume", value="Translate the resume", lines=2 ) src_lang_input = gr.Dropdown( label="Source Language", choices=language_names, value="English" ) tgt_lang_input = gr.Dropdown( label="Target Language", choices=language_names, value="Kannada" ) submit_btn = gr.Button("Translate") with gr.Column(): output_html = gr.HTML(label="Translated Resume Output") download_md = gr.File(label="Download Translated Resume (Markdown)") submit_btn.click( fn=process_pdf, inputs=[pdf_input, pages_input, prompt_input, src_lang_input, tgt_lang_input], outputs=[output_html, download_md] ) if __name__ == "__main__": demo.launch()