Spaces:

arithescientist
/

lincolnlegal

Sleeping

lincolnlegal / app.py

Ari

Update app.py

cffceba verified 4 months ago

1.81 kB

	import html
	import os

	import gradio as gr
	import pdfkit
	from docx import Document # For .docx handling
	from gtts import gTTS

	# Path to the wkhtmltopdf executable. The default is the location reported by
	# the Dockerfile build; set the WKHTMLTOPDF_PATH environment variable to point
	# at a different binary without editing this file. An unset or empty variable
	# falls back to the default.
	WKHTMLTOPDF_PATH = os.environ.get("WKHTMLTOPDF_PATH") or "/usr/local/bin/wkhtmltopdf"

	# Configure pdfkit to use the resolved wkhtmltopdf path
	config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)

	# Function to extract text from a .docx file and convert to PDF
	def docx_to_pdf(docx_file):
	    """Extract the text of a .docx upload and render it as a PDF and as speech.

	    Args:
	        docx_file: The uploaded file as delivered by the Gradio ``File``
	            input: either an object exposing its on-disk path as ``.name``
	            or a plain path string.

	    Returns:
	        A ``(audio_path, extracted_text, pdf_path)`` tuple on success. On any
	        failure the tuple is ``(None, "An error occurred: ...", None)`` so the
	        message is shown in the text output instead of an exception escaping
	        to the UI.
	    """
	    try:
	        if docx_file is None:
	            raise ValueError("no .docx file was uploaded")

	        # Accept both a file wrapper (path in ``.name``) and a bare path string.
	        docx_path = getattr(docx_file, "name", docx_file)

	        # Extract text from the .docx file, one line per paragraph
	        doc = Document(docx_path)
	        extracted_text = '\n'.join(para.text for para in doc.paragraphs)

	        # Fail early with a clear message instead of writing a blank PDF and
	        # then failing in the text-to-speech step, which needs some text.
	        if not extracted_text.strip():
	            raise ValueError("the document contains no text to convert")

	        # Escape the text before embedding it: raw '<' or '&' from the document
	        # would otherwise be parsed as markup by wkhtmltopdf.
	        html_content = (
	            "<html>\n"
	            '<head><meta charset="UTF-8"></head>\n'
	            f"<body><pre>{html.escape(extracted_text)}</pre></body>\n"
	            "</html>\n"
	        )

	        # Generate the PDF using pdfkit with the custom wkhtmltopdf path
	        pdf_output_path = "document_output.pdf"
	        pdfkit.from_string(html_content, pdf_output_path, configuration=config)

	        # Convert the text to audio using gTTS. gTTS writes MP3 data, so the
	        # file is named accordingly rather than mislabelled as .wav.
	        tts = gTTS(text=extracted_text, lang='en', slow=False)
	        audio_output_path = "document_audio.mp3"
	        tts.save(audio_output_path)

	        return audio_output_path, extracted_text, pdf_output_path

	    except Exception as e:
	        # UI boundary: report every failure in the text output slot.
	        return None, f"An error occurred: {str(e)}", None

	# Gradio interface: one .docx upload in; audio, text and PDF out.
	# The output components are listed in the same order as the tuple
	# returned by docx_to_pdf.
	iface = gr.Interface(
	    fn=docx_to_pdf,
	    inputs=gr.File(label="Upload .docx File"),
	    outputs=[
	        gr.Audio(label="Generated Audio"),
	        gr.Textbox(label="Extracted Text"),
	        gr.File(label="Generated PDF"),
	    ],
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()