# Voice-To-Text / app.py
import gradio as gr
import requests
from fpdf import FPDF
import nltk
import os
import tempfile
from nltk.tokenize import sent_tokenize
import random
from groq import Groq
api_key = os.environ.get("GROQ_API_KEY")
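# Note: GROQ_API_KEY is assumed to be set in the environment (e.g. as a Hugging Face
# Space secret); if it is missing, the API requests below will fail with an
# authentication error rather than at import time.
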
# Attempt to download the punkt tokenizer; fall back to a simple splitter if it fails.
try:
    nltk.download("punkt")
except Exception:
    print("NLTK punkt tokenizer download failed. Using custom tokenizer.")

def custom_sent_tokenize(text):
    return text.split(". ")
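
# transcribe() sends the recorded audio to Groq's Whisper transcription endpoint and,
# on success, passes the transcript to generate_notes(); on failure it returns an
# error-report PDF instead, so the Gradio file output always has something to serve.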
def transcribe(audio_path):
    with open(audio_path, "rb") as audio_file:
        audio_data = audio_file.read()

    groq_api_endpoint = "https://api.groq.com/openai/v1/audio/transcriptions"

    headers = {
        "Authorization": f"Bearer {api_key}",
    }
    files = {
        'file': ('audio.wav', audio_data, 'audio/wav'),
    }
    data = {
        'model': 'whisper-large-v3-turbo',
        'response_format': 'json',
        'language': 'en',
    }

    response = requests.post(groq_api_endpoint, headers=headers, files=files, data=data)

    if response.status_code == 200:
        result = response.json()
        transcript = result.get("text", "No transcription available.")
        return generate_notes(transcript)
    else:
        error_msg = response.json().get("error", {}).get("message", "Unknown error.")
        print(f"API Error: {error_msg}")
        return create_error_pdf(f"API Error: {error_msg}")
def generate_notes(transcript):
    # Earlier heuristic approach, kept for reference: derive questions directly from
    # NLTK sentence tokenization instead of calling the LLM.
    # try:
    #     sentences = sent_tokenize(transcript)
    # except LookupError:
    #     sentences = custom_sent_tokenize(transcript)
    #
    # # Generate long questions
    # long_questions = [f"Explain the concept discussed in: '{sentence}'." for sentence in sentences[:5]]
    #
    # # Generate short questions
    # short_questions = [f"What does '{sentence.split()[0]}' mean in the context of this text?" for sentence in sentences[:5]]
    #
    # # Generate MCQs with relevant distractors
    # mcqs = []
    # for sentence in sentences[:5]:
    #     if len(sentence.split()) > 1:  # Ensure there are enough words to create meaningful options
    #         key_word = sentence.split()[0]  # Use the first word as a key term
    #         distractors = ["Term A", "Term B", "Term C"]  # Replace with relevant terms if needed
    #         options = [key_word] + distractors
    #         random.shuffle(options)  # Shuffle options for randomness
    #         mcq = {
    #             "question": f"What is '{key_word}' based on the context?",
    #             "options": options,
    #             "answer": key_word
    #         }
    #         mcqs.append(mcq)
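
    # Current approach: ask a Groq-hosted Llama 3 chat model to generate the questions
    # and notes from the transcript.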
    # Use the API key from the environment rather than a hard-coded credential.
    client = Groq(api_key=api_key)
    chat_completion = client.chat.completions.create(
        #
        # Required parameters
        #
        messages=[
            # Set an optional system message. This sets the behavior of the
            # assistant and can be used to provide specific instructions for
            # how it should behave throughout the conversation.
            {
                "role": "system",
                "content": (
                    "You are an expert at generating questions from content. "
                    "Generate one long question, as many short questions as possible, "
                    "and multiple-choice questions. Please also provide notes."
                )
            },
            # Set a user message for the assistant to respond to.
            {
                "role": "user",
                "content": transcript,
            }
        ],
        # The language model which will generate the completion.
        model="llama3-8b-8192",
        #
        # Optional parameters
        #
        # Controls randomness: lowering results in less random completions.
        # As the temperature approaches zero, the model will become deterministic
        # and repetitive.
        temperature=0.5,
        # The maximum number of tokens to generate in the completion.
        max_tokens=1024,
        # Controls diversity via nucleus sampling: 0.5 means half of all
        # likelihood-weighted options are considered.
        top_p=1,
        # A stop sequence is a predefined or user-specified text string that
        # signals the model to stop generating content, keeping its responses
        # focused and concise. Examples include punctuation marks and
        # markers like "[end]".
        stop=None,
        # If set, partial message deltas will be sent instead of a single response.
        stream=False,
    )

    # Extract the completion text returned by the LLM.
    res = chat_completion.choices[0].message.content

    # Generate and save a structured PDF.
    pdf_path = create_pdf(res, transcript)
    return pdf_path
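
# create_pdf() lays out the transcript and the LLM-generated questions/notes with FPDF.
# Text is re-encoded to latin-1 with replacement characters because the classic FPDF
# core fonts cannot represent arbitrary Unicode.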
def create_pdf(question, transcript):
    pdf = FPDF()
    pdf.add_page()

    # Add title
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, "Transcription Notes and Questions", ln=True, align="C")

    # Add transcription content
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 10, f"Transcription:\n{transcript.encode('latin1', 'replace').decode('latin1')}\n\n")

    # Add the LLM-generated questions and notes
    pdf.set_font("Arial", "B", 14)
    pdf.cell(200, 10, "Questions", ln=True)
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 10, f"- {question.encode('latin1', 'replace').decode('latin1')}\n")
    # Earlier heuristic output sections (short questions and MCQs), kept for reference:
    # # Add short questions
    # pdf.set_font("Arial", "B", 14)
    # pdf.cell(200, 10, "Short Questions", ln=True)
    # pdf.set_font("Arial", "", 12)
    # for question in short_questions:
    #     pdf.multi_cell(0, 10, f"- {question.encode('latin1', 'replace').decode('latin1')}\n")
    #
    # # Add MCQs
    # pdf.set_font("Arial", "B", 14)
    # pdf.cell(200, 10, "Multiple Choice Questions (MCQs)", ln=True)
    # pdf.set_font("Arial", "", 12)
    # for mcq in mcqs:
    #     pdf.multi_cell(0, 10, f"Q: {mcq['question'].encode('latin1', 'replace').decode('latin1')}")
    #     for option in mcq["options"]:
    #         pdf.multi_cell(0, 10, f"  - {option.encode('latin1', 'replace').decode('latin1')}")
    #     pdf.multi_cell(0, 10, f"Answer: {mcq['answer'].encode('latin1', 'replace').decode('latin1')}\n")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        pdf.output(temp_pdf.name)
        pdf_path = temp_pdf.name

    return pdf_path
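
# create_error_pdf() produces a short PDF containing the error message, so the Gradio
# file output still has a downloadable result when the transcription request fails.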
def create_error_pdf(message):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, "Error Report", ln=True, align="C")
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 10, message.encode('latin1', 'replace').decode('latin1'))

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        pdf.output(temp_pdf.name)
        error_pdf_path = temp_pdf.name

    return error_pdf_path
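
# Gradio wiring: the audio input is passed to transcribe() as a file path, and the
# returned PDF path is exposed as a downloadable file.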
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.File(label="Download PDF with Notes or Error Report"),
    title="Voice to Text Converter and Notes Generator",
    description="This app converts audio to text and generates academic questions including long, short, and multiple-choice questions.",
)

iface.launch()
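
# When run locally, Gradio serves the interface at http://127.0.0.1:7860 by default;
# on Hugging Face Spaces, the launched app is picked up automatically by the Space runtime.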