import torch
from transformers import pipeline
import librosa
import os
import json
from hugchat import hugchat
from hugchat.login import Login
import gradio as gr
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model and device configuration for transcription
MODEL_NAME = "openai/whisper-large-v3-turbo"
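# transformers pipelines accept a CUDA device index (0 = first GPU) or the string "cpu"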
device = 0 if torch.cuda.is_available() else "cpu"
# Initialize Whisper pipeline
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
chunk_length_s=30,
device=device,
)
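# chunk_length_s=30 splits long recordings into 30-second windows, matching the
# 30 s of audio Whisper natively processes, so arbitrarily long files can be transcribed.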
# Hugging Face Chatbot credentials from environment variables (set EMAIL and
# PASSWD as Space secrets; never hardcode real credentials in the source)
EMAIL = os.getenv("EMAIL")
PASSWD = os.getenv("PASSWD")
# Directory to save cookies
cookie_path_dir = "./cookies/"
os.makedirs(cookie_path_dir, exist_ok=True)
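# hugchat saves session cookies here so later runs can reuse the login instead of re-authenticating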
# Initialize chatbot with error handling
chatbot = None
try:
    if not EMAIL or not PASSWD:
        raise ValueError("EMAIL and PASSWD environment variables must be set")
    sign = Login(EMAIL, PASSWD)
    cookies = sign.login(cookie_dir_path=cookie_path_dir, save_cookies=True)
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
logger.info("Chatbot initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize chatbot: {e}")
def transcribe_audio(audio_path):
"""Transcribe a local audio file using the Whisper pipeline."""
try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        # Resample to 16 kHz mono, the rate Whisper's feature extractor expects;
        # the pipeline treats a bare NumPy array as a waveform at that rate
        audio, _ = librosa.load(audio_path, sr=16000, mono=True)
        transcription = pipe(audio, batch_size=8, generate_kwargs={"language": "urdu"})["text"]
return transcription
except Exception as e:
return f"Error processing audio: {str(e)}"
def extract_info_from_filename(filename):
"""Extract agent, file_number, city, and country from the filename."""
try:
parts = filename.split('_')
if len(parts) < 4:
raise ValueError("Filename must have at least 4 parts: agentX_N_City_Country")
agent = parts[0]
file_number = int(parts[1])
city = parts[2]
country = parts[3].split('.')[0] # Remove file extension if present
return agent, file_number, city, country
except (ValueError, IndexError):
return None, None, None, None
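# Example, following the agentX_N_City_Country convention:
#   extract_info_from_filename("agent1_2_Multan_Pakistan.wav")
#   -> ("agent1", 2, "Multan", "Pakistan")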
def process_audio(audio_path):
"""Process audio: Extract info from filename, transcribe, and generate JSON."""
if not audio_path:
return '{"error": "No audio file provided"}', "", ""
# Extract filename and info
filename = os.path.basename(audio_path)
agent, file_number, city, country = extract_info_from_filename(filename)
if agent is None:
return '{"error": "Invalid filename format. Use format: agentX_N_City_Country.wav"}', "", filename
# Transcribe audio
transcription = transcribe_audio(audio_path)
if "Error" in transcription:
return f'{{"error": "{transcription}"}}', transcription, filename
# Fallback JSON if chatbot is not initialized
if chatbot is None:
logger.warning("Chatbot unavailable, returning transcription-only JSON")
return (
f'{{"records": [{{"Recording_name": "{filename}", "agent": "{agent}", "file_number": {file_number}, '
f'"city": "{city}", "country": "{country}", "transcription": "{transcription}"}}]}}',
transcription,
filename
)
# Construct prompt with extracted data
prompt = f"""
Correct the given Urdu text for grammar, word accuracy, and contextual meaning without adding anything extra.
Then, translate the corrected text into English.
Next, produce JSON that lists the crops and diseases detected in the text, following this format:
{{
"records": [
{{
"Recording_name": "{filename}",
"agent": "{agent}",
"file_number": {file_number},
"city": "{city}",
"country": "{country}",
"crops": [
{{
"name": "<detected_crop>",
"season": "<appropriate_season>",
"harvest_months": ["<months>"],
"regions": ["<regions>"],
"diseases": [
{{
"name": "<disease>",
"description": "<description>",
"wikipedia_link": "<link>"
}}
]
}}
],
"issues": ["<detected_issues>"],
"disease_linking": {{
"<crop_name>": ["<disease_names>"]
}}
}}
]
}}
The Urdu text to process is:
{transcription}
Only provide the JSON output; do not include any additional text.
"""
# Process with chatbot and return JSON
try:
response = chatbot.chat(prompt).wait_until_done()
return response, transcription, filename
    except Exception as e:
        logger.error(f"Chatbot processing failed: {e}")
        fallback = {
            "records": [{
                "Recording_name": filename,
                "agent": agent,
                "file_number": file_number,
                "city": city,
                "country": country,
                "transcription": transcription,
                "error": f"Chatbot processing failed: {e}",
            }]
        }
        return json.dumps(fallback, ensure_ascii=False), transcription, filename
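# Example (hypothetical path; all metadata is parsed from the filename itself):
#   json_str, transcription, name = process_audio("/tmp/agent1_2_Multan_Pakistan.wav")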
# Gradio Interface
with gr.Blocks(title="Audio Transcription and Crop Analysis") as interface:
gr.Markdown("## Audio Transcription and Crop Disease Analysis")
with gr.Row():
audio_input = gr.Audio(type="filepath", label="Upload Audio File (e.g., agent1_2_Multan_Pakistan.wav)")
with gr.Row():
json_output = gr.Textbox(label="JSON Output", interactive=False, lines=10)
transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False, lines=5)
filename_output = gr.Textbox(label="Processed Filename", interactive=False)
process_button = gr.Button("Process Audio")
process_button.click(
fn=process_audio,
inputs=[audio_input],
outputs=[json_output, transcription_output, filename_output],
)
if __name__ == "__main__":
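    # Bind to 0.0.0.0:7860, the default host/port a Hugging Face Space exposes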
interface.launch(server_name="0.0.0.0", server_port=7860)