|
import json
import logging
import os

import gradio as gr
import librosa
import torch
from hugchat import hugchat
from hugchat.login import Login
from transformers import pipeline
|
|
|
|
|
# Application-wide logger; INFO level so model-load and chatbot lifecycle
# events are visible on the console.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


# Whisper checkpoint used for (Urdu) speech recognition.
MODEL_NAME = "openai/whisper-large-v3-turbo"

# transformers convention: CUDA device index 0 when a GPU is available,
# otherwise the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"


# Shared ASR pipeline. chunk_length_s=30 splits long recordings into 30 s
# windows so audio longer than Whisper's native context can be transcribed.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
|
|
|
|
|
# HuggingFace account credentials. These MUST come from the environment;
# never hard-code secrets (the previous in-source fallbacks were a leaked
# credential). When unset, login is skipped and the app degrades to
# transcription-only output (process_audio handles chatbot is None).
EMAIL = os.getenv("EMAIL")
PASSWD = os.getenv("PASSWD")


# Directory where hugchat persists session cookies between runs, so a
# successful login can be reused without re-authenticating.
cookie_path_dir = "./cookies/"
os.makedirs(cookie_path_dir, exist_ok=True)


# Global chatbot handle; stays None when credentials are missing or login
# fails, which downstream code treats as "chatbot unavailable".
chatbot = None
if EMAIL and PASSWD:
    try:
        sign = Login(EMAIL, PASSWD)
        cookies = sign.login(cookie_dir_path=cookie_path_dir, save_cookies=True)
        chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
        logger.info("Chatbot initialized successfully")
    except Exception as e:
        # Best-effort: log and continue, the app still works without HugChat.
        logger.error(f"Failed to initialize chatbot: {e}")
else:
    logger.warning("EMAIL/PASSWD not set; chatbot features disabled")
|
|
|
def transcribe_audio(audio_path):
    """Run Whisper ASR on a local audio file and return the Urdu transcript.

    On any failure (missing file, decode error, inference error) an
    "Error processing audio: ..." string is returned instead of raising,
    so callers can detect failure with a substring check.
    """
    try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio file not found")
        # Resample to 16 kHz mono — the input format Whisper expects.
        waveform, _rate = librosa.load(audio_path, sr=16000, mono=True)
        result = pipe(waveform, batch_size=8, generate_kwargs={"language": "urdu"})
        return result["text"]
    except Exception as exc:
        return f"Error processing audio: {str(exc)}"
|
|
|
def extract_info_from_filename(filename):
    """Parse agent, file_number, city, and country out of *filename*.

    Expected pattern: ``agentX_N_City_Country.ext``.  Returns the 4-tuple
    ``(agent, file_number, city, country)`` on success, or
    ``(None, None, None, None)`` when the name does not match.
    """
    try:
        fields = filename.split('_')
        if len(fields) < 4:
            raise ValueError("Filename must have at least 4 parts: agentX_N_City_Country")
        # Drop the file extension (everything from the first dot onward).
        country = fields[3].partition('.')[0]
        return fields[0], int(fields[1]), fields[2], country
    except (ValueError, IndexError):
        return None, None, None, None
|
|
|
def _fallback_record_json(agent, file_number, city, country, transcription, error=None):
    """Build a minimal, guaranteed-valid JSON document for when the chatbot
    is unavailable or fails.  Using json.dumps (instead of f-string
    interpolation) keeps the output valid even when the transcription
    contains quotes, braces, or backslashes."""
    record = {
        "Recording_name": "(unknown)",
        "agent": agent,
        "file_number": file_number,
        "city": city,
        "country": country,
        "transcription": transcription,
    }
    if error is not None:
        record["error"] = error
    return json.dumps({"records": [record]}, ensure_ascii=False)


def process_audio(audio_path):
    """Process one uploaded recording end to end.

    Steps: parse metadata from the filename, transcribe the audio, then ask
    the HugChat model to correct/translate the Urdu text and produce a
    crop/disease JSON report.

    Returns a 3-tuple of strings: (json_output, transcription, filename).
    """
    if not audio_path:
        return json.dumps({"error": "No audio file provided"}), "", ""

    filename = os.path.basename(audio_path)
    agent, file_number, city, country = extract_info_from_filename(filename)

    if agent is None:
        return (
            json.dumps({"error": "Invalid filename format. Use format: agentX_N_City_Country.wav"}),
            "",
            filename,
        )

    transcription = transcribe_audio(audio_path)
    # transcribe_audio() signals failure with an "Error ..." sentinel string.
    if "Error" in transcription:
        return json.dumps({"error": transcription}, ensure_ascii=False), transcription, filename

    if chatbot is None:
        logger.warning("Chatbot unavailable, returning transcription-only JSON")
        return (
            _fallback_record_json(agent, file_number, city, country, transcription),
            transcription,
            filename,
        )

    prompt = f"""
Correct the given Urdu text for grammar, word accuracy, and contextual meaning without adding anything extra.
Then, translate the corrected text into English.
Next, create a JSON file that detects crops and their diseases, following this format:
{{
    "records": [
        {{
            "Recording_name": "(unknown)",
            "agent": "{agent}",
            "file_number": {file_number},
            "city": "{city}",
            "country": "{country}",
            "crops": [
                {{
                    "name": "<detected_crop>",
                    "season": "<appropriate_season>",
                    "harvest_months": ["<months>"],
                    "regions": ["<regions>"],
                    "diseases": [
                        {{
                            "name": "<disease>",
                            "description": "<description>",
                            "wikipedia_link": "<link>"
                        }}
                    ]
                }}
            ],
            "issues": ["<detected_issues>"],
            "disease_linking": {{
                "<crop_name>": ["<disease_names>"]
            }}
        }}
    ]
}}
The Urdu text to process is:
{transcription}
Only provide the JSON output, do not include any additional text.
"""

    try:
        response = chatbot.chat(prompt).wait_until_done()
        return response, transcription, filename
    except Exception as e:
        logger.error(f"Chatbot processing failed: {e}")
        return (
            _fallback_record_json(
                agent, file_number, city, country, transcription,
                error=f"Chatbot processing failed: {str(e)}",
            ),
            transcription,
            filename,
        )
|
|
|
|
|
# Gradio UI: one audio upload in, three read-only text panes out
# (analysis JSON, raw Urdu transcription, and the processed filename).
with gr.Blocks(title="Audio Transcription and Crop Analysis") as interface:
    gr.Markdown("## Audio Transcription and Crop Disease Analysis")

    with gr.Row():
        # type="filepath" hands process_audio a path on disk, which it needs
        # both for librosa loading and for filename-based metadata parsing.
        audio_input = gr.Audio(type="filepath", label="Upload Audio File (e.g., agent1_2_Multan_Pakistan.wav)")

    with gr.Row():
        json_output = gr.Textbox(label="JSON Output", interactive=False, lines=10)
        transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False, lines=5)
        filename_output = gr.Textbox(label="Processed Filename", interactive=False)

    process_button = gr.Button("Process Audio")

    # Wire the button to the pipeline; outputs map 1:1 onto the tuple
    # returned by process_audio.
    process_button.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[json_output, transcription_output, filename_output],
    )

if __name__ == "__main__":
    # 0.0.0.0 exposes the app on all interfaces (e.g. inside a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)