# Hugging Face Space: filipzawadka/polish_whisper
# Status: Runtime error
# File size: 3,615 Bytes

import gradio as gr
from transformers import pipeline
import numpy as np
import requests
import subprocess
import os
import urllib.parse

# Sejm term number; interpolated into the API path as ".../sejm/term{term}/videos"
# when transcribe() fetches the video list.
term = 9

# Speech-recognition pipeline, created once at import time and reused by
# transcribe() for every request.
# NOTE(review): the model id suggests a Polish Whisper fine-tune; this is
# inferred from the name only.
transcriber = pipeline("automatic-speech-recognition", model="filipzawadka/whisper-small-pl-2")
# Alternative checkpoint, kept for reference:
#transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small.en")

def offset_time(link, start_offset, clip_length):
    """Return *link* with its ``startTime``/``stopTime`` window shifted and resized.

    The ``startTime`` query parameter of *link* is read as an integer,
    *start_offset* is added to it, and ``stopTime`` is set to the new start
    plus *clip_length*. Scheme, host, path, params and fragment are kept; the
    query string of the result contains only ``startTime`` and ``stopTime``
    (any other query parameters of *link* are not carried over).

    Args:
        link: URL whose query string contains a ``startTime`` value.
        start_offset: Amount added to the original ``startTime``, in the same
            unit the URL uses.
        clip_length: Distance between the new ``startTime`` and ``stopTime``.

    Both numeric arguments are truncated to ``int``. UI widgets such as
    ``gr.Number`` hand over floats, and without the coercion the URL would
    contain values like ``601000.0``.

    Returns:
        The rebuilt URL as a string.

    Raises:
        KeyError: If *link* has no ``startTime`` query parameter.
        ValueError: If ``startTime`` is not an integer literal.
    """
    parsed_url = urllib.parse.urlparse(link)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    # parse_qs maps each key to a list of values; the first one is used.
    start_time = int(query_params['startTime'][0]) + int(start_offset)
    stop_time = start_time + int(clip_length)

    new_query = urllib.parse.urlencode(
        {'startTime': start_time, 'stopTime': stop_time}
    )

    # Swap only the query component; everything else is reused unchanged.
    return urllib.parse.urlunparse(parsed_url._replace(query=new_query))

def get_sejm_videos(term):
    """Fetch the video list for Sejm term *term* from api.sejm.gov.pl.

    NOTE: a second ``get_sejm_videos`` (with optional filters) is defined
    later in this file; being defined later, it replaces this one at import
    time.

    Returns:
        The decoded JSON body on HTTP 200, otherwise the string
        ``"Error: <status code>"``.
    """
    response = requests.get(f"https://api.sejm.gov.pl/sejm/term{term}/videos")

    # Anything other than 200 is reported as an error string, not an exception.
    if response.status_code != 200:
        return f"Error: {response.status_code}"
    return response.json()

def get_today_sejm_videos(term, timeout=30):
    """Fetch the ``videos/today`` list for Sejm term *term*.

    Args:
        term: Term number interpolated into the API path
            (``/sejm/term{term}/videos/today``).
        timeout: Seconds passed to ``requests.get``. Without a timeout an
            unresponsive server would block the caller indefinitely.

    Returns:
        The decoded JSON body on HTTP 200, otherwise the string
        ``"Error: <status code>"``. Callers must check for the string case
        before indexing the result.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    url = f"https://api.sejm.gov.pl/sejm/term{term}/videos/today"

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code}"
    return response.json()
def get_sejm_videos(term, since=None, till=None, title=None, video_type=None, comm=None, *, timeout=30):
    """Fetch the video list for Sejm term *term*, optionally filtered.

    Args:
        term: Term number interpolated into the API path
            (``/sejm/term{term}/videos``).
        since, till, title, comm: Optional filters, forwarded unchanged as
            query parameters of the same name. Accepted formats are defined
            by the API, not by this function.
        video_type: Optional filter, forwarded as the ``type`` query
            parameter.
        timeout: Seconds passed to ``requests.get``. Without a timeout an
            unresponsive server would block the caller indefinitely.

    Falsy filter values (``None``, ``""``, ``0``) are left out of the request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise the string
        ``"Error: <status code>"``. Callers must check for the string case
        before indexing the result.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    base_url = f"https://api.sejm.gov.pl/sejm/term{term}/videos"

    # Map API parameter names to the supplied values, then drop unset ones.
    candidates = {
        'since': since,
        'till': till,
        'title': title,
        'type': video_type,
        'comm': comm,
    }
    params = {name: value for name, value in candidates.items() if value}

    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        return f"Error: {response.status_code}"
    return response.json()

def download_video(video_url, video_path, timeout=30, chunk_size=1024 * 1024):
    """Download *video_url* and write it to *video_path*.

    The body is streamed to disk in chunks rather than held in memory as a
    whole, and the HTTP connection is released when the function returns.

    Args:
        video_url: URL to fetch.
        video_path: Destination file path; overwritten if it exists.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled connection would block the caller indefinitely.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        ``True`` if the server answered 200 and the body was written,
        ``False`` (after printing the status code) otherwise.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout. If this happens mid-transfer, a partially written file
            may remain at *video_path*.
        OSError: If *video_path* cannot be opened or written.
    """
    with requests.get(video_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Error downloading video: {response.status_code}")
            return False

        with open(video_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    file.write(chunk)
    return True

def extract_audio(video_path, audio_path):
    """Extract the audio of *video_path* into *audio_path* using ffmpeg.

    Success requires both a zero ffmpeg exit code and the output file
    existing. Checking the file alone is not enough: if ffmpeg fails, a file
    left at *audio_path* by an earlier run would still be there and would be
    mistaken for fresh output.

    Args:
        video_path: Input media file.
        audio_path: Output audio file; ffmpeg is told to overwrite it (``-y``).

    Returns:
        ``True`` on success, ``False`` otherwise. A status line is printed in
        both cases.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not found.
    """
    # '-map a' selects the audio streams; '-q:a 0' sets the audio quality level.
    command = ['ffmpeg', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path, '-y']
    result = subprocess.run(command)

    if result.returncode == 0 and os.path.exists(audio_path):
        print("Audio extracted successfully.")
        return True

    print("Error extracting audio.")
    return False
 
# Example inputs, presumably (num1, num2) for transcribe() below: 600000, 10000

def transcribe(num1,num2):
    """Transcribe a clip cut from the first video of the configured Sejm term.

    Steps: fetch the video list for the module-level ``term``, take the first
    entry's ``videoLink``, shift its time window with ``offset_time``,
    download the clip to ``./video.mov``, extract ``./audio.mp3`` from it and
    run the module-level ``transcriber`` on the audio.

    Args:
        num1: Offset added to the link's ``startTime`` (truncated to int).
        num2: Clip length used to compute ``stopTime`` (truncated to int).

    Returns:
        The transcribed text, or a string starting with ``"Error"`` when an
        input is missing, the video list is unavailable or empty, the
        download fails, or no audio could be extracted.
    """
    video_path = "./video.mov"
    audio_path = "./audio.mp3"

    # An empty numeric input field arrives as None.
    if num1 is None or num2 is None:
        return "Error: both numbers are required."

    videos = get_sejm_videos(term)
    # get_sejm_videos reports a non-200 response as an "Error: <status>" string.
    if isinstance(videos, str):
        return videos
    if not videos:
        return "Error: no videos returned for this term."

    video_link = videos[0]['videoLink']
    print(video_link)

    # Numeric inputs arrive as floats; the URL needs plain integers.
    clip_url = offset_time(video_link, int(num1), int(num2))
    if not download_video(clip_url, video_path):
        return "Error: could not download the video clip."

    # Remove audio left over from an earlier request so that a failed
    # extraction cannot lead to transcribing stale data.
    if os.path.exists(audio_path):
        os.remove(audio_path)
    extract_audio(video_path, audio_path)
    if not os.path.exists(audio_path):
        return "Error: could not extract audio from the video clip."

    return transcriber(audio_path)["text"]


# Web UI: two numeric inputs are passed to transcribe(num1, num2) and the
# returned string is shown as text.
demo = gr.Interface(
    fn=transcribe,
    # Alternative input component, kept for reference:
    #inputs=gr.Audio(type="filepath"),
    # precision=0 makes Gradio deliver integers instead of floats; the values
    # end up in URL query parameters, where "600000.0" would be wrong.
    inputs=[
        gr.Number(label="Number 1", precision=0),
        gr.Number(label="Number 2", precision=0),
    ],
    outputs="text",
)

# Start the Gradio server when the script is run.
demo.launch()