Spaces:
Running
Running
import requests | |
import pandas as pd | |
from io import BytesIO | |
from markitdown import MarkItDown | |
from langchain_core.tools import tool | |
def extract_transcript_from_youtube(url: str) -> str:
    """
    Extracts the transcript from a YouTube video given its URL.

    The URL is converted with MarkItDown (plugins enabled) and everything from
    the first "### Transcript" heading onward is returned.

    Args:
        url (str): The YouTube video URL.

    Returns:
        transcript (str): The transcript section, starting with the
        "### Transcript" heading. If the converted text contains no such
        heading, the full converted text is returned unchanged. If the
        conversion raises, an error message describing the failure is returned.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)
    try:
        result = md.convert(url)
    except Exception as e:
        # Report the failure as text instead of raising, so callers always get a str.
        return f"Failed to extract transcript from YouTube video: {str(e)}"

    text = result.text_content
    # partition() cuts at the FIRST heading only. An unbounded split() would
    # also cut at any later occurrence of the marker and silently drop the
    # remainder of the transcript.
    _, marker, body = text.partition(transcript_str)
    if not marker:
        # No transcript heading found: fall back to the whole converted text.
        return text
    return (transcript_str + "\n" + body).strip()
def extract_data_from_excel(url: str) -> str:
    """
    Downloads and extracts data from an Excel file at the given URL.

    Columns whose header label starts with "Unnamed" are dropped, and every
    numeric column is converted to float before rendering.

    Args:
        url (str): The URL of the Excel file.

    Returns:
        str: A string representation (without the index) of the data in the
        first sheet of the Excel file, or an error message if the download or
        parsing fails.
    """
    # Without a timeout, requests.get() can block forever on an unresponsive server.
    request_timeout_s = 30
    try:
        response = requests.get(url, timeout=request_timeout_s)
        response.raise_for_status()
        df = pd.read_excel(BytesIO(response.content))

        # Header labels are not always strings (e.g. integer years), and the
        # .str accessor rejects non-string labels, so cast to str before
        # matching the "Unnamed" prefix.
        unnamed = df.columns.astype(str).str.startswith("Unnamed")
        df = df.loc[:, ~unnamed]

        # Convert all numeric columns to float in one pass.
        numeric_cols = df.select_dtypes(include=["number"]).columns
        df = df.astype({col: float for col in numeric_cols})

        return df.to_string(index=False)
    except Exception as e:
        # Report the failure as text instead of raising, so callers always get a str.
        return f"Failed to process Excel file from URL: {str(e)}"
def extract_transcript_from_audio(url: str) -> str:
    """
    Transcribes the audio file located at the given URL.

    Supported formats: mp3, wav.

    Args:
        url (str): The URL of the audio file.

    Returns:
        str: The text content produced by the conversion, or an error message
        if the conversion raises.
    """
    converter = MarkItDown(enable_plugins=True)
    try:
        conversion = converter.convert(url)
    except Exception as e:
        # Conversion failures are reported as text rather than propagated.
        return f"Failed to extract transcript from audio: {str(e)}"
    else:
        return conversion.text_content