super_agent

Running

super_agent / tools /extraction.py

lezaf

Update tools and prompt

9c88759 7 days ago

2.3 kB

	import requests
	import pandas as pd
	from io import BytesIO
	from markitdown import MarkItDown
	from langchain_core.tools import tool

	@tool
	def extract_transcript_from_youtube(url: str) -> str:
	    """
	    Extracts the transcript from a YouTube video given its URL.

	    The URL is converted with MarkItDown (plugins enabled). If the converted
	    text contains a "### Transcript" heading, everything from the first such
	    heading onward is returned; otherwise the whole converted text is returned.

	    Args:
	        url (str): The YouTube video URL.
	    Returns:
	        transcript (str): The transcript of the video, or an error message if extraction fails.
	    """
	    transcript_str = "### Transcript"
	    md = MarkItDown(enable_plugins=True)

	    try:
	        result = md.convert(url)
	    except Exception as e:
	        # Tool boundary: report the failure as text instead of raising.
	        return f"Failed to extract transcript from YouTube video: {str(e)}"

	    # partition() splits on the FIRST heading only, so a transcript whose own
	    # text happens to contain the marker again is no longer cut short.
	    _, heading, body = result.text_content.partition(transcript_str)
	    if not heading:
	        # No transcript section found: fall back to the full converted text.
	        return result.text_content

	    return (transcript_str + "\n" + body).strip()


	@tool
	def extract_data_from_excel(url: str) -> str:
	    """
	    Downloads and extracts data from an Excel file at the given URL.

	    Columns whose header starts with "Unnamed" are dropped and every numeric
	    column is converted to float before the table is rendered as text.

	    Args:
	        url (str): The URL of the Excel file.

	    Returns:
	        str: A string representation of the data in the first sheet of the Excel file,
	            or an error message if the download or parsing fails.
	    """
	    try:
	        # Bound the request so an unresponsive server cannot block the tool
	        # forever; a timeout surfaces through the error message below.
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()

	        df = pd.read_excel(BytesIO(response.content))

	        # Headers are not guaranteed to be strings (e.g. numeric headers), so
	        # cast them before matching the "Unnamed" prefix.
	        unnamed = df.columns.astype(str).str.startswith("Unnamed")
	        # Work on an explicit copy so the column assignments below never
	        # target a slice of the parsed frame.
	        df = df.loc[:, ~unnamed].copy()

	        # Convert all numeric columns to float
	        for col in df.select_dtypes(include=["number"]).columns:
	            df[col] = df[col].astype(float)

	        return df.to_string(index=False)

	    except Exception as e:
	        # Tool boundary: report the failure as text instead of raising.
	        return f"Failed to process Excel file from URL: {str(e)}"


	@tool
	def extract_transcript_from_audio(url: str) -> str:
	    """
	    Transcribes the audio file located at the given URL.
	    Supported formats: mp3, wav.

	    Args:
	        url (str): Location of the audio file to transcribe.
	    Returns:
	        str: The text produced for the audio file, or an error message if the conversion raises.
	    """
	    converter = MarkItDown(enable_plugins=True)

	    try:
	        conversion = converter.convert(url)
	    except Exception as exc:
	        # Report conversion failures as text instead of raising.
	        return f"Failed to extract transcript from audio: {str(exc)}"
	    else:
	        return conversion.text_content