Samarth991's picture
adding you tube processing LLM
2b89dc1
raw
history blame
No virus
5.22 kB
import os
import gradio as gr
import time
import logging
from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi
from . import chatops
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Device handed to the sentence-transformer embedding model through model_kwargs.
DEVICE = 'cpu'
# Upper bound of the "Max new tokens" slider.
MAX_NEW_TOKENS = 4096
# Initial value of the "Temperature" slider.
DEFAULT_TEMPERATURE = 0.1
# Initial value of the "Max new tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 2048
# NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed.
MAX_INPUT_TOKEN_LENGTH = 4000
# Lower bound of the "Max Character" slider; the slider's upper bound is five times this value.
DEFAULT_CHAR_LENGTH = 1000
def loading_file():
    """Return the fixed status string ``"Loading..."``."""
    status_message = "Loading..."
    return status_message
def _extract_youtube_video_id(video_link):
    """Return the video id contained in a YouTube URL.

    Handles ``watch?v=<id>`` links (with ``v`` anywhere in the query string),
    ``youtu.be/<id>`` short links and ``/shorts/<id>``, ``/embed/<id>``,
    ``/live/<id>`` and ``/v/<id>`` paths.

    Raises:
        ValueError: if no video id can be found in ``video_link``.
    """
    from urllib.parse import parse_qs, urlparse

    link = video_link.strip()
    # urlparse only recognises the host when a scheme is present.
    if "://" not in link:
        link = "https://" + link
    parsed = urlparse(link)

    # Standard form: the id is the value of the ``v`` query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids and query_ids[0]:
        return query_ids[0]

    # Path-based forms.
    segments = [part for part in parsed.path.split("/") if part]
    host = parsed.netloc.lower()
    if segments and host.endswith("youtu.be"):
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]

    raise ValueError(f"Could not find a YouTube video id in link: {video_link!r}")


def get_text_from_youtube_link(video_link, max_video_length=800):
    """Fetch a YouTube video's transcript and return it as one string.

    Args:
        video_link: URL of the video (``watch?v=``, ``youtu.be``, ``shorts``,
            ``embed`` and ``live`` forms are accepted).
        max_video_length: maximum number of characters of transcript text
            to return; longer transcripts are truncated to this length.

    Returns:
        The caption texts joined together, each one preceded by a single
        space, cut to at most ``max_video_length`` characters.

    Raises:
        ValueError: if no video id can be extracted from ``video_link``.
        Whatever ``YouTubeTranscriptApi.get_transcript`` raises when the
        transcript cannot be retrieved.
    """
    video_id = _extract_youtube_video_id(video_link)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # Every caption keeps its leading space so the output (and therefore the
    # truncation point) matches the previous "+=" implementation exactly;
    # entries without text contribute only that space instead of failing.
    video_text = "".join(f" {entry.get('text') or ''}" for entry in transcript)
    # Slicing is a no-op for shorter strings, so no length check is needed.
    return video_text[:max_video_length]
def process_documents(documents, data_chunk=1500, chunk_overlap=100):
    """Split ``documents`` into chunks with a newline-separated CharacterTextSplitter.

    Args:
        documents: documents to split, passed to ``split_documents``.
        data_chunk: value used as the splitter's ``chunk_size``.
        chunk_overlap: value used as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for
        ``documents``.
    """
    splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
def process_youtube_link(link, document_name="youtube-content"):
    """Wrap the transcript of a YouTube video in a single-element Document list.

    Args:
        link: URL of the YouTube video.
        document_name: base name recorded in the document metadata as
            ``"<document_name>.txt"`` under the ``"source"`` key.

    Returns:
        A list holding one ``Document`` whose ``page_content`` is the text
        returned by ``get_text_from_youtube_link``, or ``None`` when anything
        goes wrong while building it. Callers must check for ``None``.
    """
    try:
        metadata = {"source": f"{document_name}.txt"}
        transcript_text = get_text_from_youtube_link(video_link=link)
        return [Document(page_content=transcript_text, metadata=metadata)]
    except Exception as err:
        # Best-effort by design: failures are reported through the log and a
        # None result. logger.exception keeps the traceback, which the plain
        # error call used to drop.
        logger.exception('Error in reading document. %s', err)
        return None
def youtube_chat(youtube_link, API_key, llm='HuggingFace', temperature=0.1, max_tokens=1096, char_length=1500):
    """Index a YouTube transcript and build the retrieval QA chain for it.

    On success the module-level globals ``vector_db`` (FAISS index over the
    transcript chunks) and ``qa`` (RetrievalQA chain using that index) are
    (re)assigned.

    Args:
        youtube_link: URL of the video whose transcript should be indexed.
        API_key: key passed to ``chatops.chat_application``.
        llm: LLM service name passed to ``chatops.chat_application``.
        temperature: sampling temperature passed to ``chatops.chat_application``.
        max_tokens: token limit passed to ``chatops.chat_application``.
        char_length: chunk size used when splitting the transcript. This
            parameter used to be accepted but ignored; its default equals
            the default chunk size of ``process_documents``, so calls that
            rely on defaults behave as before.

    Returns:
        A status string: the completion message on success, or a failure
        message when the transcript could not be loaded.
    """
    global vector_db, qa

    document = process_youtube_link(link=youtube_link)
    if not document:
        # process_youtube_link returns None on failure; stop here instead of
        # crashing inside the text splitter with an unrelated error.
        logger.error('No transcript document could be built for link: %s', youtube_link)
        return "Youtube link Processing failed, please check the link and try again ..."

    embedding_model = SentenceTransformerEmbeddings(
        model_name='thenlper/gte-base',
        model_kwargs={"device": DEVICE},
    )
    texts = process_documents(documents=document, data_chunk=char_length)
    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)

    chat_llm = chatops.chat_application(
        llm_service=llm,
        key=API_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    qa = RetrievalQA.from_chain_type(
        llm=chat_llm,
        chain_type='stuff',
        retriever=vector_db.as_retriever(),
        return_source_documents=True,
    )
    return "Youtube link Processing completed ..."
# Page-level CSS passed to gr.Blocks: caps the width of the element with id
# "col-container" and centres it horizontally.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML banner rendered through gr.HTML. Spelling and grammar of the
# user-facing copy corrected ("YouTube", "LLMs", "proceeding").
title = """
<div style="text-align: center;max-width: 700px;">
<h1>Chat on YouTube video data • OpenAI/HuggingFace</h1>
<p style="text-align: center;">Upload a YouTube link to create its captions and load them as embeddings <br />
once status is ready, you can start asking questions about the content you uploaded.<br />
The repo provides you an option to use HuggingFace/OpenAI as LLMs; make sure to add your API Key before proceeding.
</p>
</div>
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.HTML(title)
with gr.Group():
chatbot = gr.Chatbot(height=300)
with gr.Row():
question = gr.Textbox(label="Type your question !",lines=1).style(full_width=True)
submit_btn = gr.Button(value="Send message", variant="primary", scale = 1)
clean_chat_btn = gr.Button("Delete Chat")
with gr.Column():
with gr.Box():
LLM_option = gr.Dropdown(['HuggingFace','OpenAI'],label='Large Language Model Selection',info='LLM Service')
API_key = gr.Textbox(label="Add API key", type="password",autofocus=True)
with gr.Accordion(label='Advanced options', open=False):
max_new_tokens = gr.Slider(
label='Max new tokens',
minimum=2048,
maximum=MAX_NEW_TOKENS,
step=1,
value=DEFAULT_MAX_NEW_TOKENS,
)
temperature = gr.Slider(
label='Temperature',
minimum=0.1,
maximum=4.0,
step=0.1,
value=DEFAULT_TEMPERATURE,
)
char_length = gr.Slider(
label='Max Character',
minimum= DEFAULT_CHAR_LENGTH,
maximum = 5*DEFAULT_CHAR_LENGTH,
step = 500,
value= 1500
)
with gr.Column():
with gr.Box():
add_link = gr.Textbox(label="Add your you tube Link",text_align='left',autofocus=True)