import os
import gradio as gr
import time
import logging

from langchain.document_loaders import (
    PDFMinerLoader,
    CSVLoader,
    UnstructuredWordDocumentLoader,
    TextLoader,
    OnlinePDFLoader,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi

from . import chatops

logger = logging.getLogger(__name__)

DEVICE = 'cpu'
MAX_NEW_TOKENS = 4096
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = 4000
DEFAULT_CHAR_LENGTH = 1000


def loading_file():
    return "Loading..."


def get_text_from_youtube_link(video_link, max_video_length=800):
    """Fetch the transcript of a YouTube video and return at most
    `max_video_length` characters of it."""
    video_id = video_link.split("watch?v=")[1].split("&")[0]
    srt = YouTubeTranscriptApi.get_transcript(video_id)
    video_text = " ".join(text_data.get("text", "") for text_data in srt)
    return video_text[:max_video_length]


def process_documents(documents, data_chunk=1500, chunk_overlap=100):
    """Split documents into overlapping chunks for embedding."""
    text_splitter = CharacterTextSplitter(
        chunk_size=data_chunk, chunk_overlap=chunk_overlap, separator='\n'
    )
    return text_splitter.split_documents(documents)


def process_youtube_link(link, document_name="youtube-content"):
    """Wrap the video transcript in a LangChain Document."""
    try:
        metadata = {"source": f"{document_name}.txt"}
        return [Document(page_content=get_text_from_youtube_link(video_link=link),
                         metadata=metadata)]
    except Exception as err:
        logger.error(f"Error in reading document. {err}")
        # Return an empty list rather than the implicit None so callers
        # receive a valid (if empty) document list.
        return []


def youtube_chat(youtube_link, API_key, llm='HuggingFace', temperature=0.1,
                 max_tokens=1096, char_length=1500):
    """Build a FAISS index over the video transcript and wire up a
    RetrievalQA chain against it."""
    document = process_youtube_link(link=youtube_link)
    embedding_model = SentenceTransformerEmbeddings(
        model_name='thenlper/gte-base', model_kwargs={"device": DEVICE}
    )
    # Pass char_length through so the caller-supplied chunk size is honoured
    # (it was previously accepted but ignored).
    texts = process_documents(documents=document, data_chunk=char_length)

    global vector_db
    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)

    global qa
    qa = RetrievalQA.from_chain_type(
        llm=chatops.chat_application(llm_service=llm, key=API_key,
                                     temperature=temperature,
                                     max_tokens=max_tokens),
        chain_type='stuff',
        retriever=vector_db.as_retriever(),
        # chain_type_kwargs=chain_type_kwargs,
        return_source_documents=True,
    )
    return "YouTube link processing completed..."


css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""
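
# The RetrievalQA chain stored in the global `qa` is built above but never
# queried in this part of the file. Below is a minimal sketch of how it could
# be queried once `youtube_chat` has run; the helper name `ask_question` is
# hypothetical and not part of the original module.
def ask_question(question):
    """Run a query through the global RetrievalQA chain and return the answer
    text together with the source documents it was grounded on."""
    result = qa({"query": question})
    return result.get("result"), result.get("source_documents")
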
title = """
Upload a YouTube link to fetch its captions and load them as embeddings.
Once the status is ready, you can start asking questions about the uploaded content.
The repo gives you the option to use HuggingFace or OpenAI as the LLM; make sure to add your API key before proceeding.