Samarth991 committed
Commit
2b89dc1
1 Parent(s): d1cd7f1

adding YouTube processing LLM

Files changed (3)
  1. app.py +127 -0
  2. chatops.py +23 -0
  3. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,127 @@
+import os
+import time
+import logging
+
+import gradio as gr
+from langchain.document_loaders import PDFMinerLoader, CSVLoader, UnstructuredWordDocumentLoader, TextLoader, OnlinePDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import FAISS
+from langchain import HuggingFaceHub
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain.docstore.document import Document
+from youtube_transcript_api import YouTubeTranscriptApi
+
+import chatops  # flat Space layout; the relative "from . import chatops" fails when app.py runs as a script
+
+logger = logging.getLogger(__name__)
+
+DEVICE = 'cpu'
+MAX_NEW_TOKENS = 4096
+DEFAULT_TEMPERATURE = 0.1
+DEFAULT_MAX_NEW_TOKENS = 2048
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_CHAR_LENGTH = 1000
+
+
+def loading_file():
+    return "Loading..."
+
+
+def get_text_from_youtube_link(video_link, max_video_length=800):
+    # Fetch the transcript for the video id embedded in the link and
+    # concatenate it, truncated to max_video_length characters.
+    video_text = ""
+    video_id = video_link.split("watch?v=")[1].split("&")[0]
+    srt = YouTubeTranscriptApi.get_transcript(video_id)
+    for text_data in srt:
+        video_text = video_text + " " + text_data.get("text")
+    if len(video_text) > max_video_length:
+        return video_text[0:max_video_length]
+    else:
+        return video_text
+
+
+def process_documents(documents, data_chunk=1500, chunk_overlap=100):
+    text_splitter = CharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap, separator='\n')
+    texts = text_splitter.split_documents(documents)
+    return texts
+
+
+def process_youtube_link(link, document_name="youtube-content"):
+    try:
+        metadata = {"source": f"{document_name}.txt"}
+        return [Document(page_content=get_text_from_youtube_link(video_link=link), metadata=metadata)]
+    except Exception as err:
+        logger.error(f'Error in reading document. {err}')
+        return []  # return an empty list rather than None so downstream splitting does not crash
+
+
+def youtube_chat(youtube_link, API_key, llm='HuggingFace', temperature=0.1, max_tokens=1096, char_length=1500):
+    document = process_youtube_link(link=youtube_link)
+    embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-base', model_kwargs={"device": DEVICE})
+    texts = process_documents(documents=document)
+    global vector_db
+    vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)
+    global qa
+    qa = RetrievalQA.from_chain_type(llm=chatops.chat_application(llm_service=llm, key=API_key,
+                                                                  temperature=temperature,
+                                                                  max_tokens=max_tokens),
+                                     chain_type='stuff',
+                                     retriever=vector_db.as_retriever(),
+                                     # chain_type_kwargs=chain_type_kwargs,
+                                     return_source_documents=True)
+    return "YouTube link processing completed ..."
+
+
+css = """
+#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
+"""
+
+title = """
+<div style="text-align: center;max-width: 700px;">
+    <h1>Chat on YouTube video data • OpenAI/HuggingFace</h1>
+    <p style="text-align: center;">Upload a YouTube link to fetch its captions and load them as embeddings.<br />
+    Once the status is ready, you can start asking questions about the content you uploaded.<br />
+    The repo lets you use HuggingFace or OpenAI as the LLM; make sure to add your API key before proceeding.
+    </p>
+</div>
+"""
+
+with gr.Blocks(css=css) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.HTML(title)
+
+        with gr.Group():
+            chatbot = gr.Chatbot(height=300)
+            with gr.Row():
+                question = gr.Textbox(label="Type your question!", lines=1).style(full_width=True)
+                submit_btn = gr.Button(value="Send message", variant="primary", scale=1)
+            clean_chat_btn = gr.Button("Delete Chat")
+
+        with gr.Column():
+            with gr.Box():
+                LLM_option = gr.Dropdown(['HuggingFace', 'OpenAI'], label='Large Language Model Selection', info='LLM Service')
+                API_key = gr.Textbox(label="Add API key", type="password", autofocus=True)
+                with gr.Accordion(label='Advanced options', open=False):
+                    max_new_tokens = gr.Slider(
+                        label='Max new tokens',
+                        minimum=2048,
+                        maximum=MAX_NEW_TOKENS,
+                        step=1,
+                        value=DEFAULT_MAX_NEW_TOKENS,
+                    )
+                    temperature = gr.Slider(
+                        label='Temperature',
+                        minimum=0.1,
+                        maximum=4.0,
+                        step=0.1,
+                        value=DEFAULT_TEMPERATURE,
+                    )
+                    char_length = gr.Slider(
+                        label='Max Character',
+                        minimum=DEFAULT_CHAR_LENGTH,
+                        maximum=5 * DEFAULT_CHAR_LENGTH,
+                        step=500,
+                        value=1500,
+                    )
+
+        with gr.Column():
+            with gr.Box():
+                add_link = gr.Textbox(label="Add your YouTube link", text_align='left', autofocus=True)
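Note that the committed app.py ends at the add_link textbox: no event handlers connect the controls to youtube_chat, and demo.launch() is never called, so the Space cannot answer questions yet. A hedged sketch of the missing glue, meant to sit at the end of the gr.Blocks context; the helpers add_user_turn and answer_question are assumptions, not code from this commit:

# Hypothetical event wiring; not part of this commit.
def add_user_turn(user_message, history):
    # Append the user's question to the chat history and clear the textbox.
    return "", history + [[user_message, None]]

def answer_question(history):
    # Query the global RetrievalQA chain that youtube_chat builds.
    result = qa({"query": history[-1][0]})
    history[-1][1] = result["result"]
    return history

status = gr.Textbox(label="Status", interactive=False)
add_link.submit(youtube_chat,
                inputs=[add_link, API_key, LLM_option, temperature, max_new_tokens, char_length],
                outputs=[status])
submit_btn.click(add_user_turn, [question, chatbot], [question, chatbot]).then(
    answer_question, chatbot, chatbot)
clean_chat_btn.click(lambda: None, None, chatbot)

A demo.launch() call would then follow outside the with block.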
chatops.py ADDED
@@ -0,0 +1,23 @@
+import os
+
+
+def get_openai_chat_model(API_key):
+    try:
+        from langchain.llms import OpenAI
+    except ImportError as err:
+        # raising a bare string is a TypeError in Python 3; raise a proper exception
+        raise ImportError(f"{err}, unable to load OpenAI. Please install openai and set OPENAI_API_KEY")
+    os.environ["OPENAI_API_KEY"] = API_key
+    llm = OpenAI()
+    return llm
+
+
+def get_hugging_face_model(model_id, API_key, temperature=0.1, max_tokens=4096):
+    try:
+        from langchain import HuggingFaceHub
+    except ImportError as err:
+        # the original message referred to OpenAI here; this path needs langchain and a HuggingFace Hub token
+        raise ImportError(f"{err}, unable to load HuggingFaceHub. Please install langchain and provide a HuggingFace Hub API token")
+    chat_llm = HuggingFaceHub(huggingfacehub_api_token=API_key,
+                              repo_id=model_id,
+                              model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens})
+    return chat_llm
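app.py calls chatops.chat_application(llm_service=..., key=..., temperature=..., max_tokens=...), but this commit's chatops.py defines only the two loaders above, so processing a link would fail with an AttributeError. A minimal dispatcher sketch built on those loaders; the default HuggingFace repo id is a placeholder, not taken from this commit:

# Hypothetical chatops.chat_application; not part of this commit.
DEFAULT_HF_MODEL = "tiiuae/falcon-7b-instruct"  # placeholder repo id (assumption)

def chat_application(llm_service='HuggingFace', key=None, temperature=0.1, max_tokens=1096):
    # Route the app's dropdown choice to the matching loader above.
    if llm_service == 'OpenAI':
        return get_openai_chat_model(API_key=key)
    return get_hugging_face_model(model_id=DEFAULT_HF_MODEL,
                                  API_key=key,
                                  temperature=temperature,
                                  max_tokens=max_tokens)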
requirements.txt ADDED
@@ -0,0 +1,11 @@
+openai
+tiktoken
+chromadb
+langchain
+unstructured
+unstructured[local-inference]
+transformers
+torch
+faiss-cpu
+sentence-transformers
+youtube-transcript-api
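With chat_application filled in as sketched above (and demo.launch() guarded by if __name__ == '__main__'), the pipeline can also be exercised without the UI. A minimal driver sketch; the video URL and token are placeholders:

# Hypothetical driver script; assumes the dispatcher sketched above exists.
import app

status = app.youtube_chat(youtube_link="https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
                          API_key="hf_...",  # placeholder HuggingFace Hub token
                          llm='HuggingFace')
print(status)

# youtube_chat stores the chain in the module-level global `qa`; with
# return_source_documents=True, RetrievalQA returns the answer plus source chunks.
result = app.qa({"query": "What is this video about?"})
print(result["result"])
print([doc.metadata for doc in result["source_documents"]])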