Rauhan committed on
Commit
ac9adab
1 Parent(s): e6741bc

UPDATE: YT Transcripts

Browse files
Files changed (3) hide show
  1. app.py +5 -1
  2. functions.py +13 -1
  3. requirements.txt +1 -0
app.py CHANGED
@@ -155,4 +155,8 @@ async def getCount(vectorstore: str):
155
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
156
  return {
157
  "currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
158
- }
 
 
 
 
 
155
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
156
  return {
157
  "currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
158
+ }
159
+
160
@app.post("/getYoutubeTranscript")
async def getYTTranscript(url: str):
    """Return the transcript for the YouTube video identified by ``url``.

    The work is delegated to ``getTranscript``; its result is returned
    to the client unchanged.
    """
    transcript = getTranscript(url=url)
    return transcript
functions.py CHANGED
@@ -10,6 +10,7 @@ from langchain_core.runnables.history import RunnableWithMessageHistory
10
  from langchain.memory import ChatMessageHistory
11
  from langchain_core.chat_history import BaseChatMessageHistory
12
  from langchain.storage import InMemoryStore
 
13
  from langchain.docstore.document import Document
14
  from langchain_huggingface import HuggingFaceEmbeddings
15
  from langchain.retrievers import ContextualCompressionRetriever
@@ -294,4 +295,15 @@ def getTextFromImagePDF(pdfBytes):
294
  allImages = convert_from_bytes(pdfBytes)
295
  allImages = [np.array(image) for image in allImages]
296
  text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
297
- return text
 
 
 
 
 
 
 
 
 
 
 
 
10
  from langchain.memory import ChatMessageHistory
11
  from langchain_core.chat_history import BaseChatMessageHistory
12
  from langchain.storage import InMemoryStore
13
+ from langchain_community.document_loaders import YoutubeLoader
14
  from langchain.docstore.document import Document
15
  from langchain_huggingface import HuggingFaceEmbeddings
16
  from langchain.retrievers import ContextualCompressionRetriever
 
295
  allImages = convert_from_bytes(pdfBytes)
296
  allImages = [np.array(image) for image in allImages]
297
  text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
298
+ return text
299
+
300
+
301
def getTranscript(url: str):
    """Load a YouTube video's transcript and return it as one string.

    Args:
        url: URL of the YouTube video, passed to
            ``YoutubeLoader.from_youtube_url``.

    Returns:
        The ``page_content`` of every loaded document joined with single
        spaces, or the literal ``"ENGLISH TRANSCRIPT UNAVAILABLE"`` when
        loading the transcript raises an exception.
    """
    import logging

    loader = YoutubeLoader.from_youtube_url(url, add_video_info=False)
    try:
        return " ".join(document.page_content for document in loader.load())
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, and record the
        # cause instead of discarding it: the fallback text alone does not
        # say why loading failed.
        logging.getLogger(__name__).exception(
            "Failed to load YouTube transcript for %s", url
        )
        return "ENGLISH TRANSCRIPT UNAVAILABLE"
requirements.txt CHANGED
@@ -18,6 +18,7 @@ python-dotenv
18
  pydantic
19
  pandas
20
  easyocr
 
21
  pdf2image
22
  sentence-transformers
23
  supabase
 
18
  pydantic
19
  pandas
20
  easyocr
21
+ youtube-transcript-api
22
  pdf2image
23
  sentence-transformers
24
  supabase