podsni LaoCzi committed on
Commit
cd4446f
0 Parent(s):

Duplicate from LaoCzi/YouTube_Summarize2

Browse files

Co-authored-by: Alex <LaoCzi@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. ap.py +113 -0
  4. app.py +113 -0
  5. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: YouTube Summarize
3
+ emoji: 👀
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.19.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc
11
+ duplicated_from: LaoCzi/YouTube_Summarize2
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
ap.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from dotenv import load_dotenv
5
+ from langchain.embeddings.openai import OpenAIEmbeddings
6
+ from langchain.vectorstores.faiss import FAISS
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.llms import OpenAI
9
+ from langchain.chains import ChatVectorDBChain
10
+ from langchain.prompts import PromptTemplate
11
+ from pathlib import Path
12
+ import os
13
+ import openai
14
+ import gradio as gr
15
+
16
# Load .env settings first so the key lookup below can see them.
load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_KEY")

# Prompt that turns a follow-up question plus the chat history into a
# standalone question.
_template = (
    " Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Prompt used to answer a question from the retrieved transcript excerpts.
template = (
    "You are an AI version of the youtuber {name} .\n"
    "You are given the following extracted parts of a long document and a question. Provide a conversational answer.\n"
    "Question: {question}\n"
    "=========\n"
    "{context}\n"
    "=========\n"
    "Answer:"
)
QA_PROMPT = PromptTemplate(
    input_variables=["question", "context", "name"],
    template=template,
)

# Sample video id and the (empty) name substituted for {name} in QA_PROMPT.
video1 = "ReeLQR7KCcM"
youtuberName = ""
40
+
41
def gpt_api(input_text):
    """Return a ``text-davinci-003`` completion for ``input_text``.

    Sends one request to the OpenAI Completions endpoint asking for a single
    choice of at most 300 tokens at temperature 0.6.

    Args:
        input_text: Prompt text sent to the model.

    Returns:
        The text of the first choice with surrounding whitespace stripped.
    """
    completion = openai.Completion.create(
        # The key is stored in OPENAI_KEY, which is not the variable the
        # openai package reads by itself, so hand it over explicitly.
        api_key=OPENAI_KEY,
        engine="text-davinci-003",
        prompt=input_text,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        max_tokens=300,
        n=1,
        stop="",
        temperature=0.6,
    )
    return completion.choices[0].text.strip()
55
+
56
def _extract_video_id(video_url):
    """Return the ``v`` query parameter of a watch URL, or ``None`` if absent."""
    from urllib.parse import parse_qs, urlparse

    # parse_qs drops blank values, so "watch?v=" produces no "v" entry.
    ids = parse_qs(urlparse(video_url.strip()).query).get("v")
    return ids[0] if ids else None


def generate(video_url, question):
    """Answer ``question`` using the English transcript of a YouTube video.

    The transcript is fetched, cut to its first 15000 characters, split into
    chunks, embedded into a FAISS store and queried through a
    ``ChatVectorDBChain`` with an empty chat history.

    Args:
        video_url: A URL containing ``youtube.com/watch?v=<id>``. Extra query
            parameters (for example ``&t=10s``) are allowed.
        question: The question to ask about the video.

    Returns:
        ``""`` when ``video_url`` is empty, ``"Неверный URL"`` when it is not
        a watch URL with a video id, a message starting with
        ``"An error occurred:"`` when the transcript cannot be fetched, and
        otherwise the chain's answer.
    """
    if not video_url:
        return ""
    if "youtube.com/watch?v=" not in video_url:
        return "Неверный URL"

    # Read the id from the query string instead of slicing the last 11
    # characters, which breaks as soon as the URL carries more parameters.
    video_id = _extract_video_id(video_url)
    if video_id is None:
        return "Неверный URL"

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    except Exception as e:
        # Format the exception; "str" + exception would itself raise TypeError.
        return f"An error occurred:{e}"

    transcript_text = " ".join(item["text"] for item in transcript)
    print("Transcript:", transcript_text)
    print("Transcript lenght:", len(transcript_text))
    transcript_text = transcript_text[:15000]

    chunks = CharacterTextSplitter().split_text(transcript_text)

    print("regenerating search index vector store..")
    # The embeddings are created through the OpenAI API on every call.
    vector_store = FAISS.from_texts(chunks, OpenAIEmbeddings(openai_api_key=OPENAI_KEY))
    # NOTE(review): this file is written on every call but never read back
    # here; kept only to preserve the existing on-disk side effect.
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vector_store, f)

    qa = ChatVectorDBChain.from_llm(
        OpenAI(temperature=0, openai_api_key=OPENAI_KEY),
        vectorstore=vector_store,
        qa_prompt=QA_PROMPT,
    )
    response = qa(
        {"name": youtuberName, "question": question, "chat_history": []},
        return_only_outputs=True,
    )
    print("Result:", response["answer"])
    return response["answer"]
102
+ #======================================
103
+
104
+
105
# Page title; spelling matches the Space title declared in the README.
title = "YouTube Summarize (only english video < 15 min)"

# Two text inputs (video URL and question) feed generate(); its return value
# is shown in a single text output.
demo = gr.Interface(
    fn=generate,
    css=".gradio-container {background-color: lightblue}",
    inputs=[
        gr.Textbox(lines=1, label="Video URL"),
        gr.Textbox(lines=1, label="Question", value="What is this video about?"),
    ],
    outputs=[gr.Textbox(lines=4, label="Ответ:")],
    title=title,
)
demo.launch(share=False, debug=True)
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from dotenv import load_dotenv
5
+ from langchain.embeddings.openai import OpenAIEmbeddings
6
+ from langchain.vectorstores.faiss import FAISS
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.llms import OpenAI
9
+ from langchain.chains import ChatVectorDBChain
10
+ from langchain.prompts import PromptTemplate
11
+ from pathlib import Path
12
+ import os
13
+ import openai
14
+ import gradio as gr
15
+
16
# Load .env settings first so the key lookup below can see them.
load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_KEY")

# Prompt that turns a follow-up question plus the chat history into a
# standalone question.
_template = (
    " Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Prompt used to answer a question from the retrieved transcript excerpts.
template = (
    "You are an AI version of the youtuber {name} .\n"
    "You are given the following extracted parts of a long document and a question. Provide a conversational answer.\n"
    "Question: {question}\n"
    "=========\n"
    "{context}\n"
    "=========\n"
    "Answer:"
)
QA_PROMPT = PromptTemplate(
    input_variables=["question", "context", "name"],
    template=template,
)

# Sample video id and the (empty) name substituted for {name} in QA_PROMPT.
video1 = "ReeLQR7KCcM"
youtuberName = ""
40
+
41
def gpt_api(input_text):
    """Return a ``text-davinci-003`` completion for ``input_text``.

    Sends one request to the OpenAI Completions endpoint asking for a single
    choice of at most 300 tokens at temperature 0.6.

    Args:
        input_text: Prompt text sent to the model.

    Returns:
        The text of the first choice with surrounding whitespace stripped.
    """
    completion = openai.Completion.create(
        # The key is stored in OPENAI_KEY, which is not the variable the
        # openai package reads by itself, so hand it over explicitly.
        api_key=OPENAI_KEY,
        engine="text-davinci-003",
        prompt=input_text,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        max_tokens=300,
        n=1,
        stop="",
        temperature=0.6,
    )
    return completion.choices[0].text.strip()
55
+
56
def _extract_video_id(video_url):
    """Return the ``v`` query parameter of a watch URL, or ``None`` if absent."""
    from urllib.parse import parse_qs, urlparse

    # parse_qs drops blank values, so "watch?v=" produces no "v" entry.
    ids = parse_qs(urlparse(video_url.strip()).query).get("v")
    return ids[0] if ids else None


def generate(video_url, question):
    """Answer ``question`` using the English transcript of a YouTube video.

    The transcript is fetched, cut to its first 15000 characters, split into
    chunks, embedded into a FAISS store and queried through a
    ``ChatVectorDBChain`` with an empty chat history.

    Args:
        video_url: A URL containing ``youtube.com/watch?v=<id>``. Extra query
            parameters (for example ``&t=10s``) are allowed.
        question: The question to ask about the video.

    Returns:
        ``""`` when ``video_url`` is empty, ``"Неверный URL"`` when it is not
        a watch URL with a video id, a message starting with
        ``"An error occurred:"`` when the transcript cannot be fetched, and
        otherwise the chain's answer.
    """
    if not video_url:
        return ""
    if "youtube.com/watch?v=" not in video_url:
        return "Неверный URL"

    # Read the id from the query string instead of slicing the last 11
    # characters, which breaks as soon as the URL carries more parameters.
    video_id = _extract_video_id(video_url)
    if video_id is None:
        return "Неверный URL"

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    except Exception as e:
        # Format the exception; "str" + exception would itself raise TypeError.
        return f"An error occurred:{e}"

    transcript_text = " ".join(item["text"] for item in transcript)
    print("Transcript:", transcript_text)
    print("Transcript lenght:", len(transcript_text))
    transcript_text = transcript_text[:15000]

    chunks = CharacterTextSplitter().split_text(transcript_text)

    print("regenerating search index vector store..")
    # The embeddings are created through the OpenAI API on every call.
    vector_store = FAISS.from_texts(chunks, OpenAIEmbeddings(openai_api_key=OPENAI_KEY))
    # NOTE(review): this file is written on every call but never read back
    # here; kept only to preserve the existing on-disk side effect.
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vector_store, f)

    qa = ChatVectorDBChain.from_llm(
        OpenAI(temperature=0, openai_api_key=OPENAI_KEY),
        vectorstore=vector_store,
        qa_prompt=QA_PROMPT,
    )
    response = qa(
        {"name": youtuberName, "question": question, "chat_history": []},
        return_only_outputs=True,
    )
    print("Result:", response["answer"])
    return response["answer"]
102
+ #======================================
103
+
104
+
105
# Page title; spelling matches the Space title declared in the README.
title = "YouTube Summarize (only english video < 15 min)"

# Two text inputs (video URL and question) feed generate(); its return value
# is shown in a single text output.
demo = gr.Interface(
    fn=generate,
    css=".gradio-container {background-color: lightblue}",
    inputs=[
        gr.Textbox(lines=1, label="Video URL"),
        gr.Textbox(lines=1, label="Question", value="What is this video about?"),
    ],
    outputs=[gr.Textbox(lines=4, label="Ответ:")],
    title=title,
)
demo.launch(share=False, debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain==0.0.82
2
+ openai==0.26.4
3
+ python-dotenv==0.21.1
4
+ streamlit==1.12.0
5
+ youtube_transcript_api==0.5.0
6
+ faiss-cpu==1.7.3