ucalyptus committed on
Commit
53ffbd2
1 Parent(s): 5912886

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -40
app.py CHANGED
@@ -12,56 +12,48 @@ from langchain import OpenAI
12
  from langchain.vectorstores.base import VectorStoreRetriever
13
  import os
14
 
15
- video_data_cache = {}
16
-
17
  def get_answer(api_key, video_link, question):
18
  os.environ["OPENAI_API_KEY"] = api_key
19
 
20
- if video_link not in video_data_cache:
21
- video = pytube.YouTube(video_link)
22
- audio = video.streams.get_audio_only()
23
- fn = audio.download(output_path="tmp.mp3")
24
- model = whisper.load_model("base")
25
- transcription = model.transcribe(fn)
26
- res = transcription['text']
27
-
28
- def store_segments(text):
29
- segment_size = 1000
30
- segments = [{'text': text[i:i+segment_size], 'start': i} for i in range(0, len(text), segment_size)]
31
 
32
- texts = []
33
- start_times = []
 
34
 
35
- for segment in segments:
36
- text = segment['text']
37
- start = segment['start']
38
 
39
- start_datetime = datetime.fromtimestamp(start)
40
- formatted_start_time = start_datetime.strftime('%H:%M:%S')
 
41
 
42
- texts.append(text)
43
- start_times.append(formatted_start_time)
44
 
45
- return texts, start_times
 
46
 
47
- texts, start_times = store_segments(res)
48
 
49
- text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
50
- docs = []
51
- metadatas = []
52
- for i, d in enumerate(texts):
53
- splits = text_splitter.split_text(d)
54
- docs.extend(splits)
55
- metadatas.extend([{"source": start_times[i]}] * len(splits))
56
 
57
- embeddings = OpenAIEmbeddings()
58
- store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
59
- faiss.write_index(store.index, f"docs.index")
 
 
 
 
60
 
61
- video_data_cache[video_link] = f"docs.index"
62
-
63
- index_file = video_data_cache[video_link]
64
- store = faiss.read_index(index_file)
65
 
66
  retri = VectorStoreRetriever(vectorstore=store)
67
 
@@ -71,13 +63,12 @@ def get_answer(api_key, video_link, question):
71
 
72
  return result['answer'], result['sources']
73
 
74
-
75
  iface = gr.Interface(
76
  fn=get_answer,
77
  inputs=["text", "text", "text"],
78
  outputs=["text", "text"],
79
  examples=[
80
- ["sk-kVc5h5YtNXyD6WxUd4aSxIyWuGc", "https://www.youtube.com/watch?v=xNAm9O_duSA", "Who could be the next Prime Minister ?"]
81
  ],
82
  )
83
 
 
12
  from langchain.vectorstores.base import VectorStoreRetriever
13
  import os
14
 
 
 
15
  def get_answer(api_key, video_link, question):
16
  os.environ["OPENAI_API_KEY"] = api_key
17
 
18
+ video = pytube.YouTube(video_link)
19
+ audio = video.streams.get_audio_only()
20
+ fn = audio.download(output_path="tmp.mp3")
21
+ model = whisper.load_model("base")
22
+ transcription = model.transcribe(fn)
23
+ res = transcription['text']
 
 
 
 
 
24
 
25
+ def store_segments(text):
26
+ segment_size = 1000
27
+ segments = [{'text': text[i:i+segment_size], 'start': i} for i in range(0, len(text), segment_size)]
28
 
29
+ texts = []
30
+ start_times = []
 
31
 
32
+ for segment in segments:
33
+ text = segment['text']
34
+ start = segment['start']
35
 
36
+ start_datetime = datetime.fromtimestamp(start)
37
+ formatted_start_time = start_datetime.strftime('%H:%M:%S')
38
 
39
+ texts.append(text)
40
+ start_times.append(formatted_start_time)
41
 
42
+ return texts, start_times
43
 
44
+ texts, start_times = store_segments(res)
 
 
 
 
 
 
45
 
46
+ text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
47
+ docs = []
48
+ metadatas = []
49
+ for i, d in enumerate(texts):
50
+ splits = text_splitter.split_text(d)
51
+ docs.extend(splits)
52
+ metadatas.extend([{"source": start_times[i]}] * len(splits))
53
 
54
+ embeddings = OpenAIEmbeddings()
55
+ store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
56
+ faiss.write_index(store.index, "docs.index")
 
57
 
58
  retri = VectorStoreRetriever(vectorstore=store)
59
 
 
63
 
64
  return result['answer'], result['sources']
65
 
 
66
  iface = gr.Interface(
67
  fn=get_answer,
68
  inputs=["text", "text", "text"],
69
  outputs=["text", "text"],
70
  examples=[
71
+ [os.environ["OPENAI_API_KEY"], "https://www.youtube.com/watch?v=xNAm9O_duSA", "Who could be the next Prime Minister ?"]
72
  ],
73
  )
74