AlbertoFH98 committed on
Commit
6996634
1 Parent(s): b52e154

Load utils + javascript files

Browse files
Files changed (3) hide show
  1. my_functions.js +24 -0
  2. script.js +27 -0
  3. utils.py +244 -0
my_functions.js ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Inject the YouTube IFrame API loader ahead of the first <script> on the page.
var tag = document.createElement('script');
tag.src = 'https://www.youtube.com/iframe_api';
var firstScriptTag = document.getElementsByTagName('script')[0];
firstScriptTag.parentNode.insertBefore(tag, firstScriptTag);

var player, seconds = 0;

// Start playback as soon as the player reports it is ready.
function onPlayerReady(event) {
    event.target.playVideo();
}

// Wrap the '#player' iframe in a YT.Player and jump it to `sec` seconds.
// NOTE(review): a fresh YT.Player is constructed on every call and seekTo
// runs immediately, before 'onReady' fires — preserved as-is.
function seek(sec) {
    var doc = document;
    var frame = doc.getElementById('player');
    console.log("iframe");
    console.log(frame);
    player = new YT.Player(frame, {
        events: {
            'onReady': onPlayerReady
        }
    });
    if (player) {
        player.seekTo(sec, true);
    }
}
script.js ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Load the YouTube IFrame API by inserting its <script> before the first one.
var tag = document.createElement('script');
tag.src = 'https://www.youtube.com/iframe_api';
var firstScriptTag = document.getElementsByTagName('script')[0];
firstScriptTag.parentNode.insertBefore(tag, firstScriptTag);

// Callback invoked by the IFrame API once it has finished loading:
// attach a YT.Player to the '#player' iframe.
function onYouTubeIframeAPIReady() {
    var iframe = document.getElementById('player');
    player = new YT.Player(iframe, {
        events: {
            'onReady': onPlayerReady
        }
    });
}

// Auto-play once the player signals readiness.
function onPlayerReady(event) {
    event.target.playVideo();
}

// Eager top-level construction as well (mirrors the API-ready callback).
// NOTE(review): `player` is assigned without var/let — an implicit global.
var iframe = document.getElementById('player');
player = new YT.Player(iframe, {
    events: {
        'onReady': onPlayerReady
    }
});

// Jump the shared player to `sec` seconds, if it exists.
function seek(sec) {
    if (player) {
        player.seekTo(sec, true);
    }
}
utils.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -- Utils .py file
# -- Libraries
from typing import Any, Dict, List, Mapping, Optional
from pydantic import Extra, Field, root_validator
from langchain.llms.base import LLM
from langchain.utils import get_from_dict_or_env
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from googletrans import Translator
import streamlit as st
import together
import textwrap
import spacy
import os
import re

# SECURITY: a live-looking API key is committed here in plain text. It should
# be revoked and injected via the deployment environment instead of source.
# setdefault() keeps a key already present in the environment from being
# clobbered while preserving the original fallback behaviour for this app.
os.environ.setdefault("TOGETHER_API_KEY", "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5")
# -- LLM class
class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    original_transcription: str = ""
    """Original transcription"""

    class Config:
        extra = Extra.forbid

    # NOTE(review): the validator decorator is commented out, so this method is
    # never invoked by pydantic — confirm whether validation should be active.
    #@root_validator(skip_on_failure=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def clean_duplicates(self, transcription: str) -> str:
        """Return *transcription* with duplicated '\\n\\n'-separated paragraphs
        removed, keeping first occurrences in order."""
        # NOTE(review): '/n/n ' uses forward slashes — likely a typo for
        # '\n\n ', but preserved since the transcription format is not visible
        # here. Confirm against the actual transcript files.
        transcription = transcription.strip().replace('/n/n ', '\n')
        new_transcription_aux = []
        for text in transcription.split('\n\n'):
            if text not in new_transcription_aux:
                new_transcription_aux.append(text)
        return '\n\n'.join(new_transcription_aux)

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint.

        Rebuilds the CONTEXTO section of *prompt* from the original
        transcription (deduplicated, with speaker/timestamp prefixes
        restored) before sending it to the completion endpoint.
        """
        regex_transcription = r'CONTEXTO:(\n.*)+PREGUNTA'
        regex_init_transcription = r"Desde el instante [0-9]+:[0-9]+:[0-9]+(?:\.[0-9]+)? hasta [0-9]+:[0-9]+:[0-9]+(?:\.[0-9]+)? [a-zA-Z ]+ dice: ?"

        # -- Extract transcription
        together.api_key = self.together_api_key
        cleaned_prompt = self.clean_duplicates(prompt)
        print(cleaned_prompt)
        resultado = re.search(regex_transcription, cleaned_prompt, re.DOTALL)

        resultado = re.sub(regex_init_transcription, "", resultado.group(1).strip()).replace('\"', '')
        resultado_alpha_num = [re.sub(r'\W+', ' ', resultado_aux).strip().lower() for resultado_aux in resultado.split('\n\n')]

        # -- Setup new transcription format, without duplicates and with its correspondent speaker
        new_transcription = []
        for transcription in self.original_transcription.split('\n\n'):
            transcription_cleaned = re.sub(regex_init_transcription, "", transcription.strip()).replace('\"', '')
            transcription_cleaned = re.sub(r'\W+', ' ', transcription_cleaned).strip().lower()
            for resultado_aux in resultado_alpha_num:
                if resultado_aux in transcription_cleaned or transcription_cleaned in resultado_aux:
                    init_transcription = re.findall(regex_init_transcription, transcription)[0]
                    new_transcription.append(init_transcription + '\"' + resultado_aux + '\"')
        # -- Merge with original transcription
        new_transcription = '\n\n'.join(list(set(new_transcription)))
        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags` —
        # the original passed re.DOTALL (== 16) as a replacement count and the
        # pattern was compiled WITHOUT DOTALL. Pass it via the flags keyword.
        new_cleaned_prompt = re.sub(regex_transcription, f"""CONTEXTO:
{new_transcription}
PREGUNTA:""", cleaned_prompt, flags=re.DOTALL)
        print(new_cleaned_prompt)
        output = together.Complete.create(new_cleaned_prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        return text
# -- Python function to setup basic features: translator, SpaCy pipeline and LLM model
@st.cache_resource
def setup_app(transcription_path, emb_model, model, _logger):
    """Initialise the app's shared resources.

    Parameters
    ----------
    transcription_path : path of the transcription text file to index.
    emb_model : HuggingFace embedding model name.
    model : Together model endpoint to start.
    _logger : logger used for progress messages (underscore keeps it out of
        Streamlit's cache key).

    Returns
    -------
    (translator, nlp, retriever) : Google translator client, Spanish spaCy
        pipeline and a Chroma retriever over the transcription chunks.
    """
    # -- Setup environment and features
    translator = Translator(service_urls=['translate.googleapis.com'])
    nlp = spacy.load('es_core_news_lg')

    _logger.info('Setup environment and features...')

    # -- Setup LLM
    together.api_key = os.environ["TOGETHER_API_KEY"]
    # List available models (kept for its API side effect; result was unused)
    together.Models.list()
    # Start the requested LLM endpoint
    together.Models.start(model)
    _logger.info('Setup environment and features - FINISHED!')

    # -- Read translated transcription
    _logger.info('Loading transcription...')
    loader = TextLoader(transcription_path)
    documents = loader.load()
    # Splitting the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    _logger.info('Loading transcription - FINISHED!')

    # -- Load embedding
    _logger.info('Loading embedding...')
    encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
    model_norm = HuggingFaceEmbeddings(
        model_name=emb_model,
        model_kwargs={'device': 'cpu'},
        encode_kwargs=encode_kwargs
    )
    _logger.info('Loading embedding - FINISHED!')

    # -- Create document database
    _logger.info('Creating document database...')
    # Embed and store the texts
    # Supplying a persist_directory will store the embeddings on disk
    persist_directory = 'db'
    ## Here is the new embeddings being used
    embedding = model_norm

    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

    # -- Make a retriever
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    _logger.info('Creating document database - FINISHED!')
    _logger.info('Setup finished!')
    return translator, nlp, retriever
# -- Function to get prompt template
def get_prompt(instruction, system_prompt, b_sys, e_sys, b_inst, e_inst, _logger):
    """Compose an instruction-style prompt template.

    The system prompt is wrapped in its begin/end markers, then the whole
    thing is wrapped in the instruction begin/end markers:
    b_inst + b_sys + system_prompt + e_sys + instruction + e_inst.
    """
    wrapped_system = b_sys + system_prompt + e_sys
    prompt_template = b_inst + wrapped_system + instruction + e_inst
    _logger.info('Prompt template created: {}'.format(instruction))
    return prompt_template
# -- Function to create the chain to answer questions
@st.cache_resource
def create_llm_chain(model, _retriever, _chain_type_kwargs, _logger, transcription_path):
    """Build a RetrievalQA chain backed by a TogetherLLM.

    Parameters
    ----------
    model : Together model endpoint name.
    _retriever : vector-store retriever (underscore keeps it out of the
        Streamlit cache key).
    _chain_type_kwargs : extra kwargs forwarded to the "stuff" chain.
    _logger : logger for progress messages.
    transcription_path : file whose full text becomes the LLM's
        original_transcription.
    """
    _logger.info('Creating LLM chain...')
    # -- Keep original transcription
    # Explicit encoding: the default is platform-dependent and would break
    # on non-UTF-8 locales (e.g. Windows cp1252) for accented Spanish text.
    with open(transcription_path, 'r', encoding='utf-8') as f:
        formatted_transcription = f.read()

    llm = TogetherLLM(
        model= model,
        temperature = 0.0,
        max_tokens = 1024,
        original_transcription = formatted_transcription
    )
    qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                           chain_type="stuff",
                                           retriever=_retriever,
                                           chain_type_kwargs=_chain_type_kwargs,
                                           return_source_documents=True)
    _logger.info('Creating LLM chain - FINISHED!')
    return qa_chain
# -------------------------------------------
# -- Auxiliary functions
def wrap_text_preserve_newlines(text, width=110):
    """Wrap every line of *text* to *width* columns, keeping the original
    newline structure intact."""
    # Wrap each '\n'-separated line independently, then stitch them back.
    return '\n'.join(
        textwrap.fill(line, width=width) for line in text.split('\n')
    )
def process_llm_response(llm_response, nlp):
    """Extract the 'result' text from an LLM response dict and wrap it.

    ``nlp`` is accepted for interface compatibility with callers but is not
    used by this function.
    """
    return wrap_text_preserve_newlines(llm_response['result'])
def time_to_seconds(time_str):
    """Convert an 'HH:MM:SS(.ffffff)' timestamp to whole seconds (truncated).

    Generalized to also accept 'MM:SS'; any other shape raises ValueError
    (the original raised ValueError from tuple unpacking as well).
    """
    parts = [float(p) for p in time_str.split(':')]
    if len(parts) == 3:
        hours, minutes, seconds = parts
    elif len(parts) == 2:
        hours = 0.0
        minutes, seconds = parts
    else:
        raise ValueError(f"unsupported timestamp format: {time_str!r}")
    return int((hours * 3600) + (minutes * 60) + seconds)
# -- Extract seconds from transcription
def add_hyperlink_and_convert_to_seconds(text):
    """Find the first two 'HH:MM:SS(.ffffff)' timestamps in *text* and return
    (start_str, start_seconds, end_str, end_seconds).

    Raises IndexError when fewer than two timestamps are present (same as the
    original behaviour).
    """
    # BUG FIX: the fractional-seconds dot was an unescaped '.', which matched
    # ANY character (e.g. '12:34:56x123456'); escape it so only a literal dot
    # introduces the microseconds part.
    time_pattern = r'(\d{2}:\d{2}:\d{2}(?:\.\d{6})?)'

    matches = re.findall(time_pattern, text)
    start_time_str, end_time_str = matches[0], matches[1]
    start_time_seconds = time_to_seconds(start_time_str)
    end_time_seconds = time_to_seconds(end_time_str)
    return start_time_str, start_time_seconds, end_time_str, end_time_seconds
# -- Streamlit HTML template
def typewrite(youtube_video_url, i=0):
    """Return an HTML page embedding *youtube_video_url* in an iframe.

    The iframe is given id 'player_{i}' so multiple players can coexist, and
    any '?enablejsapi=1' suffix is stripped from the URL before embedding.
    """
    embed_url = youtube_video_url.replace("?enablejsapi=1", "")
    # Literal CSS braces are doubled inside the f-string.
    html = f"""
    <html>
    <style>
    p {{margin: 0;}}
    </style>
    <body>
    <script src="https://www.youtube.com/player_api"></script>
    <p align="center">
    <iframe id="player_{i}" src="{embed_url}" width="600" height="450"></iframe>
    </p>
    </body>
    </html>
    """
    return html