ofermend committed on
Commit
2d02ed4
1 Parent(s): 0e3c0ad

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +89 -71
  3. header-image-1.png +3 -0
  4. query.py +66 -16
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  header-image-2.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  header-image-2.png filter=lfs diff=lfs merge=lfs -text
37
+ header-image-1.png filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,10 +1,12 @@
1
  from omegaconf import OmegaConf
2
  from query import VectaraQuery
3
  import os
4
- import requests
5
 
6
  import streamlit as st
7
  from PIL import Image
 
 
 
8
 
9
  def inject_custom_css():
10
  st.markdown(
@@ -17,12 +19,18 @@ def inject_custom_css():
17
  color: #333;
18
  }
19
  body {
 
 
 
20
  padding-top: 0px;
 
21
  }
22
  .stApp {
23
- padding-top: 10px;
 
24
  }
25
  .stButton>button {
 
26
  background-color: #4CAF50;
27
  color: white;
28
  padding: 10px 24px;
@@ -61,80 +69,90 @@ def inject_custom_css():
61
  .css-1d391kg { /* This targets the sidebar headings */
62
  color: #333 !important;
63
  }
64
- .form-container {
65
- display: flex;
66
- justify-content: space-between;
67
- align-items: center;
68
- }
69
- .form-container .stTextInput {
70
- flex: 1;
71
- }
72
- .form-container .stButton {
73
- margin-left: 10px;
74
- }
75
  </style>
76
  """,
77
  unsafe_allow_html=True
78
  )
79
 
80
- def launch_bot():
81
-
82
- if 'cfg' not in st.session_state:
83
- cfg = OmegaConf.create({
84
- 'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
85
- 'corpus_id': str(os.environ['VECTARA_CORPUS_ID']),
86
- 'api_key': str(os.environ['VECTARA_API_KEY']),
87
- 'streaming': False
88
- })
89
- st.session_state.cfg = cfg
90
- st.session_state.vq = VectaraQuery(cfg.api_key, cfg.customer_id, [cfg.corpus_id],
91
- "vectara-summary-ext-24-05-large")
92
-
93
- cfg = st.session_state.cfg
94
- vq = st.session_state.vq
95
- st.set_page_config(page_title="Media Demo", layout="wide")
96
- inject_custom_css()
97
-
98
- header_image = Image.open('header-image-2.png')
99
- cropped_image = header_image.crop((0, 0, header_image.width, 200))
100
- st.image(cropped_image, use_column_width=True)
101
-
102
- # left side content
103
- with st.sidebar:
104
- image = Image.open('vectara-logo.png')
105
- st.markdown("## Welcome to Media Demo\n\n"
106
- "This demo uses Vectara to find the movie where a quote is from\n\n"
107
- "Covers movies from this [playlist](https://www.youtube.com/playlist?list=PLHPTxTxtC0ibVZrT2_WKWUl2SAxsKuKwx) of free movies")
108
-
109
- st.markdown("---")
110
- st.markdown(
111
- "## How this works?\n"
112
- "This app was built with [Vectara](https://vectara.com).\n"
113
- )
114
- st.markdown("---")
115
- st.image(image, width=250)
116
-
117
- st.markdown("<center> <h3>\"Find that movie\" demo</h3> </center>", unsafe_allow_html=True)
118
-
119
- st.markdown('<div class="form-container">', unsafe_allow_html=True)
120
- with st.form(key='my_form'):
121
- question = st.text_input("Enter your movie quote:")
122
- submit_button = st.form_submit_button(label='Find the Match')
123
- st.markdown('</div>', unsafe_allow_html=True)
124
-
125
- if submit_button and len(question) > 5:
126
- movie_name, match_url, score = vq.submit_query(question)
127
- if score < 0.7:
128
- st.write("Sorry, I couldn't find a match for that quote. Please try another one.")
129
- else:
130
- video_url, start_time = match_url.split('&t=')
131
- start_time = start_time[:-1] # remove the trailing 's'
132
-
133
- col1, col2, col3 = st.columns([1, 2, 1])
134
- with col2:
135
- st.write(f"Here's a useful video for you: {movie_name}")
136
- st.video(video_url, start_time=int(float(start_time)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  if __name__ == "__main__":
139
- launch_bot()
140
 
 
1
  from omegaconf import OmegaConf
2
  from query import VectaraQuery
3
  import os
 
4
 
5
  import streamlit as st
6
  from PIL import Image
7
+ import concurrent.futures
8
+
9
+ SCORE_THRESHOLD = 0.7
10
 
11
  def inject_custom_css():
12
  st.markdown(
 
19
  color: #333;
20
  }
21
  body {
22
+ font-family: 'Roboto', sans-serif;
23
+ background-color: #f5f5f5;
24
+ color: #333;
25
  padding-top: 0px;
26
+ margin-top: 0px;
27
  }
28
  .stApp {
29
+ padding-top: 0px;
30
+ margin-top: 0px;
31
  }
32
  .stButton>button {
33
+ margin-top: 25px;
34
  background-color: #4CAF50;
35
  color: white;
36
  padding: 10px 24px;
 
69
  .css-1d391kg { /* This targets the sidebar headings */
70
  color: #333 !important;
71
  }
 
 
 
 
 
 
 
 
 
 
 
72
  </style>
73
  """,
74
  unsafe_allow_html=True
75
  )
76
 
77
+ def fetch_summary(vq, matching_text, doc_id):
78
+ return vq.get_summary(matching_text, doc_id)
79
+
80
+ def launch_app():
81
+ with concurrent.futures.ThreadPoolExecutor() as executor:
82
+
83
+ if 'cfg' not in st.session_state:
84
+ cfg = OmegaConf.create({
85
+ 'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
86
+ 'corpus_id': str(os.environ['VECTARA_CORPUS_ID']),
87
+ 'api_key': str(os.environ['VECTARA_API_KEY']),
88
+ 'streaming': False
89
+ })
90
+ st.session_state.cfg = cfg
91
+ st.session_state.vq = VectaraQuery(cfg.api_key, cfg.customer_id, [cfg.corpus_id])
92
+
93
+ cfg = st.session_state.cfg
94
+ vq = st.session_state.vq
95
+ st.set_page_config(page_title="Media Demo", layout="wide")
96
+ inject_custom_css()
97
+
98
+ header_image = Image.open('header-image-2.png')
99
+ cropped_image = header_image.crop((0, 0, header_image.width, 150))
100
+ st.image(cropped_image, use_column_width=True)
101
+
102
+ # left side content
103
+ with st.sidebar:
104
+ image = Image.open('vectara-logo.png')
105
+ st.markdown("## Welcome to Media Demo\n\n"
106
+ "This demo uses Vectara to find the movie where a quote is from.\n\n"
107
+ "Covers movies from this [playlist](https://www.youtube.com/playlist?list=PLHPTxTxtC0ibVZrT2_WKWUl2SAxsKuKwx) of free movies.")
108
+
109
+ st.markdown("---")
110
+ st.markdown(
111
+ "## How this works?\n"
112
+ "This app was built with [Vectara](https://vectara.com).\n"
113
+ )
114
+ st.markdown("---")
115
+ st.image(image, width=250)
116
+
117
+ st.markdown("<center> <h3>\"Where did I hear that line?\"</h3> </center>", unsafe_allow_html=True)
118
+
119
+ _, q_col, _ = st.columns([1, 4, 1])
120
+ with q_col:
121
+ quote = st.text_input("quote", label_visibility="hidden", placeholder="Enter a quote from a movie.")
122
+ prev_quote = st.session_state.get('prev_quote', '')
123
+ if quote != prev_quote:
124
+ st.session_state.quote = quote
125
+ st.session_state.prev_quote = quote
126
+ st.session_state.movie_name, st.session_state.match_url, st.session_state.score, doc_id, matching_text = vq.submit_query(quote)
127
+ if st.session_state.score < SCORE_THRESHOLD:
128
+ st.session_state.movie_name = None
129
+ else:
130
+ future = executor.submit(fetch_summary, vq, matching_text, doc_id)
131
+ st.session_state.summary_future = future
132
+
133
+ if 'score' in st.session_state and st.session_state.score:
134
+ if st.session_state.movie_name is None:
135
+ st.write("Sorry, I couldn't find a match for that quote. Please try another one.")
136
+ else:
137
+ video_url, start_time = st.session_state.match_url.split('&t=')
138
+ video_url = f"{video_url}&cc_load_policy=1"
139
+ start_time = start_time[:-1] # remove the trailing 's'
140
+
141
+ _, video_col, summary_col = st.columns([1, 4, 3])
142
+ with video_col:
143
+ st.video(video_url, start_time=int(float(start_time)))
144
+ with summary_col:
145
+ # Display the summary when it's ready
146
+ if 'summary_future' in st.session_state:
147
+ if st.session_state.summary_future.done():
148
+ st.markdown("**Summary:**")
149
+ st.session_state.summary = st.session_state.summary_future.result()
150
+ st.markdown(st.session_state.summary)
151
+
152
+ if not st.session_state.summary_future.done():
153
+ st.rerun()
154
+
155
 
156
  if __name__ == "__main__":
157
+ launch_app()
158
 
header-image-1.png ADDED

Git LFS Details

  • SHA256: b5d85c71049219b41d52abd59f3f0ee00157967ae90e3f9aa2dfca258072813d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.8 MB
query.py CHANGED
@@ -1,36 +1,67 @@
1
  import requests
2
  import json
3
- import re
4
 
5
  class VectaraQuery():
6
- def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
7
  self.customer_id = customer_id
8
  self.corpus_ids = corpus_ids
9
  self.api_key = api_key
10
- self.prompt_name = prompt_name if prompt_name else "vectara-experimental-summary-ext-2023-12-11-sml"
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- def get_body(self, query_str: str):
13
  corpora_key_list = [{
14
- 'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.005}
15
  } for corpus_id in self.corpus_ids
16
  ]
17
- return {
 
 
 
 
 
 
18
  'query': [
19
  {
20
  'query': query_str,
21
  'start': 0,
22
- 'numResults': 10,
23
  'corpusKey': corpora_key_list,
24
- 'context_config': {
25
- 'sentences_before': 2,
26
- 'sentences_after': 2,
27
- 'start_tag': "%START_SNIPPET%",
28
- 'end_tag': "%END_SNIPPET%",
29
  },
30
- 'rerankingConfig': { 'rerankerId': 272725719 }
31
  }
32
  ]
33
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def get_headers(self):
36
  return {
@@ -44,7 +75,7 @@ class VectaraQuery():
44
  def submit_query(self, query_str: str):
45
 
46
  endpoint = "https://api.vectara.io/v1/query"
47
- body = self.get_body(query_str)
48
 
49
  response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
50
  if response.status_code != 200:
@@ -56,11 +87,12 @@ class VectaraQuery():
56
  responses = res['responseSet'][0]['response'][:top_k]
57
  documents = res['responseSet'][0]['document']
58
 
59
-
60
  metadatas = []
61
  for x in responses:
62
  md = {m["name"]: m["value"] for m in x["metadata"]}
63
  doc_num = x["documentIndex"]
 
 
64
  doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
65
  md.update(doc_md)
66
  metadatas.append(md)
@@ -68,6 +100,24 @@ class VectaraQuery():
68
  movie_title = metadatas[0].get("doc_title", None)
69
  snippet_url = metadatas[0].get("url", None)
70
  score = responses[0]["score"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- return movie_title, snippet_url, score
73
 
 
1
  import requests
2
  import json
 
3
 
4
  class VectaraQuery():
5
+ def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str]):
6
  self.customer_id = customer_id
7
  self.corpus_ids = corpus_ids
8
  self.api_key = api_key
9
+ self.START_TAG = "<em_start>"
10
+ self.END_TAG = "<em_end>"
11
+ self.prompt_name = "vectara-summary-ext-24-05-med"
12
+ self.prompt_text = '''
13
+ [{"role": "system", "content": "Follow these detailed step-by-step instructions, your task is to generate an accurate and coherent summary of the first search result.
14
+ - You will receive a single search result enclosed in triple quotes, which includes part of a script from a movie.
15
+ - the search result can be a part of a larger movie scence, and may be incomplete.
16
+ - the text is a sequence of subtitles from the movie itself.
17
+ - Base your summary only on the information provided in the search result, do not use any other sources.
18
+ - Do no include the word summary in your response, just the summary itself.
19
+ - Summarize the scene including who the characters are, what they do and any other important detail."},
20
+ {"role": "user", "content": "#foreach ($qResult in $vectaraQueryResults) Search Result $esc.java($foreach.index + 1): \'\'\'$esc.java($qResult.text())\'\'\'.#end"}
21
+ ]
22
+ '''
23
 
24
+ def get_body(self, query_str: str, filter: str = None, summarize: bool = True):
25
  corpora_key_list = [{
26
+ 'customerId': self.customer_id, 'corpusId': corpus_id, 'lexicalInterpolationConfig': {'lambda': 0.005}
27
  } for corpus_id in self.corpus_ids
28
  ]
29
+ if filter:
30
+ for key in corpora_key_list:
31
+ key['filter'] = filter
32
+
33
+ sent_before = 15 if summarize else 1
34
+ sent_after = 15 if summarize else 1
35
+ body = {
36
  'query': [
37
  {
38
  'query': query_str,
39
  'start': 0,
40
+ 'numResults': 50,
41
  'corpusKey': corpora_key_list,
42
+ 'contextConfig': {
43
+ 'sentences_before': sent_before,
44
+ 'sentences_after': sent_after,
45
+ 'start_tag': self.START_TAG,
46
+ 'end_tag': self.END_TAG
47
  },
 
48
  }
49
  ]
50
  }
51
+ if summarize:
52
+ body['query'][0]['summary'] = [
53
+ {
54
+ 'responseLang': 'eng',
55
+ 'maxSummarizedResults': 1,
56
+ 'summarizerPromptName': self.prompt_name,
57
+ 'promptText': self.prompt_text
58
+ }
59
+ ]
60
+ else:
61
+ body['query'][0]['rerankingConfig'] = { 'rerankerId': 272725719 } # rerank only in main query, not when summarizing
62
+
63
+ return body
64
+
65
 
66
  def get_headers(self):
67
  return {
 
75
  def submit_query(self, query_str: str):
76
 
77
  endpoint = "https://api.vectara.io/v1/query"
78
+ body = self.get_body(query_str, filter=None, summarize=False)
79
 
80
  response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
81
  if response.status_code != 200:
 
87
  responses = res['responseSet'][0]['response'][:top_k]
88
  documents = res['responseSet'][0]['document']
89
 
 
90
  metadatas = []
91
  for x in responses:
92
  md = {m["name"]: m["value"] for m in x["metadata"]}
93
  doc_num = x["documentIndex"]
94
+ doc_id = documents[doc_num]["id"]
95
+ md['doc_id'] = doc_id
96
  doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
97
  md.update(doc_md)
98
  metadatas.append(md)
 
100
  movie_title = metadatas[0].get("doc_title", None)
101
  snippet_url = metadatas[0].get("url", None)
102
  score = responses[0]["score"]
103
+ doc_id = metadatas[0]["doc_id"]
104
+ matching_text = responses[0]["text"].split(self.START_TAG)[1].split(self.END_TAG)[0].strip()
105
+
106
+ return movie_title, snippet_url, score, doc_id, matching_text
107
+
108
+ def get_summary(self, query_str: str, doc_id: str):
109
+
110
+ endpoint = "https://api.vectara.io/v1/query"
111
+ filter = f"doc.id == '{doc_id}'"
112
+ body = self.get_body(query_str, filter, summarize=True)
113
+
114
+ response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
115
+ if response.status_code != 200:
116
+ print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
117
+ return "Sorry, something went wrong in my brain. Please try again later."
118
+
119
+ res = response.json()
120
+ summary = res['responseSet'][0]['summary'][0]['text']
121
 
122
+ return summary
123