FreshP committed on
Commit 6779746
1 Parent(s): aae3e83

Added examples

Files changed (1)
  1. app.py +51 -42
app.py CHANGED
@@ -10,77 +10,86 @@ from gensim.models.fasttext import load_facebook_model
 
 ACCESS_KEY = os.environ.get('ACCESS_KEY')
 
-# download model from huggingface hub
+
+# Setup model
 url = hf_hub_url(repo_id="simonschoe/call2vec", filename="model.bin")
 cached_download(url)
-
-# load model via gensim
 model = load_facebook_model(cached_download(url))
 
-def process(_input, topn):
+def semantic_search(_input, n):
+    """ Perform semantic search """
 
-    # split by delimiting characters
     _input = re.split('[,;\n]', _input)
-
-    # convert input to lower, replace whitespaces by underscores
     _input = [s.strip().lower().replace(' ', '_') for s in _input if s]
 
     if _input[0] != ACCESS_KEY:
         with open('log.txt', 'a') as f:
             f.write(str(datetime.now()) + '+++' + '___'.join(_input) + '\n')
 
-    # apply model
-    if len(_input)>1:
-        # compute average seed embedding
+    if len(_input) > 1:
         avg_input = np.stack([model.wv[w] for w in _input], axis=0).mean(axis=0)
-        # find similarities
-        nearest_neighbors = model.wv.most_similar(positive=avg_input, topn=topn)
-        frequencies = [model.wv.get_vecattr(nn[0], 'count') for nn in nearest_neighbors]
+        nearest_neighbours = model.wv.most_similar(positive=avg_input, topn=n)
+        frequencies = [model.wv.get_vecattr(nn[0], 'count') for nn in nearest_neighbours]
     else:
-        # find similarities
-        nearest_neighbors = model.wv.most_similar(positive=_input[0], topn=topn)
-        frequencies = [model.wv.get_vecattr(nn[0], 'count') for nn in nearest_neighbors]
-
-    result = pd.DataFrame([(a[0],a[1],b) for a,b in zip(nearest_neighbors, frequencies)], columns=['Token', 'Cosine Similarity', 'Frequency'])
+        nearest_neighbours = model.wv.most_similar(positive=_input[0], topn=n)
+        frequencies = [model.wv.get_vecattr(nn[0], 'count') for nn in nearest_neighbours]
+
     if _input[0] == ACCESS_KEY:
         with open('log.txt', 'r') as f:
             prompts = f.readlines()
             prompts = [p.strip().split('+++') for p in prompts]
-        result = pd.DataFrame(prompts, columns=['Time', 'Prompt'])
+        result = pd.DataFrame(prompts,
+                              columns=['Time', 'Prompt'])
+    else:
+        result = pd.DataFrame([(a[0], a[1], b) for a, b in zip(nearest_neighbours, frequencies)],
+                              columns=['Token', 'Cosine Similarity', 'Corpus Frequency'])
+
     result.to_csv('result.csv')
     return result, 'result.csv', '\n'.join(_input)
 
-def save(df):
-    df.to_csv('result.csv')
-    return 'result.csv'
-
-demo = gr.Blocks()
+app = gr.Blocks()
 
-with demo:
+with app:
     gr.Markdown("# Call2Vec")
-    gr.Markdown("## Earnings call transformation project")
+    gr.Markdown("## Semantic Search in Quarterly Earnings Conference Calls")
     with gr.Row():
         with gr.Column():
-            gr.Markdown("""
+            gr.Markdown(
+                """
                 #### Project Description
-                Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.""")
+                Call2Vec is a [fastText](https://fasttext.cc/) word embedding model trained via [Gensim](https://radimrehurek.com/gensim/). It maps each token in the vocabulary into a dense, 300-dimensional vector space designed for semantic search.
+                The model is trained on a large sample of quarterly earnings conference calls held by U.S. firms during the 2006-2022 period. In particular, the training data is restricted to the (rather spontaneous) executives' remarks in the Q&A section of the call. The data has been preprocessed prior to model training via stop word removal, lemmatization, named entity masking, and co-occurrence modeling.
+                """
+            )
             gr.Markdown(
-                """#### App usage:
-                Add your input prompts to the text field on the right. To use multiple input prompts at once separate
-                them by comma, semicolon or a new line
-                ##### Examples
-                - Climate change
-                - Financial risk, energy dependency, climate neutrality
+                """
+                #### App usage
+                The model is intended to be used for **semantic search**: it encodes the search query (entered in the textbox on the right) in a dense vector space and finds semantic neighbours, i.e., tokens that frequently occur within similar contexts in the underlying training data.
+                The model allows for two use cases:
+                1. *Single Search:* The input query consists of a single word. When provided a bi-, tri-, or even four-gram, the quality of the model output depends on the presence of the query token in the model's vocabulary. N-grams should be concatenated with an underscore (e.g., "machine_learning" or "artificial_intelligence").
+                2. *Multi Search:* The input query may consist of several words or n-grams, separated by comma, semicolon, or newline. The model then computes the average vector over all inputs and performs semantic search based on this average.
                 """
             )
         with gr.Column():
-            text_input = gr.Textbox(lines=1)
+            text_in = gr.Textbox(lines=1, placeholder="Insert text", label="Search Query")
             with gr.Row():
-                n_output = gr.Slider(minimum=5, maximum=50, step=1)
-                compute_button = gr.Button("Compute")
-            df_output = gr.Dataframe(interactive=False)
-            file_out = gr.File(interactive=False)
-
-    compute_button.click(process, inputs=[text_input, n_output], outputs=[df_output, file_out, text_input])
+                n = gr.Slider(value=50, minimum=5, maximum=250, step=5, label="Number of Neighbours")
+                compute_bt = gr.Button("Start\nSearch")
+            df_out = gr.Dataframe(interactive=False)
+            f_out = gr.File(interactive=False, label="Download")
+            gr.Examples(
+                examples=[["transformation", 3], ["climate_change", 3], ["risk, political_risk, uncertainty", 5]],
+                inputs=[text_in, n],
+                outputs=[df_out, f_out, text_in],
+                fn=semantic_search,
+                cache_examples=True
+            )
+    gr.Markdown(
+        """
+        <div style='text-align: center;'>Call2Vec by X and Y</div>
+        <p class="aligncenter"><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.glitch.me/badge?page_id=simonschoe.call2vec&left_color=green&right_color=blue" /></p>
+        """
+    )
+    compute_bt.click(semantic_search, inputs=[text_in, n], outputs=[df_out, f_out, text_in])
 
-demo.launch()
+app.launch()
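
For readers unfamiliar with the Gensim calls used in `semantic_search`, here is a minimal standalone sketch of the search logic added in this commit, outside the Gradio app. The repo id and library calls are taken directly from the diff; running it assumes `gensim` plus a `huggingface_hub` version that still provides `hf_hub_url` and `cached_download`, and example query tokens are illustrative only.

```python
import numpy as np
from huggingface_hub import hf_hub_url, cached_download
from gensim.models.fasttext import load_facebook_model

# Download and load the fastText model, as app.py does
url = hf_hub_url(repo_id="simonschoe/call2vec", filename="model.bin")
model = load_facebook_model(cached_download(url))

# Single search: one query token (n-grams concatenated with an underscore)
print(model.wv.most_similar(positive="climate_change", topn=5))

# Multi search: average the input vectors, then query neighbours of the mean vector
tokens = ["risk", "political_risk", "uncertainty"]
avg = np.stack([model.wv[t] for t in tokens], axis=0).mean(axis=0)
print(model.wv.most_similar(positive=avg, topn=5))
```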