File size: 5,167 Bytes
64ab470
 
d055610
64ab470
d055610
 
 
64ab470
d055610
64ab470
c00a75f
d055610
 
 
64ab470
d055610
c00a75f
64ab470
 
 
 
 
d055610
64ab470
 
c00a75f
64ab470
d055610
c00a75f
64ab470
d055610
c00a75f
d055610
64ab470
d055610
64ab470
 
c00a75f
 
 
 
 
d055610
64ab470
 
 
d055610
64ab470
2cc38ad
d055610
 
 
 
 
 
 
 
 
e45808b
 
6cfc464
d055610
e45808b
 
b06fb6c
 
 
 
 
 
 
 
 
 
 
d055610
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64ab470
2cc38ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import re
from datetime import datetime

import gradio as gr
import numpy as np
import pandas as pd
from gensim.models.fasttext import load_facebook_model
from huggingface_hub import hf_hub_download


# Secret query string: when it is entered as the first search term,
# `semantic_search` returns the prompt log instead of search results.
# Evaluates to None when the environment variable is not set.
ACCESS_KEY = os.getenv('ACCESS_KEY')

# Fetch the fastText binary from the Hugging Face Hub and load it once at
# import time.
# NOTE(review): despite its name, `url` presumably holds the local path of the
# downloaded file (what `hf_hub_download` returns) - confirm against the
# huggingface_hub documentation.
url = hf_hub_download(
    repo_id="simonschoe/call2vec",
    filename="model.bin",
)
model = load_facebook_model(url)

def _parse_query(raw):
    """Split a raw query on comma, semicolon or newline into normalised tokens."""
    pieces = (
        piece.strip().lower().replace(' ', '_')
        for piece in re.split('[,;\n]', raw or '')
    )
    # Filter AFTER stripping so whitespace-only fragments (e.g. the tail of
    # "risk, ") never reach the model as empty tokens.
    return [piece for piece in pieces if piece]


def _read_prompt_log(path='log.txt'):
    """Load the logged prompts from *path* as a (Time, Prompt) data frame."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            # maxsplit=1: a prompt may itself contain '+++'; only the first
            # separator divides timestamp and prompt.
            rows = [line.strip().split('+++', 1) for line in f if line.strip()]
    except FileNotFoundError:
        # Nothing has been logged yet.
        rows = []
    # Skip malformed lines that carry no separator at all.
    rows = [row for row in rows if len(row) == 2]
    return pd.DataFrame(rows, columns=['Time', 'Prompt'])


def semantic_search(_input, neighbours):
    """Perform semantic search.

    The query is split on commas, semicolons and newlines; each token is
    stripped, lower-cased and has spaces replaced by underscores. A single
    token is looked up directly, several tokens are averaged into one query
    vector before the nearest neighbours are retrieved.

    If the first token equals ``ACCESS_KEY`` the logged prompts are returned
    instead of search results (and the request itself is not logged).
    NOTE(review): because tokens are lower-cased and spaces become
    underscores, an ``ACCESS_KEY`` containing upper-case letters or spaces
    can never match - confirm the configured key is already normalised.

    Args:
        _input: Raw query text from the search box.
        neighbours: Number of nearest neighbours to return (coerced to int).

    Returns:
        A tuple ``(result, csv_path, normalised_query)`` where ``result`` is a
        ``pandas.DataFrame`` (columns Token / Cosine Similarity / Corpus
        Frequency, or Time / Prompt for the log view), ``csv_path`` is the
        path of the CSV file the frame was written to, and
        ``normalised_query`` is the parsed tokens joined by newlines.
        An empty query yields an empty result frame instead of raising.
    """
    tokens = _parse_query(_input)
    result_columns = ['Token', 'Cosine Similarity', 'Corpus Frequency']

    if not tokens:
        # Empty / whitespace-only query: nothing to search and nothing to log.
        result = pd.DataFrame(columns=result_columns)
    elif tokens[0] == ACCESS_KEY:
        # Log view: skip the (pointless) neighbour search on the key itself.
        result = _read_prompt_log()
    else:
        # Record the prompt before searching so failed lookups are logged too.
        with open('log.txt', 'a', encoding='utf-8') as f:
            f.write(f"{datetime.now()}+++{'___'.join(tokens)}\n")

        if len(tokens) > 1:
            # Multi search: query with the mean of the token vectors.
            query = np.stack([model.wv[w] for w in tokens], axis=0).mean(axis=0)
        else:
            query = tokens[0]

        nearest_neighbours = model.wv.most_similar(positive=query, topn=int(neighbours))
        result = pd.DataFrame(
            [
                (token, similarity, model.wv.get_vecattr(token, 'count'))
                for token, similarity in nearest_neighbours
            ],
            columns=result_columns,
        )

    result.to_csv('result.csv')
    return result, 'result.csv', '\n'.join(tokens)

# Build the Gradio interface: a search tab wired to `semantic_search`, two
# static documentation tabs, and a citation accordion. Components are created
# in the same order as before, since the CSS rule below targets the element
# with id `component-0`.
app = gr.Blocks(theme=gr.themes.Default(), css='#component-0 {max-width: 730px; margin: auto; padding-top: 1.5rem}')

with app:
    gr.Markdown(
        """
        # Call2Vec
        ## Semantic Search in Quarterly Earnings Conference Calls
        """
    )

    with gr.Tabs() as tabs:
        # Tab labels previously contained mis-encoded bytes (UTF-8 emoji read
        # with a single-byte codec); they are restored to the intended emoji.
        with gr.TabItem("🔍 Model", id=0):
            text_in = gr.Textbox(lines=1, placeholder="Insert text", label="Search Query")
            with gr.Row():
                n = gr.Slider(value=50, minimum=5, maximum=500, step=5, label="Number of Neighbours")
                btn = gr.Button("Search")
            df_out = gr.Dataframe(interactive=False)
            f_out = gr.File(interactive=False, label="Download")
            # Clickable example queries; they run through the same function
            # and outputs as the Search button.
            gr.Examples(
                examples = [
                    ["transformation", 20],
                    ["climate_change", 50],
                    ["risk, political_risk, uncertainty", 250],
                ],
                inputs = [text_in, n],
                outputs = [df_out, f_out, text_in],
                fn = semantic_search,
                cache_examples=True
            )
        with gr.TabItem("📝 Usage", id=1):
            gr.Markdown(
                """
                #### App usage
                The model is intended to be used for **semantic search**: It encodes the search query (entered in the textbox on the right) in a dense vector space and finds semantic neighbours, i.e., tokens which frequently occur within similar contexts in the underlying training data.
                The model allows for two use cases:
                1. *Single Search:* The input query consists of a single word. When provided a bi-, tri-, or even fourgram, the quality of the model output depends on the presence of the query token in the model's vocabulary. N-grams should be concatenated by an underscore (e.g., "machine_learning" or "artificial_intelligence").
                2. *Multi Search:* The input query may consist of several words or n-grams, separated by comma, semi-colon or newline. It then computes the average vector over all inputs and performs semantic search based on the average input token.
                """
            )
        with gr.TabItem("📖 About", id=2):
            gr.Markdown(
                """
                #### Project Description
                Call2Vec is a [fastText](https://fasttext.cc/) word embedding model trained via [Gensim](https://radimrehurek.com/gensim/). It maps each token in the vocabulary into a dense, 300-dimensional vector space, designed for performing semantic search.
                The model is trained on a large sample of quarterly earnings conference calls, held by U.S. firms during the 2006-2022 period. In particular, the training data is restricted to the (rather spontaneous) executives' remarks of the Q&A section of the call. The data has been preprocessed prior to model training via stop word removal, lemmatization, named entity masking, and co-occurrence modeling.
                """
            )

    with gr.Accordion("📙 Citation", open=False):
        citation_button = gr.Textbox(
            value='Placeholder',
            label='Copy to cite these results.',
            show_copy_button=True
        )

    # The third output writes the normalised query returned by
    # `semantic_search` back into the search box.
    btn.click(semantic_search, inputs=[text_in, n], outputs=[df_out, f_out, text_in])

app.launch()