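"""W.E.B. Du Bois in the Crisis: a Gradio app that searches articles from
Dare You Fight (https://www.dareyoufight.org) with OpenAI embeddings and
answers questions with a short essay generated by gpt-3.5-turbo."""
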
import pandas as pd
from scipy import spatial
from collections import defaultdict

import tiktoken
import openai
import gradio as gr
from tenacity import retry, stop_after_attempt, wait_random_exponential



#df = pd.read_json('https://www.dropbox.com/scl/fi/uh964d1k6woc9wo3l2slc/dyf_w_embeddings.json?rlkey=j23j5338n4e88kvvsmj7s7aff&dl=1')
df = pd.read_json('dyf_w_embeddings.json')
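# Each row holds one article (or a segment of a long article) with columns:
# 'citation' (source reference), 'text' (article text), and 'embedding'
# (its text-embedding-ada-002 vector).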


GPT_MODEL = 'gpt-3.5-turbo'
EMBEDDING_MODEL = "text-embedding-ada-002"

# search function
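# tenacity is imported but was never applied; assuming the standard
# OpenAI-cookbook pattern was intended, this decorator retries the search
# (and its embedding call) on transient API errors with exponential backoff.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))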
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 25
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["citation"]+':\n'+row["text"]+'\nINDEX:'+str(i), relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def double_check(question, passage):
    """Ask the chat model whether a retrieved passage is topically related to the query."""
    message = f'Possibly related text: {passage}\n\nSearch query: {question}'
    messages = [
        {"role": "system", "content": "Is the following text topically related to the search query? Answer with just Yes or No."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    return 'yes' in response_message.lower()

def extract_numbers_after_index(text):
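    """Pull the integer after each 'INDEX:' marker so the source rows can be looked up later."""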
    numbers = []
    lines = text.split("\n")

    for line in lines:
        if "INDEX:" in line:
            index = line.split("INDEX:")[1].strip()
            try:
                number = int(index)
                numbers.append(number)
            except ValueError:
                pass

    return numbers


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below articles written by W.E.B. Du Bois to answer the subsequent question. Write your response in the form of a four-paragraph essay for a college class. If the answer cannot be found in the articles, write "I could not find an answer." Be sure to put direct quotations in quotation marks. Use APA-style in-text references where appropriate.'
    message = introduction
    article_cites = defaultdict(int)
    
    for string in strings:
        article_cite = string.splitlines()[0]
        next_article = f'\n\nDu Bois article:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + query, model=model)
            > token_budget
        ):
            break
        else:
            if double_check(query, string) and article_cites[article_cite] <= 2:  # at most three segments per article
                message += next_article
                article_cites[article_cite] += 1
    # Debug: report how many segments each article contributed.
    print(article_cites)
    # Label and append the user's question after the retrieved articles.
    return message + '\n\nQuestion: ' + query

def remove_lines_with_index(input_string):
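    """Drop the 'INDEX:' bookkeeping lines before the message is sent to the model."""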
    lines = input_string.strip().split('\n')
    cleaned_lines = [line for line in lines if "INDEX:" not in line]
    cleaned_string = "\n".join(cleaned_lines)
    return cleaned_string

def ask(query: str) -> str:
    """Answer a query using GPT and a dataframe of relevant texts and embeddings."""
    model = GPT_MODEL
    # Reserve roughly 600 of the model's 4,096-token context for the reply.
    token_budget = 4096 - 600

    message = query_message(query, df, model=model, token_budget=token_budget)
    
    # Add references
    cite_rows = extract_numbers_after_index(message)
    used_df = df[df.index.isin(cite_rows)].copy()
    citations = list(set(used_df['citation'].values))
    if not citations:
        return "No relevant articles found. Sorry. Please try a different question."
    
    resources = '**Resources**\n* ' + '\n* '.join(sorted(citations))
    # clean up to remove index
    message = remove_lines_with_index(message)
    
    
    messages = [
        {"role": "system", "content": "You answer questions based on the writings of W.E.B. Du Bois. All the provided texts are written by Du Bois."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]

    answer = f'{resources}\n\n**Summary**\n\n{response_message}'
    return answer



intro_text = '''
# W.E.B. Du Bois in the Crisis

This search engine finds the most relevant articles from [Dare You Fight](https://www.dareyoufight.org), an online repository of W.E.B. Du Bois's writings in The Crisis, the official journal of the NAACP, which Du Bois founded and edited between 1911 and 1934. In addition to locating the most relevant articles, it produces a short essay in response to your question.

**Notes:**
* Avoid using "Du Bois" in the question, as this information is passed along behind the scenes.
* Searches can take 20 to 40 seconds.
* You may need to ask a follow-up question if your original question is only a word or two.
* The model usually looks at five or fewer relevant articles, so if your response requires more, consider refining and splitting up your question.

**Caveats:** Like all apps that employ large language models, this one has the possibility of bias and confabulation. Please refer to the original articles.

'''

outro_text = '''
**Behind the Scenes**

This app uses sentence embeddings and a large language model to craft the response. Behind the scenes, it involves the following steps:

1. Each article from Dare You Fight (or segment of the article if it's long) is converted into a fixed-length vector representation using OpenAI's text-embedding-ada-002 model. These representations are stored in a dataframe.
2. The user's query is embedded using the same text-embedding-ada-002 model to convert it into a fixed-length vector representation.
3. To find the most relevant articles to the query, cosine similarity is calculated between the query vector and all the article vectors. The articles with the highest cosine similarity are retrieved as the top matches.
4. The text of each of the possibly related articles (based on Step 3) is passed to OpenAI's gpt-3.5-turbo model, along with a question asking whether the text is relevant to the search query. Only texts coded as relevant are used in subsequent steps.
5. All of the relevant texts (from Step 4), along with the original search query, are passed to OpenAI's gpt-3.5-turbo model with specific instructions to answer the query in the form of a college essay using only the supplied texts.
'''



block = gr.Blocks(theme='bethecloud/storj_theme')

with block:
    gr.Markdown(intro_text)
        
    # Define the input and output blocks
    input_block = gr.Textbox(label='Question')
    research_btn = gr.Button(value="Ask the archive")
    output_block = gr.Markdown(label="Response")
    research_btn.click(ask, inputs=input_block, outputs=output_block)
    gr.Examples(["What is the relationship between social, political and economic equality?", 
                 "What is Pan-Africanism?",
                 "Did Du Bois support American involvement in WWI?",
                 "What are the most effective tactics or methods for racial equality?",
                 "Why was the NAACP founded and what was it's original goals?"], inputs=[input_block])
    gr.Markdown(outro_text)

# Launch the interface
block.launch()