NealCaren committed on
Commit
24b752a
1 Parent(s): ae69701
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +63 -126
  3. rw7.json +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  dyf_w_embeddings.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  dyf_w_embeddings.json filter=lfs diff=lfs merge=lfs -text
37
+ rw7.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -11,18 +11,32 @@ from tenacity import retry, stop_after_attempt, wait_random_exponential
11
 
12
 
13
  #df = pd.read_json('https://www.dropbox.com/scl/fi/uh964d1k6woc9wo3l2slc/dyf_w_embeddings.json?rlkey=j23j5338n4e88kvvsmj7s7aff&dl=1')
14
- df = pd.read_json('dyf_w_embeddings.json')
15
-
16
 
17
  GPT_MODEL = 'gpt-3.5-turbo'
18
  EMBEDDING_MODEL = "text-embedding-ada-002"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # search function
21
  def strings_ranked_by_relatedness(
22
  query: str,
23
  df: pd.DataFrame,
24
  relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
25
- top_n: int = 25
26
  ) -> tuple[list[str], list[float]]:
27
  """Returns a list of strings and relatednesses, sorted from most related to least."""
28
  query_embedding_response = openai.Embedding.create(
@@ -31,152 +45,76 @@ def strings_ranked_by_relatedness(
31
  )
32
  query_embedding = query_embedding_response["data"][0]["embedding"]
33
  strings_and_relatednesses = [
34
- (row["citation"]+':\n'+row["text"]+'\nINDEX:'+str(i), relatedness_fn(query_embedding, row["embedding"]))
35
  for i, row in df.iterrows()
36
  ]
37
  strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
38
  strings, relatednesses = zip(*strings_and_relatednesses)
39
- return strings[:top_n], relatednesses[:top_n]
40
 
41
- def num_tokens(text: str, model: str = GPT_MODEL) -> int:
42
  """Return the number of tokens in a string."""
43
- encoding = tiktoken.encoding_for_model(model)
44
  return len(encoding.encode(text))
45
 
46
- def double_check(question, passage):
47
-
48
- message = f'Possibly related text:{passage}\n\nSearch query: {question}'
49
- messages = [
50
- {"role": "system", "content": "Is the following text topically related to the search query. Answer with just Yes or No."},
51
- {"role": "user", "content": message},
52
- ]
53
- response = openai.ChatCompletion.create(
54
- model='gpt-3.5-turbo',
55
- messages=messages,
56
- temperature=0
57
- )
58
- response_message = response["choices"][0]["message"]["content"]
59
- if 'yes' in response_message.lower():
60
- return True
61
- return False
62
- return response_message
63
-
64
- def extract_numbers_after_index(text):
65
- numbers = []
66
- lines = text.split("\n")
67
-
68
- for line in lines:
69
- if "INDEX:" in line:
70
- index = line.split("INDEX:")[1].strip()
71
- try:
72
- number = int(index)
73
- numbers.append(number)
74
- except ValueError:
75
- pass
76
-
77
- return numbers
78
-
79
-
80
- def query_message(
81
- query: str,
82
- df: pd.DataFrame,
83
- model: str,
84
- token_budget: int
85
- ) -> str:
86
- """Return a message for GPT, with relevant source texts pulled from a dataframe."""
87
- strings, relatednesses = strings_ranked_by_relatedness(query, df)
88
- introduction = 'Use the below articles written by W.E.B. Du Bois subsequent question. Write your response in the form of an four paragraph essay for a college class. If the answer cannot be found in the articles, write "I could not find an answer. Be sure to put direct quotations in quotation marks. Use in APA-Style text references where approriate.'
89
- message = introduction
90
- article_cites = defaultdict(int)
91
-
92
- for counter, string in enumerate(strings):
93
- article_cite = string.splitlines()[0]
94
- next_article = f'\n\nDu Bois article:\n"""\n{string}\n"""'
95
  if (
96
- num_tokens(message + next_article + query, model=model)
97
- > token_budget
98
  ):
99
  break
100
  else:
101
- if double_check(query, string) == True and article_cites[article_cite] <= 2:
102
- message += next_article
103
- article_cites[article_cite] += 1
104
- print(article_cites)
105
- return message + query
106
-
107
- def remove_lines_with_index(input_string):
108
- lines = input_string.strip().split('\n')
109
- cleaned_lines = [line for line in lines if "INDEX:" not in line]
110
- cleaned_string = "\n".join(cleaned_lines)
111
- return cleaned_string
112
-
113
- def ask(
114
- query: str,
115
-
116
- ) -> str:
117
- """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
118
 
119
- model = GPT_MODEL
120
- token_budget = 4096 - 600
121
-
 
 
122
 
123
- message = query_message(query, df, model=model, token_budget=token_budget)
 
 
 
124
 
125
- # Add references
126
- cite_rows = extract_numbers_after_index(message)
127
- used_df = df[df.index.isin(cite_rows)].copy()
128
- citations = list(set(used_df['citation'].values))
129
- if len(citations) == 0:
130
- return "No relevant articles found. Sorry. Please try a different question."
131
-
132
- resources = '**Resources**\n* ' + '\n* '.join(sorted(citations))
133
- # clean up to remove index
134
- message = remove_lines_with_index(message)
135
-
136
-
137
- messages = [
138
- {"role": "system", "content": "You answer questions based on the writings of W.E.B. Du Bois. All the provided texts are written by Du Bois."},
139
- {"role": "user", "content": message},
140
- ]
141
  response = openai.ChatCompletion.create(
142
- model=model,
143
- messages=messages,
144
- temperature=0
145
- )
146
- response_message = response["choices"][0]["message"]["content"]
147
-
148
-
 
 
149
 
150
- answer = f'{resources}\n\n**Summary**\n\n{response_message}'
151
- return answer
152
 
153
 
154
 
155
  intro_text = '''
156
- # W.E.B. Du Bois in the Crisis
157
-
158
- This search engine find the most relevant articles from [Dare You Fight](https://www.dareyoufight.org), an online repository of W.E.B. Du Bois's writings in The Crisis, the official journal of the NAACP, which Du Bois founded and edited between 1911 and 1934. In addition to locating the most relevant articles, it also produces a short essay in response to your question.
159
-
160
- **Notes:**
161
- * Avoid using "Du Bois" in the question, as this is information is passed along behind the scenes.
162
- * Searches can take 20 to 40 seconds.
163
- * You may need a follow up question if your original question is only a word or two.
164
- * The model usually looks at five or fewer relevant articles, so if you response requires more, consider refining and splitting up your question.
165
-
166
- **Caveats:** Like all apps that employ large language models, this one has the possiblitiy for bias and confabulation. Please refer to the original articles.
167
 
 
168
  '''
169
 
170
  outro_text = '''
 
 
171
  **Behind the Scenes**
172
 
173
  This app uses sentence embeddings and a large language model to craft the response. Behind the scenes, it involves the following steps:
174
 
175
- 1. Each article from Dare You Fight (or segment of the article if it's long) is converted into a fixed-length vector representation using OpenAI's text-embedding-ada-002 model. These representations are stored in a dataframe.
176
- 2. The user's query is embedded using the same text-embedding-ada-002 model to convert it into a fixed-length vector representation.
177
- 3. To find the most relevant articles to the query, cosine similarity is calculated between the query vector and all the article vectors. The articles with the highest cosine similarity are retrieved as the top matches.
178
- 4. The text of each of the possibly related articles (based on Step 3) is passed to OpenAI's ChatGPT 3.5 model, along with a question asking whether the text is relevant to the search query. Only texts coded as relevant are used in subsequent steps.
179
- 5. All of the relevant texts (from Step 4), along with the original search query, are passed to OpenAI's ChatGPT 3.5 model with specific instructions to answer the query in the form of a college essay using only the supplied texts.
180
  '''
181
 
182
 
@@ -188,16 +126,15 @@ with block:
188
 
189
  # Define the input and output blocks
190
  input_block = gr.Textbox(label='Question')
191
- research_btn = gr.Button(value="Ask the archive")
192
  output_block = gr.Markdown(label="Response")
193
  research_btn.click(ask, inputs=input_block, outputs=output_block)
194
- gr.Examples(["What is the relationship between social, political and economic equality?",
195
- "What is Pan-Africanism?",
196
- "Did Du Bois support American involvement in WWI?",
197
- "What are the most effective tactics or methods for racial equality?",
198
- "Why was the NAACP founded and what was it's original goals?"], inputs=[input_block])
199
  gr.Markdown(outro_text)
200
 
 
 
201
  # Launch the interface
202
  block.launch()
203
 
 
11
 
12
 
13
  #df = pd.read_json('https://www.dropbox.com/scl/fi/uh964d1k6woc9wo3l2slc/dyf_w_embeddings.json?rlkey=j23j5338n4e88kvvsmj7s7aff&dl=1')
14
+ df = pd.read_json('rw7.json')
 
15
 
16
  GPT_MODEL = 'gpt-3.5-turbo'
17
  EMBEDDING_MODEL = "text-embedding-ada-002"
18
+ @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
19
+ def ask_naive(query):
20
+ messages = [
21
+ {"role": "system", "content": "You are a collegee sociology professor. Provide a very brief answer to this student question."},
22
+ {"role": "user", "content": query},
23
+ ]
24
+ response = openai.ChatCompletion.create(
25
+ model='gpt-3.5-turbo',
26
+ messages=messages,
27
+ )
28
+
29
+ response_message = response["choices"][0]["message"]["content"]
30
+ return response_message
31
+
32
+ # search function
33
 
34
  # search function
35
  def strings_ranked_by_relatedness(
36
  query: str,
37
  df: pd.DataFrame,
38
  relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
39
+ top_n: int = 100
40
  ) -> tuple[list[str], list[float]]:
41
  """Returns a list of strings and relatednesses, sorted from most related to least."""
42
  query_embedding_response = openai.Embedding.create(
 
45
  )
46
  query_embedding = query_embedding_response["data"][0]["embedding"]
47
  strings_and_relatednesses = [
48
+ (row["text"], relatedness_fn(query_embedding, row["embedding"]))
49
  for i, row in df.iterrows()
50
  ]
51
  strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
52
  strings, relatednesses = zip(*strings_and_relatednesses)
53
+ return strings[:top_n]
54
 
55
+ def num_tokens(text: str) -> int:
56
  """Return the number of tokens in a string."""
57
+ encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
58
  return len(encoding.encode(text))
59
 
60
+ def build_resources(psuedo_answer):
61
+ related_book_selections = strings_ranked_by_relatedness(psuedo_answer, df, top_n=15)
62
+ message = 'Real World Sociology selections:\n'
63
+ for selection in related_book_selections:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  if (
65
+ num_tokens(message + selection)
66
+ > 3000
67
  ):
68
  break
69
  else:
70
+ message += '\n' + selection
71
+ print(num_tokens(message))
72
+ return message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
75
+ def respond(question, textbook_samples):
76
+ messages = [
77
+ {"role": "system", "content": "You are a college profesor who excels at explaining topics to students. Start with a direct answer to the question. Then, definition/overview of the concept's essence; break it down into understandable pieces; use clear language and structure. Where approriate, provide connections and comparisions to related terms. "},
78
+ {"role": "user", "content": f"""Use markdown and emphasize important phrases in bold. Respond to the following question: {question}.
79
 
80
+ When contructing the question, use the following information from the textbook.
81
+ {textbook_samples}
82
+ """ }
83
+ ]
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  response = openai.ChatCompletion.create(
86
+ model='gpt-3.5-turbo',
87
+ n=1,
88
+ messages=messages)
89
+ return response["choices"][0]["message"]["content"]
90
+
91
+ def ask(query):
92
+ psuedo_answer = ask_naive(query)
93
+ resources = build_resources(psuedo_answer)
94
+ response = respond(query, resources)
95
 
96
+ return response
 
97
 
98
 
99
 
100
  intro_text = '''
101
+ # Ask the textbook
 
 
 
 
 
 
 
 
 
 
102
 
103
+ This app responds to your questions by looking up the most relevant selections from the textbook, and asking ChatGPT to respond based on the selections. It can take up to 30 seconds to respond.
104
  '''
105
 
106
  outro_text = '''
107
+ **Caveats:** Like all apps that employ large language models, this one has the possibility of bias and confabulation.
108
+
109
  **Behind the Scenes**
110
 
111
  This app uses sentence embeddings and a large language model to craft the response. Behind the scenes, it involves the following steps:
112
 
113
+ 1. Each page from the textbook (or segment of the page if it's long) is converted into a fixed-length vector representation using OpenAI's text-embedding-ada-002 model. These representations are stored in a dataframe.
114
+ 2. Your question is embedded using the same text-embedding-ada-002 model to convert it into a fixed-length vector representation.
115
+ 3. To find the pages most relevant to your question, cosine similarity is calculated between the query vector and all the page vectors. The pages with the highest cosine similarity are retrieved as the top matches.
116
+ 4. All of the relevant texts (from Step 3), along with the original search query, are passed to OpenAI's ChatGPT 3.5 model with specific instructions to answer the question using the supplied texts.
117
+
118
  '''
119
 
120
 
 
126
 
127
  # Define the input and output blocks
128
  input_block = gr.Textbox(label='Question')
129
+ research_btn = gr.Button(value="Ask the textbook")
130
  output_block = gr.Markdown(label="Response")
131
  research_btn.click(ask, inputs=input_block, outputs=output_block)
132
+ gr.Examples(["What is the difference beween organic and mechnical solidarity?",
133
+ ], inputs=[input_block])
 
 
 
134
  gr.Markdown(outro_text)
135
 
136
+
137
+
138
  # Launch the interface
139
  block.launch()
140
 
rw7.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e4437c739095717327abd9f6a92f8057fe466b2e4e4b74070282d1294128755
3
+ size 78978588