File size: 6,432 Bytes
ae69701 24b752a ae69701 24b752a 0508f51 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a ae69701 24b752a 5ebbf4f 24b752a ae69701 0508f51 24b752a ae69701 24b752a ae69701 24b752a ae69701 0cd5c6b ae69701 6e052a3 24b752a ae69701 6e052a3 ae69701 6e052a3 24b752a ae69701 2fa8f8b 566cb6b ae69701 24b752a ae69701 24b752a c11805d 24b752a ae69701 24b752a ae69701 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
from collections import defaultdict
from functools import lru_cache
from glob import glob

import gradio as gr
import openai
import pandas as pd
import tiktoken
from scipy import spatial
from tenacity import retry, stop_after_attempt, wait_random_exponential
# Pre-chunked textbook passages with precomputed embeddings.
# Expects 'text' and 'embedding' columns (read by strings_ranked_by_relatedness).
df = pd.read_json('rw7.json')
# Chat model used for both the naive draft answer and the final response.
GPT_MODEL = 'gpt-3.5-turbo'
# Embedding model; must match the one used to precompute df['embedding'].
EMBEDDING_MODEL = "text-embedding-ada-002"
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def ask_naive(query):
    """Generate a brief, ungrounded draft answer to a student question.

    The draft is not shown to the user; it is embedded and used to retrieve
    related textbook passages (see ask()). Retries with exponential backoff
    on transient API failures.

    Args:
        query: The student's question.

    Returns:
        The model's answer text.
    """
    messages = [
        {"role": "system", "content": "You are a college sociology professor. Provide a very brief answer to this student question."},
        {"role": "user", "content": query},
    ]
    # Use the module-level GPT_MODEL constant rather than repeating the
    # literal model name (keeps all call sites in sync).
    response = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=messages,
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message
# Search: rank textbook chunks by embedding similarity to a query.
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> list[str]:
    """Return up to top_n textbook chunks, most related to *query* first.

    Embeds *query* with EMBEDDING_MODEL, then scores every row of *df*
    (expects 'text' and 'embedding' columns) with relatedness_fn
    (cosine similarity by default).

    NOTE: the previous annotation claimed a (strings, relatednesses) tuple,
    but only the strings were ever returned; the signature now documents the
    actual behavior that build_resources() relies on.
    """
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    scored = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    # Building the result directly also avoids the ValueError that
    # zip(*[]) raised on an empty DataFrame.
    return [text for text, _ in scored[:top_n]]
@lru_cache(maxsize=None)
def _encoding_for(model_name: str):
    """Load (once) and cache the tiktoken encoding for a model name."""
    return tiktoken.encoding_for_model(model_name)


def num_tokens(text: str) -> int:
    """Return the number of tokens in *text* under GPT_MODEL's tokenizer.

    The encoding is cached: num_tokens is called once per candidate chunk in
    build_resources(), and encoding_for_model is expensive to reconstruct.
    Uses the GPT_MODEL constant instead of a repeated string literal.
    """
    return len(_encoding_for(GPT_MODEL).encode(text))
def build_resources(psuedo_answer):
    """Assemble a token-budgeted context message from textbook chunks.

    Retrieves the 15 chunks most related to *psuedo_answer* (the draft answer
    from ask_naive) and concatenates them under a header, stopping before the
    running message would exceed 3000 tokens — leaving headroom for the
    question and completion in the model's context window.

    Args:
        psuedo_answer: Draft answer text to retrieve against.
        (Parameter name kept for interface stability despite the typo.)

    Returns:
        A single string of newline-joined textbook selections.
    """
    related_book_selections = strings_ranked_by_relatedness(psuedo_answer, df, top_n=15)
    message = 'Real World Sociology selections:\n'
    for selection in related_book_selections:
        # Stop once adding the next chunk would push past the token budget.
        if num_tokens(message + selection) > 3000:
            break
        message += '\n' + selection
    # (Removed a leftover debug print of the final token count.)
    return message
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def respond(question, textbook_samples):
    """Answer *question* in markdown, grounded in *textbook_samples*.

    Retries with exponential backoff on transient API failures.

    Args:
        question: The student's original question.
        textbook_samples: Context string built by build_resources().

    Returns:
        The model's markdown-formatted answer text.
    """
    messages = [
        {"role": "system", "content": "You are a college professor who excels at explaining topics to students and is known for dad jokes and puns. Start with a direct answer to the question. Then, definition/overview of the concept's essence; break it down into understandable pieces; use clear language and structure. Always use examples related to the life of a college student. Where appropriate, provide connections and comparisons to related terms. "},
        {"role": "user", "content": f"""Use markdown and emphasize important phrases in bold. Respond to the following question: {question}.
When constructing the answer, use the following information from the textbook.
{textbook_samples}
""" }
    ]
    # Use the module-level GPT_MODEL constant rather than repeating the
    # literal model name (keeps all call sites in sync).
    response = openai.ChatCompletion.create(
        model=GPT_MODEL,
        n=1,
        messages=messages)
    return response["choices"][0]["message"]["content"]
def ask(query):
    """Answer *query* via retrieval-augmented generation.

    Pipeline: draft a naive answer, retrieve textbook passages related to
    that draft, then have the model answer the original question grounded
    in those passages.
    """
    draft = ask_naive(query)
    return respond(query, build_resources(draft))
# User-facing markdown shown above and below the Gradio interface.
# (Fixed the "possiblitiy" typo and an awkward sentence in the summary.)
intro_text = '''
This app responds to your questions by looking up the most relevant selections from the textbook, and asking ChatGPT to respond based on the selections.
Enter your question in the grey box below and click "Ask the textbook." It can take up to 30 seconds to respond.
'''
outro_text = '''
**Caveats:** Like all apps that employ large language models, this one has the possibility for bias and confabulation.
**Behind the Scenes**
This app uses a large language model (ChatGPT 3.5) and sentence embeddings (text-embedding-ada-002) to craft the response using what's called a retrieval-augmented generation process. Behind the scenes, it involves the following steps:
1. Each textbook page is broken down into small chunks of text.
2. A machine learning system converts each chunk of text into a mathematical representation called a vector. All these vectors get saved in a table.
3. ChatGPT is used to generate a sample answer to the question.
4. The sample answer is converted into a vector using the same method.
5. The vector for the sample answer is compared to all the vectors for the textbook chunks. The chunks whose vectors are most like the sample answer vector are identified. These chunks are likely to be relevant to answering the question.
6. The original question, along with the relevant textbook chunks that were found, is given to ChatGPT. ChatGPT is instructed to read the textbook chunks first and use them to help answer the question in its own words.
In summary:
- Text is converted to math vectors.
- Textbook vectors similar to a sample answer vector are found.
- The question and the similar textbook chunks are given to ChatGPT to answer using those chunks.
This process allows the AI system to search the textbook, find relevant information, and use it to generate a better answer to the question!
'''
# Build and launch the Gradio UI. Typos in the first example question
# ("beween", "mechnical") are fixed; layout and wiring are unchanged.
block = gr.Blocks(theme='bethecloud/storj_theme')
with block:
    gr.Markdown("# Ask the Sociology 101 Textbook")
    gr.Image("https://huggingface.co/spaces/NealCaren/Ask101/resolve/main/rw_cover.jpg")
    gr.Markdown(intro_text)
    # Input textbox feeds ask(); the markdown output shows the answer.
    input_block = gr.Textbox(label='Question')
    research_btn = gr.Button(value="Ask the textbook")
    output_block = gr.Markdown(label="Response")
    research_btn.click(ask, inputs=input_block, outputs=output_block)
    # Clickable example questions that populate the input box.
    gr.Examples(["What is the difference between organic and mechanical solidarity?",
                 "What are the main perspectives on deviance and crime, and how do they relate to social norms and control?",
                 "How do sociologists conduct research, and what are the main research methods they use?",
                 '''How is the "generalized other" different from the "looking glass self?"''',
                 ], inputs=[input_block])
    gr.Markdown(outro_text)
# Launch the interface
block.launch()
|