import pandas as pd
from scipy import spatial

import tiktoken
from openai import OpenAI
import gradio as gr
from tenacity import retry, stop_after_attempt, wait_random_exponential



# Precomputed table of textbook chunks: one row per chunk, with a "text"
# column and an "embedding" column holding its embedding vector.
df = pd.read_json('rw7.json')

GPT_MODEL = 'gpt-3.5-turbo'
EMBEDDING_MODEL = "text-embedding-ada-002"
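# The table loaded above is assumed to have been built by chunking the textbook
# and embedding each chunk -- the preprocessing described under "Behind the
# Scenes" below. A minimal sketch of that step, kept here for reference only:
# the function name and fixed-width chunking are assumptions, not the actual
# pipeline that produced rw7.json, and nothing in this app calls it.
def build_embeddings_table(pages: list[str], chunk_size: int = 1000) -> pd.DataFrame:
    """Split page texts into chunks and embed each chunk (illustrative sketch)."""
    client = OpenAI()
    chunks = [
        page[i:i + chunk_size]
        for page in pages
        for i in range(0, len(page), chunk_size)
    ]
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=chunks)
    return pd.DataFrame({
        "text": chunks,
        "embedding": [item.embedding for item in response.data],
    })
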
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def ask_naive(query):
    """Generate a quick, context-free answer used only to guide retrieval."""
    messages = [
        {"role": "system", "content": "You are a college sociology professor. Provide a very brief answer to this student question."},
        {"role": "user", "content": query},
    ]

    client = OpenAI()
    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=messages,
    )

    return response.choices[0].message.content

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> list[str]:
    """Return the top_n chunk texts, sorted from most to least related to the query."""
    client = OpenAI()

    # Embed the query with the same model used for the textbook chunks.
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Score every chunk by similarity to the query embedding.
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    return [text for text, _ in strings_and_relatednesses[:top_n]]
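
# The row-by-row scoring above is fine for a table this size; for a larger
# corpus, a vectorized comparison is much faster. A minimal sketch, assuming
# the "embedding" column holds equal-length lists of floats. The name and
# signature are assumptions, and nothing in this app calls it:
def strings_ranked_by_relatedness_vectorized(query_embedding, df, top_n=100):
    import numpy as np  # local import: only this optional sketch needs numpy

    matrix = np.array(df["embedding"].tolist())  # shape: (n_chunks, dim)
    q = np.array(query_embedding)                # shape: (dim,)
    # Cosine similarity of every chunk against the query in one pass.
    sims = (matrix @ q) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q))
    top_idx = np.argsort(-sims)[:top_n]
    return df["text"].iloc[top_idx].tolist()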

def num_tokens(text: str) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(GPT_MODEL)
    return len(encoding.encode(text))

def build_resources(pseudo_answer):
    """Gather related textbook selections until the token budget is reached."""
    related_book_selections = strings_ranked_by_relatedness(pseudo_answer, df, top_n=15)
    message = 'Real World Sociology selections:\n'
    for selection in related_book_selections:
        # Stop adding selections once the prompt would exceed ~3,000 tokens.
        if num_tokens(message + selection) > 3000:
            break
        message += '\n' + selection
    return message
    
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def respond(question, textbook_samples):
    """Answer the question with GPT, grounded in the retrieved textbook selections."""
    messages = [
        {"role": "system", "content": "You are a college professor who excels at explaining topics to students and is known for dad jokes and puns. Start with a direct answer to the question. Then give a definition/overview of the concept's essence; break it down into understandable pieces; use clear language and structure. Always use examples related to the life of a college student. Where appropriate, provide connections and comparisons to related terms."},
        {"role": "user", "content": f"""Use markdown and emphasize important phrases in bold. Respond to the following question: {question}

        When constructing the answer, use the following information from the textbook.
        {textbook_samples}
        """},
    ]

    client = OpenAI()
    response = client.chat.completions.create(
        model=GPT_MODEL,
        n=1,
        messages=messages,
    )
    return response.choices[0].message.content
        
def ask(query):
    """Full pipeline: draft a quick answer, retrieve related text, then respond."""
    # Retrieval is keyed on a draft answer rather than the raw question, since
    # an answer-shaped string tends to sit closer to textbook passages in
    # embedding space.
    pseudo_answer = ask_naive(query)
    resources = build_resources(pseudo_answer)
    response = respond(query, resources)

    return response
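
# Example usage (assumes rw7.json is present and OPENAI_API_KEY is set):
#   print(ask("What is the looking-glass self?"))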



intro_text = '''
This app responds to your questions by looking up the most relevant selections from the textbook and asking ChatGPT to respond based on those selections.

Enter your question in the grey box below and click "Ask the textbook." It can take up to 30 seconds to respond.
'''

outro_text = '''
**Caveats:** Like all apps that employ large language models, this one has the possibility for bias and confabulation.

**Behind the Scenes**

This app uses a large language model (ChatGPT 3.5) and sentence embeddings (text-embedding-ada-002) to craft the response using what's called a retrieval-augmented generation process. Behind the scenes, it involves the following steps:

1. Each textbook page is broken down into small chunks of text.
2. A machine learning system converts each chunk of text into a mathematical representation called a vector. All these vectors get saved in a table.
3. ChatGPT is used to generate a sample answer to the question.
4. The sample answer is converted into a vector using the same method.
5. The vector for the sample answer is compared to all the vectors for the textbook chunks. The chunks whose vectors are most like the sample answer vector are identified. These chunks are likely to be relevant to answering the question.
6. The original question, along with the relevant textbook chunks that were found, is given to ChatGPT. ChatGPT is instructed to read the textbook chunks first and use them to help answer the question in its own words.

In summary:
- Text is converted to math vectors.
- Textbook vectors similar to a sample answer vector are found.
- The question and the similar textbook chunks are given to ChatGPT, which answers using those chunks.

This process allows the AI system to search the textbook, find relevant information, and use it to generate a better answer to the question!
'''



block = gr.Blocks(theme='bethecloud/storj_theme')

with block:
    gr.Markdown("# Ask the Sociology 101 Textbook")
    gr.Image("https://huggingface.co/spaces/NealCaren/Ask101/resolve/main/rw_cover.jpg")
    gr.Markdown(intro_text)
        
    # Define the input and output blocks
    input_block = gr.Textbox(label='Question')
    research_btn = gr.Button(value="Ask the textbook")
    output_block = gr.Markdown(label="Response")
    research_btn.click(ask, inputs=input_block, outputs=output_block)
    gr.Examples(["What is the difference beween organic and mechnical solidarity?", 
                 "What are the main perspectives on deviance and crime, and how do they relate to social norms and control?",
                 "How do sociologists conduct research, and what are the main research methods they use?",
                 '''How is the "generalized other" different from the "looking glass self?"''',
                 
                 ], inputs=[input_block])
    gr.Markdown(outro_text)



# Launch the interface
block.launch()