import pandas as pd
from scipy import spatial

import tiktoken
from openai import OpenAI
import gradio as gr
from tenacity import retry, stop_after_attempt, wait_random_exponential



# Precomputed table of textbook chunks: one row per chunk, with a "text"
# column and an "embedding" column holding its embedding vector.
df = pd.read_json('rw7.json')

GPT_MODEL = 'gpt-3.5-turbo'
EMBEDDING_MODEL = "text-embedding-ada-002"
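# The table loaded above is assumed to have been built by chunking the textbook
# and embedding each chunk -- the preprocessing described under "Behind the
# Scenes" below. A minimal sketch of that step, kept here for reference only:
# the function name and fixed-width chunking are assumptions, not the actual
# pipeline that produced rw7.json, and nothing in this app calls it.
def build_embeddings_table(pages: list[str], chunk_size: int = 1000) -> pd.DataFrame:
    """Split page texts into chunks and embed each chunk (illustrative sketch)."""
    client = OpenAI()
    chunks = [
        page[i:i + chunk_size]
        for page in pages
        for i in range(0, len(page), chunk_size)
    ]
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=chunks)
    return pd.DataFrame({
        "text": chunks,
        "embedding": [item.embedding for item in response.data],
    })
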
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def ask_naive(query):
    """Generate a quick, context-free answer used only to guide retrieval."""
    messages = [
        {"role": "system", "content": "You are a college sociology professor. Provide a very brief answer to this student question."},
        {"role": "user", "content": query},
    ]

    client = OpenAI()
    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=messages,
    )

    return response.choices[0].message.content

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> list[str]:
    """Return the top_n chunk texts, sorted from most to least related to the query."""
    client = OpenAI()

    # Embed the query with the same model used for the textbook chunks.
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Score every chunk by similarity to the query embedding.
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    return [text for text, _ in strings_and_relatednesses[:top_n]]
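
# The row-by-row scoring above is fine for a table this size; for a larger
# corpus, a vectorized comparison is much faster. A minimal sketch, assuming
# the "embedding" column holds equal-length lists of floats. The name and
# signature are assumptions, and nothing in this app calls it:
def strings_ranked_by_relatedness_vectorized(query_embedding, df, top_n=100):
    import numpy as np  # local import: only this optional sketch needs numpy

    matrix = np.array(df["embedding"].tolist())  # shape: (n_chunks, dim)
    q = np.array(query_embedding)                # shape: (dim,)
    # Cosine similarity of every chunk against the query in one pass.
    sims = (matrix @ q) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q))
    top_idx = np.argsort(-sims)[:top_n]
    return df["text"].iloc[top_idx].tolist()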

def num_tokens(text: str) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(GPT_MODEL)
    return len(encoding.encode(text))

def build_resources(pseudo_answer):
    """Gather related textbook selections until the token budget is reached."""
    related_book_selections = strings_ranked_by_relatedness(pseudo_answer, df, top_n=15)
    message = 'Real World Sociology selections:\n'
    for selection in related_book_selections:
        # Stop adding selections once the prompt would exceed ~3,000 tokens.
        if num_tokens(message + selection) > 3000:
            break
        message += '\n' + selection
    return message
    
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def respond(question, textbook_samples):
    """Answer the question with GPT, grounded in the retrieved textbook selections."""
    messages = [
        {"role": "system", "content": "You are a college professor who excels at explaining topics to students and is known for dad jokes and puns. Start with a direct answer to the question. Then give a definition/overview of the concept's essence; break it down into understandable pieces; use clear language and structure. Always use examples related to the life of a college student. Where appropriate, provide connections and comparisons to related terms."},
        {"role": "user", "content": f"""Use markdown and emphasize important phrases in bold. Respond to the following question: {question}

        When constructing the answer, use the following information from the textbook.
        {textbook_samples}
        """},
    ]

    client = OpenAI()
    response = client.chat.completions.create(
        model=GPT_MODEL,
        n=1,
        messages=messages,
    )
    return response.choices[0].message.content
        
def ask(query):
    """Full pipeline: draft a quick answer, retrieve related text, then respond."""
    # Retrieval is keyed on a draft answer rather than the raw question, since
    # an answer-shaped string tends to sit closer to textbook passages in
    # embedding space.
    pseudo_answer = ask_naive(query)
    resources = build_resources(pseudo_answer)
    response = respond(query, resources)

    return response
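
# Example usage (assumes rw7.json is present and OPENAI_API_KEY is set):
#   print(ask("What is the looking-glass self?"))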



intro_text = '''
This app responds to your questions by looking up the most relevant selections from the textbook and asking ChatGPT to respond based on those selections.

Enter your question in the grey box below and click "Ask the textbook." It can take up to 30 seconds to respond.
'''

outro_text = '''
**Caveats:** Like all apps that employ large language models, this one has the possibility for bias and confabulation.

**Behind the Scenes**

This app uses a large language model (ChatGPT 3.5) and sentence embeddings (text-embedding-ada-002) to craft the response using what's called a retrieval-augmented generation process. Behind the scenes, it involves the following steps:

1. Each textbook page is broken down into small chunks of text.
2. A machine learning system converts each chunk of text into a mathematical representation called a vector. All these vectors get saved in a table.
3. ChatGPT is used to generate a sample answer to the question.
4. The sample answer is converted into a vector using the same method.
5. The vector for the sample answer is compared to all the vectors for the textbook chunks. The chunks whose vectors are most like the sample answer vector are identified. These chunks are likely to be relevant to answering the question.
6. The original question, along with the relevant textbook chunks that were found, is given to ChatGPT. ChatGPT is instructed to read the textbook chunks first and use them to help answer the question in its own words.

In summary:
- Text is converted to math vectors.
- Textbook vectors similar to a sample answer vector are found.
- The question and the similar textbook chunks are given to ChatGPT, which answers using those chunks.

This process allows the AI system to search the textbook, find relevant information, and use it to generate a better answer to the question!
'''



block = gr.Blocks(theme='bethecloud/storj_theme')

with block:
    gr.Markdown("# Ask the Sociology 101 Textbook")
    gr.Image("https://huggingface.co/spaces/NealCaren/Ask101/resolve/main/rw_cover.jpg")
    gr.Markdown(intro_text)
        
    # Define the input and output blocks
    input_block = gr.Textbox(label='Question')
    research_btn = gr.Button(value="Ask the textbook")
    output_block = gr.Markdown(label="Response")
    research_btn.click(ask, inputs=input_block, outputs=output_block)
    gr.Examples(["What is the difference beween organic and mechnical solidarity?", 
                 "What are the main perspectives on deviance and crime, and how do they relate to social norms and control?",
                 "How do sociologists conduct research, and what are the main research methods they use?",
                 '''How is the "generalized other" different from the "looking glass self?"''',
                 
                 ], inputs=[input_block])
    gr.Markdown(outro_text)



# Launch the interface
block.launch()