""" | |
This program demonstrates how openAI's ChatGPT language model can be used to answer questions in specific domain areas. | |
The program asks a user for a question in a prescribed domain area. The program then compares the user's query against | |
pre-loaded domain content to identify the most useful sections of content. The program answers the question by leveraging | |
ChatGPT's powerful general capabilities with the newly incorporated domain knowledge. Such an approach might be used, | |
for example, to provide a customized chat box for an insurance company's customers, where the company's policy materials | |
are brought in as domain content. For this example, I compiled the 2023 investment outlook summaries posted on the websites of | |
Morgan Stanley (https://www.morganstanley.com/ideas/global-investment-strategy-outlook-2023), | |
JPMorgan (https://www.jpmorgan.com/insights/research/market-outlook) and | |
Goldman Sachs (https://www.goldmansachs.com/insights/pages/gs-research/macro-outlook-2023-this-cycle-is-different/report.pdf). | |
Far more robust domain-specific responses are possible with further customization/retraining of ChatGPT. | |
""" | |
################################# LOAD LIBRARIES/IMPORTS #########################################
# !pip install openai
# !pip install transformers
# !pip install gradio
# !pip install PyPDF2
# !pip install python-docx
# !pip install pandas
import docx
import pandas as pd
import numpy as np
import openai
import gradio as gr
import pickle
import os
from transformers import GPT2TokenizerFast
# import openai_secret_manager
################################# VARIABLES #########################################
USE_INTERFACE = True  # Change to False to run the code without the Gradio interface and instead answer a single pre-supplied question
filepath = '2023_investment_outlook.docx'
# Path to the document containing domain content. Initial cleaning of domain content
# can be done inside (eg, using Python) or outside (eg, using Word) this program,
# depending on needs and circumstances.
# emb_filepath = 'PATH HERE' # Path to document containing saved content embeddings, if applicable
COMPLETIONS_MODEL = "text-davinci-003"
# Get the value of the confidential OpenAI API key; register at OpenAI for keys
openai.api_key = os.environ["API-KEY"]
MODEL_NAME = "curie"
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"
MAX_SECTION_LEN = 1100  # Token budget for the domain content packed into the prompt. The API limits total tokens -- question, domain content and answer combined -- so this keeps the full exchange to roughly 2048 tokens, or about 1500 words.
SEPARATOR = "\n* "  # Inserted between the different content sections included in the prompt
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))
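# Quick sanity check of the separator's token cost (a sketch; with GPT-2's BPE
# tokenizer this separator comes out to 3 tokens):
#   print(separator_len)  # -> 3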
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
################################# FUNCTIONS #########################################
def load_text(filepath):
    """
    Loads a Microsoft Word document and returns a DataFrame containing the text of each paragraph in the document.
    Input:
        filepath (str): the filepath to the Microsoft Word document.
    Returns:
        df (pandas.DataFrame): a DataFrame containing the 'content' column with the text of each paragraph in the document.
    """
    # Open the Word document
    doc = docx.Document(filepath)
    # Create an empty pandas DataFrame
    df = pd.DataFrame()
    # Iterate through the paragraphs in the document and add each to the df
    for i, p in enumerate(doc.paragraphs):
        # Add the paragraph text to the DataFrame
        df.loc[i, 'content'] = p.text
        # df.loc[i, 'paragraph_index'] = i
    # Delete empty paragraphs
    df['content'] = df['content'].replace('', np.nan)
    df = df.dropna(axis=0, subset=['content']).reset_index(drop=True)
    return df
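# Illustrative usage (assumes the .docx file exists at the given path):
#   df = load_text('2023_investment_outlook.docx')
#   df.head()  # -> one non-empty paragraph of text per row, in a 'content' column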
def count_tokens(row):
    """Count the number of tokens in a string."""
    return len(tokenizer.encode(row))
def truncate_text(df):
    """
    Truncates the text in the 'content' column of the input DataFrame if the number of tokens
    in the text exceeds 590. It resets the truncated text and the resulting token count in the
    'content' and 'tokens' columns, respectively.
    Input:
        df (pandas.DataFrame): a DataFrame containing 'content' and 'tokens' columns
    Returns:
        df (pandas.DataFrame): the input DataFrame with modified 'content' and 'tokens' columns.
    """
    for i in range(len(df)):
        if df['tokens'][i] > 590:
            text = df['content'][i]
            tokens = tokenizer.encode(text)
            truncated_tokens = tokens[:590]
            truncated_text = tokenizer.decode(truncated_tokens)
            df.at[i, 'content'] = truncated_text
            df.at[i, 'tokens'] = len(truncated_tokens)
    return df
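# Illustrative effect: a paragraph of, say, 800 tokens is cut back to its first 590
# tokens (with 'tokens' reset to 590); shorter paragraphs pass through unchanged.
# The 590 cap keeps any single section well under MAX_SECTION_LEN, so several
# sections can still fit in one prompt.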
def get_embedding(text, model):
    """
    Generates an embedding for the given text using the specified OpenAI model.
    Args:
        text (str): The text for which to generate an embedding.
        model (str): The name of the OpenAI model to use for generating the embedding.
    Returns:
        list[float]: The embedding for the given text.
    """
    result = openai.Embedding.create(
        model=model,
        input=[text]
    )
    return result["data"][0]["embedding"]
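# Rough shape of the API response this function unpacks (values illustrative;
# the curie search models used here return 4096-dimensional vectors):
#   {"data": [{"embedding": [0.0023, -0.0091, ...], "index": 0, ...}], ...}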
def get_doc_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI document embeddings model.
    Args:
        text (str): The text for which to generate an embedding.
    Returns:
        list[float]: The embedding for the given text.
    """
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)
def get_query_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI query embeddings model.
    Args:
        text (str): The text for which to generate an embedding.
    Returns:
        list[float]: The embedding for the given text.
    """
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)
def compute_doc_embeddings(df):
    """
    Generate embeddings for each row in a pandas DataFrame using the OpenAI document embeddings model.
    Args:
        df (pandas.DataFrame): The DataFrame for which to generate embeddings.
    Returns:
        dict: A dictionary that maps each row index to the embedding vector for that row's content.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()  # r here refers to each row
    }
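# Illustrative result for a two-row DataFrame (values made up):
#   {0: [0.0023, -0.0091, ...], 1: [-0.0141, 0.0052, ...]}  # row index -> embedding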
def load_embeddings(fname):
    """
    Load document embeddings and their keys from a CSV file. Only used if embeddings are pre-loaded.
    Args:
        fname (str): The path to the CSV file. The file must have exactly these named columns:
            "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    Returns:
        dict: A dictionary that maps (title, heading) tuples to their embedding vectors.
    """
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
        (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }
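# Expected CSV layout (an illustrative sketch; one row per document section):
#   title,heading,0,1,...,4095
#   "Goldman Sachs","Inflation",0.0023,-0.0091,...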
def vector_similarity(x, y):
    """
    Calculate the similarity between two vectors using the dot product.
    Args:
        x (iterable): The first vector.
        y (iterable): The second vector.
    Returns:
        float: The dot product of the two vectors.
    """
    return np.dot(np.array(x), np.array(y))
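# OpenAI embeddings are normalized to length 1, so this dot product is
# equivalent to cosine similarity. Illustrative values:
#   vector_similarity([1.0, 0.0], [0.6, 0.8])  # -> 0.6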
def order_document_sections_by_query_similarity(query, contexts):
    """
    Find the query embedding for the given query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.
    Args:
        query (str): The query for which to find relevant document sections.
        contexts (dict): A dictionary mapping section indices to document embeddings.
    Returns:
        list: A list of tuples, each containing the similarity score and index of a document section, sorted in descending
            order of relevance.
    """
    query_embedding = get_query_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    return document_similarities
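# Illustrative return value (scores and indices made up):
#   [(0.412, 17), (0.398, 4), (0.371, 22), ...]  # best-matching section first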
def construct_prompt(question, context_embeddings, df):
    """
    Construct a prompt for answering a question using the most relevant document sections.
    Args:
        question (str): The question to answer.
        context_embeddings (dict): A dictionary mapping section indices to document embeddings.
        df (pandas.DataFrame): A DataFrame containing the document sections.
    Returns:
        str: The prompt, including the question and the relevant context.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
    # # Useful diagnostic information -- FOR TESTING PURPOSES
    # print(f"Selected {len(chosen_sections)} document sections:")
    # print("\n".join(chosen_sections_indexes))
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Sorry, I don't know."\n\nContext:\n"""
    full_prompt = header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
    # print(full_prompt) # FOR TESTING PURPOSES
    return full_prompt
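# Rough shape of the assembled prompt (context abridged, content illustrative):
#   Answer the question as truthfully as possible using the provided context, ...
#
#   Context:
#   * Goldman Sachs expects core inflation to ...
#   * Morgan Stanley sees ...
#
#    Q: What is the outlook for inflation?
#    A: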
def answer_query_with_context(
        query,
        df,
        document_embeddings,
        show_prompt: bool = False):
    """
    Answer a query using relevant context from a DataFrame.
    Args:
        query (str): The query to answer.
        df (pandas.DataFrame): A DataFrame containing the document sections.
        document_embeddings (dict): A dictionary mapping section indices to document embeddings.
        show_prompt (bool, optional): If `True`, print the prompt before generating a response.
    Returns:
        str: The generated response to the query.
    """
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)
    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )
    return response["choices"][0]["text"].strip(" \n")
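# Illustrative usage (the answer shown is hypothetical):
#   answer_query_with_context("What about the bond market?", df, document_embeddings)
#   # -> either an answer grounded in the supplied context, or "Sorry, I don't know."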
######################### MAIN PROGRAM #########################################
# Load the text into a dataframe
df = load_text(filepath)
# print(df.head()) # FOR TESTING PURPOSES
# Count the tokens
df = df.copy()
df['tokens'] = df['content'].apply(count_tokens)
# print(df.head(10)) # FOR TESTING PURPOSES
# print(df['content'][3]) # FOR TESTING PURPOSES
# Call the truncate_text function on the dataframe
df = df.copy()
df = truncate_text(df)
# print(df.head(10)) # FOR TESTING PURPOSES
# print(df['content'][3]) # FOR TESTING PURPOSES
# Use code below only if importing embeddings from file, rather than creating in real time through OpenAI API
# document_embeddings = load_embeddings(emb_filepath)
# Use code below if calculating the embeddings in real time via OpenAI API
document_embeddings = compute_doc_embeddings(df[:33])  # Can limit size (eg, df[:10]) if you run into the limit on free-of-charge usage
# Each embedding has 4096 dimensions -- FOR TESTING ONLY
# example_entry = list(document_embeddings.items())[4]
# print(example_entry)
# print("Length of example embedding = ", len(example_entry[1]))
if USE_INTERFACE:
    demo = gr.Interface(
        fn=lambda query: answer_query_with_context(query, df, document_embeddings),
        inputs=gr.Textbox(lines=2, label="Query", placeholder="Type Question Here..."),
        outputs=gr.Textbox(lines=2, label="Answer"),
        description="Example of a domain-specific chatbot, using ChatGPT with supplemental content added.<br>\
            Here, the content relates to the investment outlook for 2023, according to Morgan Stanley, JPMorgan and Goldman Sachs.<br>\
            Sample queries: What is Goldman's outlook for inflation? What about the bond market? What does JPMorgan think about 2023?<br>\
            NOTE: High-level demo only. Supplemental content used here is limited to about 30 paragraphs, due to limits on free-of-charge usage of ChatGPT.<br>\
            More robust domain-specific responses are possible.",
        title="Domain-Specific Chatbot",)
    # Launch the interface
    demo.launch()
else:
    prompt = construct_prompt(
        'What is the outlook for inflation?',
        document_embeddings,
        df
    )
    # print("===\n", prompt) # FOR TESTING ONLY
    print(answer_query_with_context("What is Goldman's outlook for inflation?", df, document_embeddings))