File size: 4,024 Bytes
2980408
 
 
 
 
31ca135
5cd4f42
 
a9ea810
31ca135
9ddaca4
 
 
31ca135
9ddaca4
 
a9ea810
9ddaca4
 
 
 
 
 
7055aff
5cd4f42
 
86e6300
5073e2b
44e5665
8c2e21c
359755a
8c2e21c
c156bc5
 
8c2e21c
5fd9368
a9ea810
 
675e3c4
aadb451
81f8cdc
62a1daa
 
a9baa59
 
417ec9a
81f8cdc
 
 
 
 
 
 
 
 
 
 
 
6195c27
2980408
 
 
 
 
81f8cdc
2980408
 
 
 
 
 
 
5fd9368
f5e51e0
0f297dd
81f8cdc
 
aadb451
65e4312
2980408
 
cc21469
 
86e6300
1c7baa8
f2889c6
1c7baa8
f2889c6
 
 
c84f2f8
bd01a7f
 
cc21469
f2889c6
 
a657bab
3a2d3ef
2980408
8c2e21c
2980408
3a2d3ef
 
 
2980408
92012a0
f2889c6
92012a0
2980408
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# --- Standard library ---------------------------------------------------
import csv
import json
import os
import re

# --- Third-party ----------------------------------------------------------
import gradio as gr
import openai
import pandas as pd
import numpy as np
from sentence_transformers.util import semantic_search
from datasets import load_dataset
import requests
from sklearn.metrics.pairwise import cosine_similarity

# --- Runtime configuration ------------------------------------------------
# Credentials are read from the process environment; os.environ.get yields
# None when a variable is missing, so absence is not detected here.
openai.api_key = os.environ.get("openai.api_key")
hf_token = os.environ.get("hf_token")

# Sentence-embedding model requested from the hosted inference API.
model_id = "sentence-transformers/all-MiniLM-L6-v2"

def generate_embeddings(texts, model_id, hf_token, timeout=None):
    """Request embeddings for ``texts`` from the hosted feature-extraction API.

    Args:
        texts: Value sent as the ``inputs`` field of the JSON payload
            (a single string or a list of strings).
        model_id: Model identifier interpolated into the pipeline URL.
        hf_token: Token sent in the ``Authorization: Bearer`` header.
        timeout: Optional number of seconds passed to ``requests.post``.
            ``None`` (the default) keeps the previous behaviour of waiting
            without a limit.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    # "wait_for_model" asks the service to hold the request while the model
    # loads instead of answering immediately with an error.
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Without this check an error body would be returned to the caller as if
    # it were an embedding and only fail later, far from the real cause.
    response.raise_for_status()
    return response.json()
# Load each pre-computed embedding dataset and materialise its "train" split
# as a DataFrame. reply() reads the "text" and "embedding" columns of these
# frames -- assumes both datasets expose them; TODO confirm against the Hub.
AP_Bio = load_dataset('vjain/biology_AP_embeddings')
# Built exactly like the physics frame below so both entries of `dataframes`
# have the same shape of data.
df1 = pd.DataFrame(AP_Bio['train'])


AP_Physics = load_dataset('vjain/AP_physics_embeddings')
df2 = pd.DataFrame(AP_Physics['train'])
# Placeholder column; reply() stores its scores under "similarities" instead.
df2["similarity"] = 0

# Maps the dataset name chosen in the UI to the frame that reply() searches.
dataframes = {
    "AP_Bio": df1,
    "AP_Physics": df2
}

#df = pd.read_csv("TA_embeddings.csv")
#df["embedding"]=df["embedding"].apply(eval).apply(np.array)
def reply(input, dataset_name):
    """Answer a question from the passages most similar to it.

    The question is embedded with ``generate_embeddings``, compared by cosine
    similarity against the ``embedding`` column of the selected frame, and the
    ``text`` of the ten best rows is given to the OpenAI completion endpoint
    as context.

    Args:
        input: The user's question.
        dataset_name: Key into the module-level ``dataframes`` mapping.

    Returns:
        The model's answer, or a human-readable message when the arguments are
        invalid or any step fails. Errors are reported as text because this
        function is the UI callback.

    Side effects:
        Writes the ten selected rows to ``sorted.csv`` in the working
        directory on every successful ranking.
    """
    if dataset_name not in dataframes:
        return "Invalid dataset selected. Please select a valid dataset."
    # A whitespace-only question has nothing to embed, so treat it as empty.
    if not input or not input.strip():
        return "Please Enter a Question to get an Answer"

    try:
        df = dataframes[dataset_name]
        input_vector = generate_embeddings(input, model_id, hf_token)
        # The embedding service reports failures as a JSON object; surface
        # that instead of feeding it into the similarity computation.
        if isinstance(input_vector, dict):
            return f"An error occurred: {input_vector.get('error', input_vector)}"

        # One vectorised similarity call for all rows instead of one per row.
        query = np.atleast_2d(np.asarray(input_vector, dtype=float))
        passages = np.vstack(df["embedding"].tolist())
        scores = cosine_similarity(passages, query)[:, 0]

        # assign() works on a copy, so the shared module-level frame is not
        # mutated by individual requests.
        data = (
            df.assign(similarities=scores)
            .sort_values("similarities", ascending=False)
            .head(10)
        )
        data.to_csv("sorted.csv")

        context = "\n".join(data["text"])
        prompt = f"""
                Answer the following question using the context given below.If you don't know the answer for certain, say I don't know.
                Context: {context}

                Q: {input}

                """
        # NOTE(review): the legacy Completion endpoint and "text-davinci-003"
        # appear to have been retired upstream -- confirm against the pinned
        # openai package version before relying on this call.
        completion = openai.Completion.create(
            prompt=prompt,
            temperature=1,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            model="text-davinci-003"
        )
        return completion["choices"][0]["text"].strip(" \n")

    except Exception as e:
        # UI boundary: show the failure to the user rather than crash the app.
        return f"An error occurred: {e}"
        
    
    


# --- Gradio front end -------------------------------------------------------
# NOTE(review): gr.inputs / gr.outputs and the theme/layout arguments look like
# an older Gradio API -- confirm they match the pinned Gradio version.

# Dataset selector; its choices mirror the keys of `dataframes`.
csv_dropdown = gr.inputs.Dropdown(
    choices=["AP_Bio", "AP_Physics"],
    default="AP_Bio",
    label="Select the Book",
)

# Free-text question box.
input_text = gr.inputs.Textbox(
    lines=3,
    placeholder="E.g. What is DNA?",
    label="Enter your questions here",
)

# Where the value returned by reply() is shown.
text_output = gr.outputs.Textbox(label="Answer")

# NOTE(review): the text below only mentions the biology book although
# "AP_Physics" is also selectable -- confirm whether it should cover both.
description = (
    "Scholar Bot is a question answering system designed to provide accurate and relevant answers to questions from this book hosted by OpenStax https://openstax.org/details/books/biology-ap-courses. "
    "Simply enter your question in the text box above and Scholar Bot will use advanced natural language processing algorithms to search a large corpus of biology text to find the best answer for you. "
    "Scholar Bot uses the Sentence Transformers model to generate embeddings of text, and OpenAI's GPT-3 language model to provide answers to your questions."
)

# Inputs are listed in the order of reply()'s parameters: question, dataset.
ui = gr.Interface(
    fn=reply,
    inputs=[input_text, csv_dropdown],
    outputs=[text_output],
    examples=[["What is the function of DNA polymerase?", "AP_Bio"]],
    title="Scholar Bot",
    description=description,
    theme="light",
    layout="vertical",
    allow_flagging=False,
)

ui.launch()