import os
import re
import csv
import json

import gradio as gr
import numpy as np
import openai
import pandas as pd
import requests
from datasets import load_dataset
from openai.embeddings_utils import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

# API credentials and the sentence-embedding model used for incoming questions
openai.api_key = os.environ.get("openai.api_key")
hf_token = os.environ.get("hugginface.api.token")
model_id = "sentence-transformers/all-MiniLM-L6-v2"

def generate_embeddings(texts, model_id, hf_token):
    """Embed text with the Hugging Face Inference API feature-extraction pipeline."""
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
    embeddings = response.json()
    return embeddings
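
# Each dataset below ships with pre-computed passage embeddings on the Hugging Face Hub.
# Every train split is loaded into a DataFrame with a placeholder "similarity" column
# that is recomputed for each incoming question.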
AP_Bio = load_dataset('vjain/biology_AP_embeddings')
df1 = pd.DataFrame(AP_Bio['train'])
df1["similarity"] = 0

AP_Physics = load_dataset('vjain/AP_physics_embeddings')
df2 = pd.DataFrame(AP_Physics['train'])
df2["similarity"] = 0

Personality = load_dataset('vjain/Personality_em')
df3 = pd.DataFrame(Personality['train'])
df3["similarity"] = 0

AP_statistics = load_dataset('vjain/AP_statistics')
df4 = pd.DataFrame(AP_statistics['train'])
df4["similarity"] = 0

tax_embeddings = load_dataset('vjain/tax_embeddings')
df5 = pd.DataFrame(tax_embeddings['train'])
df5["similarity"] = 0

therapy = load_dataset('vjain/therapy')
df6 = pd.DataFrame(therapy['train'])
df6["similarity"] = 0

gurbani = load_dataset('vjain/gurbani')
df7 = pd.DataFrame(gurbani['train'])
df7["similarity"] = 0


dataframes = {
    "AP_Bio": df1,
    "AP_Physics": df2,
    "Personality": df3,
    "AP_statistics": df4,
    "tax_embeddings": df5,
    "therapy": df6,
    "gurbani": df7
}

#df = pd.read_csv("TA_embeddings.csv")
#df["embedding"]=df["embedding"].apply(eval).apply(np.array)
def reply(input, dataset_name):
    try:
        if dataset_name not in dataframes:
            return "Invalid dataset selected. Please select a valid dataset."
        if not input:
            return "Please enter a question to get an answer."
        df = dataframes[dataset_name]
        # Embed the question and score every passage against it.
        input_vector = generate_embeddings(input, model_id, hf_token)
        df["similarity"] = df["embedding"].apply(lambda x: cosine_similarity([x], [input_vector])[0][0])
        data = df.sort_values("similarity", ascending=False).head(5)
        data.to_csv("sorted.csv")  # keep the top matches on disk for inspection
        # Join the five most similar passages into the prompt context.
        context = "\n".join(row["text"] for _, row in data.iterrows())
        prompt = f"""
                Answer the following question using the context given below. If you don't know the answer for certain, say I don't know.
                Context: {context}
                Q: {input}
                """
        response = openai.Completion.create(
                    prompt=prompt,
                    temperature=1,
                    max_tokens=500,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                    model="text-davinci-003"
                )["choices"][0]["text"].strip(" \n")
        return response
    except Exception as e:
        return f"An error occurred: {e}"

csv_dropdown = gr.inputs.Dropdown(
    label="Select the Book",
    choices=["AP_Bio", "AP_Physics", "Personality", "AP_statistics", "tax_embeddings", "therapy", "gurbani"],
    default="AP_Bio"
)
input_text = gr.inputs.Textbox(
    label="Enter your questions here",
    placeholder="E.g. What is DNA?",
    lines=3
)
text_output = gr.outputs.Textbox(label="Answer")

description = "Scholar Bot is a question answering system designed to provide accurate and relevant answers to questions from this book hosted by OpenStax https://openstax.org/details/books/biology-ap-courses. Simply enter your question in the text box above and Scholar Bot will use advanced natural language processing algorithms to search a large corpus of biology text to find the best answer for you. Scholar Bot uses the Sentence Transformers model to generate embeddings of text, and OpenAI's GPT-3 language model to provide answers to your questions."

ui = gr.Interface(fn=reply,
                  inputs=[input_text, csv_dropdown],
                  outputs=[text_output],
                  title="Scholar Bot",
                  description=description,
                  theme="light",
                  layout="vertical",
                  allow_flagging=False,
                  examples=[["What is the function of DNA polymerase?", "AP_Bio"]]
                )


ui.launch()