"""Scholar Bot: retrieval-augmented question answering over embedded books.

A question is embedded through the Hugging Face Inference API, matched against
pre-computed passage embeddings by cosine similarity, and the best passages are
handed to an OpenAI completion model as context.
"""

import csv
import json
import os
import re

import gradio as gr
import numpy as np
import openai
import pandas as pd
import requests
from datasets import load_dataset
from openai.embeddings_utils import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

# Credentials come from the environment. The variable names are reproduced
# exactly as the original code reads them (dots and spelling included).
openai.api_key = os.environ.get("openai.api_key")
hf_token = os.environ.get("hugginface.api.token")

# Sentence-embedding model used for the question vector.
model_id = "sentence-transformers/all-MiniLM-L6-v2"

# OpenAI retired text-davinci-003; gpt-3.5-turbo-instruct is the documented
# replacement and is served by the same Completions endpoint.
COMPLETION_MODEL = "gpt-3.5-turbo-instruct"

# Number of best-matching passages placed into the prompt.
TOP_K = 5

# Dropdown label -> Hugging Face dataset repository with the embedded passages.
DATASET_REPOS = {
    "AP_Bio": "vjain/biology_AP_embeddings",
    "AP_Physics": "vjain/AP_physics_embeddings",
    "Personality": "vjain/Personality_em",
    "AP_statistics": "vjain/AP_statistics",
    "tax_embeddings": "vjain/tax_embeddings",
    "therapy": "vjain/therapy",
    "gurbani": "vjain/gurbani",
}


def generate_embeddings(texts, model_id, hf_token, timeout=120):
    """Embed ``texts`` with the Hugging Face feature-extraction pipeline.

    Args:
        texts: A string or list of strings, sent as the request's ``inputs``.
        model_id: Model repository id appended to the pipeline URL.
        hf_token: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP response. The request sets
            ``wait_for_model``, so the default is generous.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the service answers with an ``{"error": ...}``
            payload or a body that is not JSON.
        requests.RequestException: On transport errors, timeouts, or a
            non-success HTTP status.
    """
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    try:
        embeddings = response.json()
    except ValueError:
        embeddings = None

    # Check the error payload before the status code: its message is far more
    # useful than a bare HTTP error.
    if isinstance(embeddings, dict) and "error" in embeddings:
        raise RuntimeError(f"Embedding request failed: {embeddings['error']}")
    response.raise_for_status()
    if embeddings is None:
        raise RuntimeError("Embedding request returned a non-JSON response")
    return embeddings


def _load_corpus(repo_id):
    """Load the ``train`` split of ``repo_id`` into a DataFrame."""
    frame = pd.DataFrame(load_dataset(repo_id)["train"])
    # Placeholder column from the original script. ``reply`` does not read it;
    # it is retained so the frames (and the sorted.csv dump) keep their columns.
    frame["similarity"] = 0
    return frame


# Every corpus is loaded once at start-up and treated as read-only afterwards.
dataframes = {name: _load_corpus(repo_id) for name, repo_id in DATASET_REPOS.items()}


def reply(input, dataset_name):
    """Answer ``input`` using the passages of the selected dataset.

    Args:
        input: The user's question.
        dataset_name: Key into ``dataframes`` selecting the corpus.

    Returns:
        The model's answer, or a human-readable message when the dataset name
        is unknown, the question is blank, or any step raises.
    """
    # Top-level UI boundary: every failure is reported as text rather than
    # raised, so the interface always has something to display.
    try:
        if dataset_name not in dataframes:
            return "Invalid dataset selected. Please select a valid dataset."
        if not input or not input.strip():
            return "Please Enter a Question to get an Answer"

        df = dataframes[dataset_name]
        input_vector = generate_embeddings(input, model_id, hf_token)

        # One matrix call instead of one sklearn call per row.
        query = np.asarray(input_vector, dtype=float).reshape(1, -1)
        corpus = np.asarray(df["embedding"].tolist(), dtype=float)
        scores = cosine_similarity(corpus, query).ravel()

        # Rank without writing into the shared frame: requests may overlap,
        # and each one needs its own scores.
        best = np.argsort(-scores, kind="stable")[:TOP_K]
        data = df.iloc[best].assign(similarities=scores[best])

        # NOTE(review): debug dump kept from the original; it is overwritten on
        # every request and nothing in this file reads it back.
        data.to_csv("sorted.csv")

        context = "\n".join(data["text"])
        prompt = (
            " Answer the following question using the context given below."
            "If you don't know the answer for certain, say I don't know."
            f" Context: {context} Q: {input} "
        )
        completion = openai.Completion.create(
            prompt=prompt,
            temperature=1,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            model=COMPLETION_MODEL,
        )
        return completion["choices"][0]["text"].strip(" \n")
    except Exception as e:
        return f"An error occurred: {e}"


# The selectable books are exactly the loaded corpora, in loading order.
csv_dropdown = gr.inputs.Dropdown(
    label="Select the Book",
    choices=list(dataframes),
    default="AP_Bio",
)
input_text = gr.inputs.Textbox(
    label="Enter your questions here",
    placeholder="E.g. What is DNA?",
    lines=3,
)
text_output = gr.outputs.Textbox(label="Answer")

description = (
    "Scholar Bot is a question answering system designed to provide accurate "
    "and relevant answers to questions from this book hosted by OpenStax "
    "https://openstax.org/details/books/biology-ap-courses. Simply enter your "
    "question in the text box above and Scholar Bot will use advanced natural "
    "language processing algorithms to search a large corpus of biology text "
    "to find the best answer for you. Scholar Bot uses the Sentence "
    "Transformers model to generate embeddings of text, and OpenAI's GPT-3 "
    "language model to provide answers to your questions."
)
# Bind the question box and book selector to `reply`, show the answer in the
# output textbox, then start serving the interface.
interface_settings = {
    "fn": reply,
    "inputs": [input_text, csv_dropdown],
    "outputs": [text_output],
    "title": "Scholar Bot",
    "description": description,
    "theme": "light",
    "layout": "vertical",
    "allow_flagging": False,
    # Pre-filled example row: a question plus the dataset it targets.
    "examples": [["What is the function of DNA polymerase?", "AP_Bio"]],
}
ui = gr.Interface(**interface_settings)
ui.launch()