from selfcheckgpt.modeling_selfcheck import SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
import torch
import spacy
import os
import gradio as gr
from openai import AzureOpenAI

# Load the English language model used to split passages into sentences
nlp = spacy.load("en_core_web_sm")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device)  # runs on GPU if one is available
selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)
selfcheck_ngram = SelfCheckNgram(n=1)  # n=1 means unigram, n=2 means bigram, etc.
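# SelfCheckGPT's core idea: sample several independent responses to the same
# prompt and measure how consistent the main response is with them. Facts the
# model reliably knows tend to reappear across samples, while hallucinated
# details vary, so low consistency is a signal of hallucination.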
# Azure OpenAI configuration, read from the environment
openai_key = os.getenv("OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
api_url = os.getenv("OPENAI_API_RESOURCEURL")

client = AzureOpenAI(
    api_key=openai_key,
    api_version=api_version,
    azure_endpoint=api_url,
)

# The custom name chosen for the deployment when the model was deployed.
# It must be a chat-capable deployment, since the calls below use the chat
# completions API.
deployment_name = os.getenv("model_name")
def generate_response(prompt):
    """Generate the main response deterministically (temperature 0)."""
    response = client.chat.completions.create(
        model=deployment_name,  # Azure expects the deployment name here
        temperature=0.0,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content
def generate_response_high_temp(prompt):
    """Generate a stochastic sample (temperature 1) to use as consistency evidence."""
    response = client.chat.completions.create(
        model=deployment_name,
        temperature=1.0,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content
def create_dataset(prompt):
    """Draw three high-temperature samples for the same prompt."""
    s1 = generate_response_high_temp(prompt)
    s2 = generate_response_high_temp(prompt)
    s3 = generate_response_high_temp(prompt)
    return s1, s2, s3
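# Three samples keep the demo responsive; SelfCheckGPT generally becomes more
# reliable as the number of samples grows, at the cost of extra API calls.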
def split_sent(sentence):
    """Split a passage into individual sentences with spaCy."""
    return [sent.text.strip() for sent in nlp(sentence).sents]
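# SelfCheckNLI scores each sentence with an entailment model: the score is
# (roughly) the probability that the sampled passages contradict the sentence,
# so it lies in [0, 1] and higher means more likely hallucination. The 0.35
# cutoff below is this demo's heuristic, not a value from the paper.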
def func_selfcheck_nli(sentence, s1, s2, s3):
    sentence1 = [sentence]  # predict() expects a list of sentences
    sample_dataset = [s1, s2, s3]
    score = selfcheck_nli.predict(
        sentences=sentence1,               # list of sentences
        sampled_passages=sample_dataset,   # list of sampled passages
    )[0]  # one score per input sentence; we passed a single sentence
    if score > 0.35:
        return f"The LLM is hallucinating, with a SelfCheck NLI score of {score}"
    else:
        return f"The LLM is generating true information, with a SelfCheck NLI score of {score}"
def func_selfcheckbert(sentence, s1, s2, s3):
    sentence1 = [sentence]  # predict() expects a list of sentences
    sample_dataset = [s1, s2, s3]
    sent_scores_bertscore = selfcheck_bertscore.predict(
        sentences=sentence1,
        sampled_passages=sample_dataset,
    )[0]
    if sent_scores_bertscore > 0.6:
        return f"The LLM is hallucinating, with a SelfCheck BERTScore of {sent_scores_bertscore}"
    else:
        return f"The LLM is generating true information, with a SelfCheck BERTScore of {sent_scores_bertscore}"
def func_selfcheckngram(sentence, s1, s2, s3):
    sample_dataset = [s1, s2, s3]
    sentences_split = split_sent(sentence)
    sent_scores_ngram = selfcheck_ngram.predict(
        sentences=sentences_split,
        passage=sentence,
        sampled_passages=sample_dataset,
    )
    avg_max_neg_logprob = sent_scores_ngram['doc_level']['avg_max_neg_logprob']
    if avg_max_neg_logprob > 6:
        return f"The LLM is hallucinating, with a SelfCheck n-gram score of {avg_max_neg_logprob}"
    else:
        return f"The LLM is generating true information, with a SelfCheck n-gram score of {avg_max_neg_logprob}"
def generating_samples(prompt):
    prompt_template = f"This is a Wikipedia passage on the topic of '{prompt}' in 100 words"
    sample_response = generate_response(prompt_template)
    s1, s2, s3 = create_dataset(prompt_template)
    # Return plain strings (one per Textbox); wrapping the response in a list
    # here would make Gradio display its repr, e.g. "['...']".
    return sample_response, s1, s2, s3
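# Gradio UI: one button generates the main response plus three samples; each
# "self check" button then scores the response against those samples.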
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1> LLM Hackathon: LLM Hallucination Detector </h1>
        """)
    with gr.Column():
        prompt = gr.Textbox(label="prompt")
    with gr.Column():
        sentence = gr.Textbox(label="response")
    with gr.Row():
        s1 = gr.Textbox(label="sample1")
        s2 = gr.Textbox(label="sample2")
        s3 = gr.Textbox(label="sample3")
    with gr.Column():
        score = gr.Textbox(label="output")

    output_response = gr.Button("Generate response")
    output_response.click(
        fn=generating_samples,
        inputs=prompt,
        outputs=[sentence, s1, s2, s3]
    )

    with gr.Row(equal_height=True):
        self_check_nli_button = gr.Button("self check nli")
        self_check_nli_button.click(
            fn=func_selfcheck_nli,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )
        selfcheckbert_button = gr.Button("self check Bert")
        selfcheckbert_button.click(
            fn=func_selfcheckbert,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )
        self_check_ngram_button = gr.Button("self check ngram")
        self_check_ngram_button.click(
            fn=func_selfcheckngram,
            inputs=[sentence, s1, s2, s3],
            outputs=score
        )

demo.launch()