# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
# libraries
"""
import gradio as gr
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
"""# data - text"""
splitted_df = pd.read_csv('splitted_df_jo.csv')
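# The corpus is expected to provide a "section_text" column, which is the
# only column this app reads.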
"""# getting context"""
def remove_symbols(text):
    # Strip characters that add noise to the displayed context
    remove_chars = "/()\n."
    cleaned_text = "".join([char for char in text if char not in remove_chars])
    return cleaned_text
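# Illustrative behaviour (assumed example, not from the original data):
#   remove_symbols("HPV/DNA test (optional).") -> "HPVDNA test optional"
# Removed characters are dropped rather than replaced with spaces, so words
# separated only by a newline end up concatenated.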
def context_func(message):
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the corpus sections and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Get the most similar section; argmax is positional, so use .iloc
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)
    return most_similar_context
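# Note: the vectorizer is refit on every call. A minimal sketch of fitting it
# once at import time instead (illustrative names, not part of the app):
#
#   _vectorizer = TfidfVectorizer()
#   _text_tfidf = _vectorizer.fit_transform(splitted_df["section_text"])
#
# context_func would then only need _vectorizer.transform([message]) per call.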
# Load the tokenizer and model once at import time so every request reuses them
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

def answer_question(question):
    context = context_func(question)

    # Tokenize the question together with the retrieved context
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Get the most likely answer span from the model
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
    return answer, context
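# Illustrative call (assumes splitted_df_jo.csv and the model are available):
#   answer, context = answer_question("What is breast cancer?")
#   print(answer)  # span extracted from the most similar corpus section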
def main():
    """
    Initializes a Women Cancer ChatBot interface using a Hugging Face model
    for question answering.

    Creates a Gradio interface around answer_question: users input questions
    related to women's cancer topics, and the ChatBot answers them based on
    the most relevant context retrieved from the corpus.

    Returns:
        None

    Example:
        >>> main()
    """
    iface = gr.Interface(fn=answer_question,
                         inputs=[gr.Textbox(label="Question")],
                         outputs=[gr.Textbox(label="Answer"),
                                  gr.Textbox(label="Context")],
                         title="Women Cancer ChatBot",
                         description="How can I help you?",
                         examples=[
                             ["What is breast cancer?"],
                             ["What are treatments for cervical cancer?"]
                         ])
    iface.launch(debug=True, share=True)
if __name__ == "__main__":
main()