# Hugging Face Spaces page status at capture time: Runtime error
# -*- coding: utf-8 -*- | |
"""gradio_sindi.ipynb | |
Automatically generated by Colab. | |
Original file is located at | |
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly | |
# libraries | |
""" | |
import gradio as gr | |
import torch | |
from transformers import pipeline | |
import numpy as np | |
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from transformers import AutoTokenizer, AutoModelForQuestionAnswering | |
#import re | |
"""# data - text""" | |
# Corpus loaded once at import time; context_func reads its "section_text"
# column to pick the passage most similar to the user's question.
splitted_df = pd.read_csv('splitted_df_jo.csv')
"""# getting context""" | |
def remove_symbols(text):
    """Strip slashes, parentheses, newlines and periods from *text*.

    Every other character (including non-ASCII ones) is kept unchanged and
    in its original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without any of the characters ``/ ( ) \\n .``.
    """
    unwanted = "/" + "(" + ")" + "\n" + "."
    kept = filter(lambda symbol: symbol not in unwanted, text)
    return "".join(kept)
# Holds the fitted (vectorizer, matrix) pair so the corpus is vectorized once
# instead of on every question.
_tfidf_index = {}


def _get_tfidf_index():
    """Return the TF-IDF vectorizer and section matrix, fitting them on first use."""
    if "fitted" not in _tfidf_index:
        vectorizer = TfidfVectorizer()
        section_matrix = vectorizer.fit_transform(splitted_df["section_text"])
        # Store both objects under one key so a reader never sees a half-built cache.
        _tfidf_index["fitted"] = (vectorizer, section_matrix)
    return _tfidf_index["fitted"]


def context_func(message):
    """Return the corpus section most similar to *message*, with symbols removed.

    The sections in ``splitted_df["section_text"]`` are compared with the
    question by cosine similarity of their TF-IDF vectors. The best-scoring
    section is passed through ``remove_symbols`` before being returned.

    Args:
        message: The user's question as a string.

    Returns:
        The cleaned text of the highest-scoring section. When several
        sections tie (for example, all scores are zero), the first of them
        is returned because ``argmax`` picks the first maximum.
    """
    vectorizer, section_matrix = _get_tfidf_index()
    question_vector = vectorizer.transform([message])

    # One row of similarities: the question against every section.
    similarities = cosine_similarity(question_vector, section_matrix)[0]
    best_position = similarities.argmax()

    # argmax yields a row position, so use positional access rather than a
    # label lookup that only works when the index is 0..n-1.
    best_section = splitted_df["section_text"].iloc[best_position]
    return remove_symbols(best_section)
_QA_MODEL_NAME = "nlp-group/sindi-bert-final"

# Holds the loaded (tokenizer, model) pair so the checkpoint is fetched and
# deserialized once rather than on every request.
_qa_components = {}


def _load_qa_components():
    """Return the question-answering tokenizer and model, loading them on first use."""
    if "pair" not in _qa_components:
        tokenizer = AutoTokenizer.from_pretrained(_QA_MODEL_NAME)
        model = AutoModelForQuestionAnswering.from_pretrained(_QA_MODEL_NAME)
        # Store both objects under one key so a reader never sees a half-built cache.
        _qa_components["pair"] = (tokenizer, model)
    return _qa_components["pair"]


def answer_question(question):
    """Answer *question* with an extractive QA model over a retrieved context.

    The context is chosen by ``context_func``; the question and context are
    tokenized together (truncated to 512 tokens) and the model's highest
    scoring start and end positions delimit the answer span.

    Args:
        question: The user's question as a string.

    Returns:
        A tuple ``(answer, context)``: the decoded answer span and the
        context it was extracted from. The answer is an empty string when
        the predicted end position precedes the start position.
    """
    context = context_func(question)
    tokenizer, model = _load_qa_components()

    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    # The end logit index is inclusive, so add one for slicing.
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    span_ids = inputs["input_ids"][0][answer_start:answer_end]
    # Drop markers such as [CLS]/[SEP] so they never leak into the shown answer.
    answer = tokenizer.decode(span_ids, skip_special_tokens=True)
    return answer, context
def main():
    """Build and launch the Women Cancer ChatBot Gradio interface.

    The interface sends the user's question to ``answer_question`` and shows
    both values it returns: the extracted answer and the context passage the
    answer was taken from. Model loading is left to ``answer_question``.

    Returns:
        Whatever ``iface.launch`` returns.

    Example:
        >>> main()  # doctest: +SKIP
    """
    iface = gr.Interface(
        fn=answer_question,
        inputs=["text"],
        # answer_question returns (answer, context); Gradio needs one output
        # component per returned value.
        outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
        title="Women Cancer ChatBot",
        description="How can I help you?",
        examples=[
            ["What is breast cancer?"],
            ["What are treatments for cervical cancer?"],
        ],
    )
    return iface.launch(debug=True, share=True)
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()