# Hugging Face Spaces app. (The original scrape carried the Space status
# header "Spaces: / Sleeping / Sleeping", preserved here as a comment so
# the file remains valid Python.)
# Standard third-party imports, grouped and sorted (PEP 8); the stray " | |"
# extraction artifacts on every line made the original file unparseable.
import gradio as gr
import requests
import shap
import torch
from bs4 import BeautifulSoup
from IPython.core.display import HTML
from sklearn.metrics.pairwise import cosine_similarity
from torch.nn.functional import softmax
from transformers import (
    RobertaForSequenceClassification,
    RobertaModel,
    RobertaTokenizer,
    pipeline,
)
# --- Model setup ---
# Fine-tuned RoBERTa classifier used to label text as human- or ChatGPT-written.
# NOTE(review): 'temp' is a local directory; presumably the fine-tuned
# checkpoint is unpacked there at deploy time -- confirm against the Space.
model_dir = 'temp'
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Base RoBERTa, used only to produce embeddings for the plagiarism
# similarity check (not for classification).
tokenizer1 = RobertaTokenizer.from_pretrained('roberta-base')
model1 = RobertaModel.from_pretrained('roberta-base')

# Minimum cosine similarity for a fetched article to count as "similar".
threshold = 0.5
def process_text(input_text):
    """Classify text as human- or ChatGPT-written and list similar web articles.

    Args:
        input_text: The text to analyze (a string from the Gradio textbox).

    Returns:
        Tuple of (original text, probability string as a percentage,
        'Human'/'Chat-GPT' label, list of [link, similarity] pairs).
    """
    # Guard clause: the original bound prob/final_label only inside the
    # truthy branch but returned them unconditionally, so an empty input
    # raised NameError. Return neutral outputs instead.
    if not input_text:
        return '', '', '', []
    text = input_text

    # Run the fine-tuned classifier and convert logits to a confidence.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=1)
    max_prob, predicted_class_id = torch.max(probs, dim=1)
    prob = str(round(max_prob.item() * 100, 2))
    # LABEL_0 maps to human-written; anything else is treated as ChatGPT.
    final_label = 'Human' if model.config.id2label[predicted_class_id.item()] == 'LABEL_0' else 'Chat-GPT'
    processed_result = text

    def search(query):
        """Query Google Custom Search; return parsed JSON or {'error': ...}."""
        # SECURITY(review): hard-coded API key and engine id are exposed in
        # source control -- move them to environment variables/Space secrets.
        api_key = 'AIzaSyClvkiiJTZrCJ8BLqUY9I38WYmbve8g-c8'
        search_engine_id = '53d064810efa44ce7'
        # Let requests URL-encode the parameters: the original f-string URL
        # broke on spaces and special characters in the query text.
        params = {'key': api_key, 'cx': search_engine_id, 'q': query, 'num': 5}
        try:
            response = requests.get(
                'https://www.googleapis.com/customsearch/v1',
                params=params, timeout=10,
            )
            return response.json()
        except Exception as e:
            return {'error': str(e)}

    def get_article_text(url):
        """Fetch a page and return its concatenated <p> text, '' on any failure."""
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Paragraph tags cover most article bodies; adjust per site.
                return ' '.join(p.get_text() for p in soup.find_all('p'))
        except Exception as e:
            print(f"An error occurred: {e}")
        return ''

    def embed(passage):
        """Mean-pooled base-RoBERTa embedding of a (512-token-truncated) passage."""
        encoding = tokenizer1(passage, max_length=512, truncation=True,
                              padding=True, return_tensors="pt")
        with torch.no_grad():
            return model1(**encoding).last_hidden_state.mean(dim=1)

    def find_plagiarism(text):
        """Return up to 5 [link, similarity] pairs above the global threshold."""
        # Only search for reasonably long inputs; short snippets make for
        # noisy, meaningless web queries.
        if len(text) <= 300:
            return []
        search_results = search(text)
        if 'items' not in search_results:
            return []
        # Hoisted: the input embedding is invariant across candidate articles
        # (the original recomputed it once per search result).
        text_embedding = embed(text)
        similar_articles = []
        for item in search_results['items']:
            link = item.get('link', '')
            article_text = get_article_text(link)
            if not article_text:
                continue
            similarity = cosine_similarity(text_embedding, embed(article_text))[0][0]
            if similarity > threshold:
                similar_articles.append([link, float(similarity)])
        similar_articles.sort(key=lambda x: x[1], reverse=True)
        return similar_articles[:5]

    similar_articles = find_plagiarism(text)
    return processed_result, prob, final_label, similar_articles
# --- Gradio UI wiring ---
# One text input; four outputs matching process_text's return tuple.
text_input = gr.Textbox(label="Enter text")
outputs = [
    gr.Textbox(label="Processed text"),
    gr.Textbox(label="Probability"),
    gr.Textbox(label="Label"),
    gr.Dataframe(label="Similar Articles", headers=["Link", "Similarity"], row_count=5),
]
title = "Group 2- ChatGPT text detection module"
description = '''Please upload text files and text input responsibly and await the explainable results. The approach in place includes finetuning a Roberta model for text classification.Once the classifications are done the most similar articles are presented along with the alleged similarity'''
gr.Interface(fn=process_text, title=title, description=description,
             inputs=[text_input], outputs=outputs).launch()