import streamlit as st
import numpy as np
import pandas as pd
import re
import time
import os
from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from Scraper import Scrap
# Fine-tuned IndoBERT hoax/valid classifier hosted on the Hugging Face Hub.
model_checkpoint = "Rifky/indobert-hoax-classification"
# Base IndoBERT checkpoint used (via SentenceTransformer) to embed article titles.
base_model_checkpoint = "indobenchmark/indobert-base-p1"
# Reference dataset of Indonesian hoax news; provides "embeddings", "url", "title"
# columns used for the related-articles similarity search below.
data_checkpoint = "Rifky/indonesian-hoax-news"
# Maps the classifier's output index to a human-readable verdict.
label = {0: "valid", 1: "fake"}
@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model():
    """Load and cache all heavy resources for the app.

    Returns:
        tuple: (model, base_model, tokenizer, data) where
            model      -- sequence-classification head (2 labels: valid/fake),
            base_model -- SentenceTransformer used for title embeddings,
            tokenizer  -- tokenizer matching `model_checkpoint`,
            data       -- reference hoax-news dataset (train split).
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    base_model = SentenceTransformer(base_model_checkpoint)
    # Fix: the keyword is `use_fast`, not `fast` — the original `fast=True` was
    # silently ignored, so the fast (Rust) tokenizer was never explicitly requested.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    # NOTE(review): `force_redownload` re-fetches the dataset on every cold start,
    # bypassing the local datasets cache — confirm this is intentional (e.g. to
    # always pick up a freshly updated hoax list).
    data = load_dataset(data_checkpoint, split="train", download_mode='force_redownload')
    return model, base_model, tokenizer, data
def sigmoid(x):
    """Logistic function: squash a raw logit (scalar or ndarray) into (0, 1)."""
    return np.exp(x) / (np.exp(x) + 1.0)
# Load all models/data once (cached by @st.cache) while showing a spinner.
with st.spinner("Loading Model..."):
    model, base_model, tokenizer, data = load_model()
# Page header.
st.markdown("""<h1 style="text-align:center;">Fake News Detection AI</h1>""", unsafe_allow_html=True)
# URL of the article the user wants checked.
user_input = st.text_input("Article URL")
# Inject CSS so the submit button stretches across and centers in its column.
m = st.markdown("""
<style>
div.stButton > button:first-child {
    margin: auto;
    display: block;
    width: 100%;
}
</style>""", unsafe_allow_html=True)
submit = st.button("submit")
def _result_banner(message: str, positive: bool) -> None:
    """Render a rounded, centered banner: green when `positive`, red otherwise.

    `message` may contain inline HTML (e.g. <b> tags) — rendered unsafely on purpose.
    """
    bg, fg = (
        ("rgb(236, 253, 245)", "rgb(6, 95, 70)") if positive
        else ("rgb(254, 242, 242)", "rgb(153, 27, 27)")
    )
    st.markdown(f"""<p style="background-color: {bg};
    color: {fg};
    font-size: 20px;
    border-radius: 7px;
    padding-left: 12px;
    padding-top: 15px;
    padding-bottom: 15px;
    line-height: 25px;
    text-align: center;">{message}</p>""", unsafe_allow_html=True)

if submit:
    last_time = time.time()

    with st.spinner("Reading Article..."):
        scrap = Scrap(user_input)

    if scrap:
        title, text = scrap.title, scrap.text
        text = re.sub(r'\n', ' ', text)

        with st.spinner("Computing..."):
            token = text.split()
            text_len = len(token)

            # Split the article into 512-word chunks (BERT's max length).
            sequences = [
                " ".join(token[i * 512:(i + 1) * 512])
                for i in range(text_len // 512)
            ]
            # Fix: only append the trailing partial chunk when there is a
            # non-empty remainder. The original appended unconditionally, so an
            # article whose length was an exact multiple of 512 contributed an
            # empty string whose prediction skewed the averaged result.
            remainder = text_len % 512
            if remainder or not sequences:
                sequences.append(" ".join(token[text_len - remainder:]))

            encoded = tokenizer(sequences, max_length=512, truncation=True, padding="max_length", return_tensors='pt')
            predictions = model(**encoded)[0].detach().numpy()
            # Average per-chunk sigmoid scores for each class over all chunks.
            result = [
                np.sum([sigmoid(i[0]) for i in predictions]) / len(predictions),
                np.sum([sigmoid(i[1]) for i in predictions]) / len(predictions)
            ]

            print(f'\nresult: {result}')
            # Rank the reference dataset by title similarity (most similar first).
            title_embeddings = base_model.encode(title)
            similarity_score = cosine_similarity(
                [title_embeddings],
                data["embeddings"]
            ).flatten()
            # Fix: renamed from `sorted`, which shadowed the builtin.
            ranked = np.argsort(similarity_score)[::-1].tolist()

            # Fix: cast the numpy scalar to a plain int for dict indexing.
            prediction = int(np.argmax(result, axis=-1))

        _result_banner(f"This article is <b>{label[prediction]}</b>.", positive=(prediction == 0))

        with st.expander("Related Articles"):
            # Show the five most similar known articles as source + linked title.
            for i in ranked[:5]:
                st.markdown(f"""
                <small style="text-align: left;">{data["url"][i].split("/")[2]}</small><br>
                <a href={data["url"][i]} style="text-align: left;">{data["title"][i]}</a>
                """, unsafe_allow_html=True)
    else:
        _result_banner("Can't scrap article from this link.", positive=False)