import streamlit as st
import numpy as np
import pandas as pd
import re
import time
import os

from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from Scraper import Scrap

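# Hugging Face checkpoints: a fine-tuned IndoBERT hoax classifier, the base
# IndoBERT encoder used for title embeddings, and a reference hoax-news dataset
# that ships with precomputed title embeddings, titles, and source URLs.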
model_checkpoint = "Rifky/indobert-hoax-classification"
base_model_checkpoint = "indobenchmark/indobert-base-p1"
data_checkpoint = "Rifky/indonesian-hoax-news"
label = {0: "valid", 1: "fake"}

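# Load the classifier, sentence encoder, tokenizer, and reference dataset once;
# st.cache keeps them in memory across Streamlit reruns.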
@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model():
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    base_model = SentenceTransformer(base_model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    data = load_dataset(data_checkpoint, split="train", download_mode='force_redownload')
    return model, base_model, tokenizer, data

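# Squash a raw logit into a probability-like score in (0, 1).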
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

with st.spinner("Loading Model..."):
    model, base_model, tokenizer, data = load_model()

st.markdown("""<h1 style="text-align:center;">Fake News Detection AI</h1>""", unsafe_allow_html=True)
user_input = st.text_input("Article URL")

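# Inline CSS: center the submit button and stretch it to the full column width.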
st.markdown("""
<style>
div.stButton > button:first-child {
    margin: auto;
    display: block;
    width: 100%;
}
</style>""", unsafe_allow_html=True)

submit = st.button("Submit")

if submit:
    last_time = time.time()
    with st.spinner("Reading Article..."):
        scrap = Scrap(user_input)

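    # Scrap(...) yields the parsed article; a falsy result means scraping failed.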
    if scrap:
        title, text = scrap.title, scrap.text
        text = re.sub(r'\n', ' ', text)

        with st.spinner("Computing..."):
            # Split the article into 512-word chunks so each chunk fits the
            # model's maximum sequence length.
            token = text.split()
            text_len = len(token)

            sequences = []
            for i in range(text_len // 512):
                sequences.append(" ".join(token[i * 512: (i + 1) * 512]))
            if text_len % 512:  # skip the tail chunk when it would be empty
                sequences.append(" ".join(token[text_len - (text_len % 512):]))
            sequences = tokenizer(sequences, max_length=512, truncation=True, padding="max_length", return_tensors='pt')

            # Score every chunk, then average the per-class sigmoid scores so
            # a long article gets a single verdict.
            predictions = model(**sequences)[0].detach().numpy()
            result = [
                np.mean([sigmoid(i[0]) for i in predictions]),
                np.mean([sigmoid(i[1]) for i in predictions]),
            ]
            
            # Rank reference articles by cosine similarity between the scraped
            # title's embedding and the dataset's precomputed title embeddings.
            title_embeddings = base_model.encode(title)
            similarity_score = cosine_similarity(
                [title_embeddings],
                data["embeddings"]
            ).flatten()
            sorted_indices = np.argsort(similarity_score)[::-1].tolist()

            prediction = np.argmax(result, axis=-1)
            # Green styling for a "valid" verdict, red for "fake"; the markup
            # is identical otherwise.
            if prediction == 0:
                background, color = "rgb(236, 253, 245)", "rgb(6, 95, 70)"
            else:
                background, color = "rgb(254, 242, 242)", "rgb(153, 27, 27)"
            st.markdown(f"""<p style="background-color: {background};
            color: {color};
            font-size: 20px;
            border-radius: 7px;
            padding-left: 12px;
            padding-top: 15px;
            padding-bottom: 15px;
            line-height: 25px;
            text-align: center;">This article is <b>{label[prediction]}</b>.</p>""", unsafe_allow_html=True)
                
            
            with st.expander("Related Articles"):
                # Show the five most similar reference articles with their
                # source domain and a link to the original story.
                for i in sorted_indices[:5]:
                    st.markdown(f"""
                    <small style="text-align: left;">{data["url"][i].split("/")[2]}</small><br>
                    <a href="{data["url"][i]}" style="text-align: left;">{data["title"][i]}</a>
                    """, unsafe_allow_html=True)
    else:
        st.markdown(f"""<p style="background-color: rgb(254, 242, 242); 
        color: rgb(153, 27, 27);
        font-size: 20px;
        border-radius: 7px;
        padding-left: 12px;
        padding-top: 15px;
        padding-bottom: 15px;
        line-height: 25px;
        text-align: center;">Can't scrape the article from this link.</p>""", unsafe_allow_html=True)