Rifky committed on
Commit
1a7dca5
1 Parent(s): 4d5820d

Reference Searcher

Browse files
Files changed (1) hide show
  1. app.py +26 -8
app.py CHANGED
@@ -6,19 +6,24 @@ import time
6
  import os
7
 
8
  from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
 
 
 
9
  from Scraper import Scrap
10
 
11
  st.set_page_config(layout="wide")
12
 
13
  model_checkpoint = "Rifky/FND"
 
14
  data_checkpoint = "Rifky/turnbackhoax-encoded"
15
  label = {0: "valid", 1: "fake"}
16
 
17
  @st.cache(show_spinner=False, allow_output_mutation=True)
18
  def load_model():
19
  model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
 
20
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, fast=True)
21
- return model, tokenizer
22
 
23
  def sigmoid(x):
24
  return 1 / (1 + np.exp(-x))
@@ -27,7 +32,8 @@ input_column, reference_column = st.columns(2)
27
  input_column.write('# Fake News Detection AI')
28
 
29
  with st.spinner("Loading Model..."):
30
- model, tokenizer = load_model()
 
31
 
32
  user_input = input_column.text_input("Article url")
33
  submit = input_column.button("submit")
@@ -36,11 +42,7 @@ submit = input_column.button("submit")
36
  if submit:
37
  last_time = time.time()
38
  with st.spinner("Reading Article..."):
39
- if user_input:
40
- if user_input[:4] == 'http':
41
- text = Scrap(user_input)
42
- else:
43
- text = user_input
44
 
45
  if text:
46
  text = re.sub(r'\n', ' ', text)
@@ -66,4 +68,20 @@ if submit:
66
  prediction = np.argmax(result, axis=-1)
67
  input_column.success(f"This news is {label[prediction]}.")
68
  st.text(f"{int(result[prediction]*100)}% confidence")
69
- input_column.progress(result[prediction])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import os
7
 
8
  from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from datasets import load_dataset
11
+ from sentence_transformers import SentenceTransformer
12
  from Scraper import Scrap
13
 
14
  st.set_page_config(layout="wide")
15
 
16
  model_checkpoint = "Rifky/FND"
17
+ base_model_checkpoint = "indobenchmark/indobert-base-p1"
18
  data_checkpoint = "Rifky/turnbackhoax-encoded"
19
  label = {0: "valid", 1: "fake"}
20
 
21
  @st.cache(show_spinner=False, allow_output_mutation=True)
22
  def load_model():
23
  model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
24
+ base_model = SentenceTransformer(base_model_checkpoint)
25
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, fast=True)
26
+ return model, base_model, tokenizer
27
 
28
  def sigmoid(x):
29
  return 1 / (1 + np.exp(-x))
 
32
  input_column.write('# Fake News Detection AI')
33
 
34
  with st.spinner("Loading Model..."):
35
+ model, base_model, tokenizer = load_model()
36
+ data = load_dataset(data_checkpoint, split="train")
37
 
38
  user_input = input_column.text_input("Article url")
39
  submit = input_column.button("submit")
 
42
  if submit:
43
  last_time = time.time()
44
  with st.spinner("Reading Article..."):
45
+ title, text = Scrap(user_input)
 
 
 
 
46
 
47
  if text:
48
  text = re.sub(r'\n', ' ', text)
 
68
  prediction = np.argmax(result, axis=-1)
69
  input_column.success(f"This news is {label[prediction]}.")
70
  st.text(f"{int(result[prediction]*100)}% confidence")
71
+ input_column.progress(result[prediction])
72
+
73
+ with st.spinner("Searching for references"):
74
+ title_embeddings = base_model.encode(title)
75
+ similarity_score = cosine_similarity(
76
+ [title_embeddings],
77
+ data["embeddings"]
78
+ ).flatten()
79
+ sorted = np.argsort(similarity_score)[::-1].tolist()
80
+
81
+ for i in sorted:
82
+ reference_column.write(f"""
83
+ <a href={data["url"][i]}><small>turnbackhoax.id</small></a>
84
+ <h2>{data["title"][i]}</h2>
85
+ """, unsafe_allow_html=True)
86
+ with reference_column.beta_expander("read content"):
87
+ st.write(data["text"][i])