import pandas as pd
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
class TokenSimilarity:
    """Sentence similarity via a mean-pooled BERT encoder."""

    def load_pretrained(self, from_pretrained: str = 'indobenchmark/indobert-base-p1'):
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __cleaning(self, text: str):
        # Strip punctuation, then collapse runs of whitespace to single spaces.
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def __process(self, first_token: str, second_token: str):
        # Encode both texts in one batch.
        inputs = self.tokenizer([first_token, second_token],
                                max_length=self.max_length,
                                truncation=self.truncation,
                                padding=self.padding,
                                return_tensors='pt')
        attention = inputs.attention_mask
        outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state
        # Mean pooling: zero out padding positions with the attention mask,
        # sum over the sequence dimension, and divide by the real token count.
        # clamp() keeps the denominator from ever being zero.
        mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
        masked_embeddings = embeddings * mask
        summed = masked_embeddings.sum(1)
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts
        return mean_pooled.detach().numpy()

    def predict(self, first_token: str, second_token: str,
                return_as_embeddings: bool = False, max_length: int = 16,
                truncation: bool = True, padding: str = 'max_length'):
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding
        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)
        mean_pooled_arr = self.__process(first_token, second_token)
        if return_as_embeddings:
            return mean_pooled_arr
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])
        return similarity
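
# Minimal standalone sketch (kept as a comment so it does not run at import
# time): how TokenSimilarity can be used outside the Streamlit app below.
# The checkpoint matches the one loaded below; the two sentences are
# hypothetical example inputs.
#
#   checker = TokenSimilarity()
#   checker.load_pretrained('indobenchmark/indobert-base-p2')
#   score = checker.predict('vaksin menyebabkan autisme', 'vaksin sudah teruji aman')
#   print(score)  # (1, 1) array; values nearer 1.0 mean higher similarity
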
st.title('CEK HOAKS')  # "HOAX CHECK"

model = TokenSimilarity()
model.load_pretrained('indobenchmark/indobert-base-p2')

# Reference corpus of known hoax texts; expects a 'text' column.
df = pd.read_csv('hoax.csv', sep=';')
to_check = st.text_area('Teks yang mau dicek...')  # "Text to check..."
if to_check:
    # Score the input against every known hoax and display each result.
    for hoax_text in df['text']:
        result = model.predict(to_check, hoax_text)
        st.write(result)