|
|
|
|
|
|
|
import streamlit as st |
|
from GoogleNews import GoogleNews |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import pandas as pd |
|
import numpy as np |
|
import string |
|
import re |
|
from nltk.corpus import stopwords |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import sklearn |
|
import time |
|
|
|
|
|
# News client configured with lang='ar'. The original constructed a
# default client first and overwrote it on the next line; a single
# construction is sufficient.
googlenews = GoogleNews(lang='ar')

# Start from an empty stored-result list before the first search.
googlenews.clear()
|
|
|
|
|
|
|
# Title and credits text written to the page before the input box.
APP_BANNER = """

Arabic Fake News Detection System

A system designed as a part of master project

done by Reem AlFouzan

Supervised by : Dr, Abdulla al mutairi

"""

st.write(APP_BANNER)
|
|
|
# Claim typed by the user; an empty string means nothing was submitted yet.
text_input = st.text_input(''' **Enter the text** ''')

if text_input:
    # Keep the query in a one-cell frame so it can go through the same
    # pandas string cleaning as the fetched headlines.
    inputt = pd.DataFrame([text_input])
    query = inputt.iloc[0, 0]

    # Both calls add their hits to the client's stored result list.
    googlenews.search(query)
    googlenews.get_news(query)

    # Page 1 is fetched again only to find out whether the search matched
    # anything at all.
    result_0 = googlenews.page_at(1)
    print("Data")
    print(result_0, "data 2")

    if not result_0:
        # Placeholder row so the similarity step still has one document.
        desc_1 = ['لا يوجد نتائج للخبر ']
        link_1 = ['لا يوجد مصدر']
    else:
        # Copy, so appending below never mutates the client's own lists.
        desc_1 = list(googlenews.get_texts())
        link_1 = list(googlenews.get_links())

        # The original loop discarded every page_at() result and re-appended
        # the same stored lists on each iteration, duplicating the first
        # results 68 times. Per the GoogleNews README, page_at() returns one
        # page as a list of dicts without storing it, so read the headline
        # and link from the returned items instead.
        for page in range(2, 70):
            page_results = googlenews.page_at(page)
            if not page_results:
                # An empty page means the results are exhausted (or the
                # request failed); further requests would add nothing.
                break
            for item in page_results:
                desc_1.append(item.get('title', ''))
                link_1.append(item.get('link', ''))

    df = pd.DataFrame({'text': desc_1, 'link': link_1})

    # Strip the literal characters # . ] [ ! X R in one pass. An explicit
    # character class avoids depending on the version-specific default of
    # the ``regex`` argument of ``Series.str.replace``.
    stray_chars = r'[#.\]\[!XR]'
    df['text'] = df['text'].astype(str).str.replace(stray_chars, '', regex=True)
    inputt[0] = inputt[0].astype(str).str.replace(stray_chars, '', regex=True)
|
|
|
# Characters treated as punctuation: a hand-picked Arabic/typographic set
# plus ASCII punctuation from the standard library.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Deletion table built once per script run instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text: str) -> str:
    """Return ``text`` with every character in ``punctuations_list`` deleted.

    Characters are removed, not replaced by spaces, so words separated only
    by punctuation are joined together.
    """
    return text.translate(_PUNCTUATION_TABLE)
|
|
|
# One-pass substitution table: each key is rewritten to its value. None of
# the replacement letters is itself rewritten by another rule, so a single
# pass gives the same result as applying the rules one after another.
_ARABIC_FOLD_TABLE = str.maketrans({
    "إ": "ا",
    "أ": "ا",
    "آ": "ا",
    "ى": "ي",
    "ة": "ه",
    "گ": "ك",
})


def normalize_arabic(text: str) -> str:
    """Return ``text`` with common Arabic letter variants folded together.

    The alef forms with hamza or madda become bare alef, alef maqsura
    becomes yeh, teh marbuta becomes heh, and gaf becomes kaf. All other
    characters are left unchanged.
    """
    return text.translate(_ARABIC_FOLD_TABLE)
|
|
|
|
|
# A run of the same character, excluding digits and newlines. Digits are
# excluded because collapsing them changes the number ("2000" -> "20");
# newlines were already excluded by the original ``.`` pattern.
_REPEATED_CHAR_RE = re.compile(r'([^\d\n])\1+')


def remove_repeating_char(text: str) -> str:
    """Collapse each run of a repeated character to a single occurrence.

    Intended to undo letter elongation. Runs of digits are preserved so
    numeric values survive cleaning. Genuinely doubled letters and repeated
    spaces are still collapsed, exactly as before.
    """
    return _REPEATED_CHAR_RE.sub(r'\1', text)
|
|
|
# Patterns compiled once. Raw strings avoid the invalid ``\s`` escape that
# the original non-raw literals contained.
_MENTION_RE = re.compile(r'@[^\s]+')
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_HASHTAG_RE = re.compile(r'#([^\s]+)')


def processPost(text):
    """Clean one piece of text for TF-IDF comparison.

    Steps, in order:
      1. replace ``@mention`` tokens with a space;
      2. replace ``www.`` / ``http(s)://`` URLs with a space;
      3. keep hashtag words but drop the leading ``#``;
      4. delete punctuation via ``remove_punctuations``;
      5. fold letter variants via ``normalize_arabic``;
      6. collapse repeated characters via ``remove_repeating_char``.

    Returns the cleaned string.
    """
    text = _MENTION_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _HASHTAG_RE.sub(r'\1', text)

    # Mentions and URLs are removed first: they are recognised by their
    # punctuation, which the next step would delete.
    text = remove_punctuations(text)
    text = normalize_arabic(text)
    text = remove_repeating_char(text)

    return text
|
|
|
|
|
# ``df`` and ``inputt`` exist only after the user has submitted text, so the
# whole scoring step is guarded by the same condition.
if text_input:
    # Clean headlines and query identically so they share one vocabulary.
    df['text'] = df['text'].apply(processPost)
    inputt[0] = inputt[0].apply(processPost)

    st.markdown(f"my input is : { inputt.iloc[0,0] }")

    # Fit TF-IDF on the fetched headlines, then project the query into the
    # same feature space.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df['text'])
    traninput = vectorizer.transform(inputt[0])

    # cosine_similarity accepts sparse matrices directly, so the dense
    # DataFrame copy of the TF-IDF matrix is not needed.
    cosine_sim = cosine_similarity(traninput, vectors)

    # Single query row: locate the best-matching headline once and reuse it.
    best_idx = int(np.argmax(cosine_sim[0]))
    top = cosine_sim[0, best_idx]

    # Thresholds on the best similarity score. The final ``else`` guarantees
    # ``prediction`` is always bound.
    if top >= .85:
        prediction = 'الخبر صحيح'
    elif top >= .6:
        prediction = 'الخبر مظلل '
    else:
        prediction = 'الخبر كاذب '

    st.markdown(f"most similar news is: { df['text'].iloc[best_idx] }")
    st.markdown(f"Source url : {df['link'].iloc[best_idx]}")
    st.markdown(f"Credibility rate : { top}")
    st.markdown(f"system prediction: { prediction}")

    # Dump the cleaned candidate headlines (tab-separated) for inspection.
    df.to_csv('Students.csv', sep='\t')
|
|
|
|
|
# Sidebar: a heading followed by one markdown link per news outlet.
st.sidebar.markdown('مواقع اخباريه معتمده ')

for outlet_link in (
    "[العربية](https://www.alarabiya.net/)",
    "[الجزيرة نت](https://www.aljazeera.net/news/)",
    "[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)",
):
    st.sidebar.markdown(outlet_link)
|
|
|
|