# python-test / app.py
# AVAIYA's picture
# Create app.py
# e228e17
#pip install GoogleNews
#pip install --upgrade GoogleNews
import streamlit as st
from GoogleNews import GoogleNews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import sklearn
import time
# Single Arabic-language GoogleNews client used by the rest of the script.
# (The previous extra `GoogleNews()` instance was overwritten immediately
# and never used.)
googlenews = GoogleNews(lang='ar')
# Start every script run with no previously gathered results.
googlenews.clear()
# Page banner: project title and credits, rendered at the top of the app.
st.write("""
Arabic Fake News Detection System
A system designed as a part of master project
done by Reem AlFouzan
Supervised by : Dr, Abdulla al mutairi
""")
#df = pd.read_csv('News.csv')
# Read the claim to verify, search Google News for it, and build `df`
# (columns "text" and "link") holding the candidate headlines to compare
# against. Nothing below runs until the user has typed something.
text_input = st.text_input(''' **Enter the text** ''')
if text_input:
    # One-cell DataFrame so the query goes through the same pandas string
    # pipeline as the fetched headlines.
    inputt = pd.DataFrame([text_input])
    query = inputt.iloc[0, 0]

    # Both calls add their hits to the client's accumulated texts/links.
    googlenews.search(query)
    googlenews.get_news(query)

    # Page 1 is fetched again only to decide whether there are any results.
    result_0 = googlenews.page_at(1)
    # Debug output on the server console.
    print("Data")
    print(result_0, "data 2")

    if len(result_0) == 0:
        # Placeholder row so the similarity step still has one document.
        desc_1 = ['لا يوجد نتائج للخبر ']
        link_1 = ['لا يوجد مصدر']
    else:
        # Copy, so extending below does not mutate the client's own lists.
        desc_1 = list(googlenews.get_texts())
        link_1 = list(googlenews.get_links())
        for page in range(2, 70):
            # page_at() returns that page's articles without adding them to
            # the client's accumulated texts/links, so read the headline and
            # URL from the returned items. (Previously the return value was
            # ignored and the same accumulated lists were appended again on
            # every iteration, duplicating the corpus 68 times.)
            # NOTE(review): item keys 'title'/'link' follow the GoogleNews
            # README result format - confirm against the installed version.
            page_results = googlenews.page_at(page)
            if not page_results:
                # No more results (or requests are being refused): stop
                # instead of issuing the remaining requests for nothing.
                break
            desc_1.extend(item['title'] for item in page_results)
            link_1.extend(item['link'] for item in page_results)

    column_names = ["text", 'link']
    df = pd.DataFrame({'text': desc_1, 'link': link_1}, columns=column_names)

    # Strip a few literal marker characters from both the corpus and the
    # query. regex=False makes the literal match explicit: '.', '[' and ']'
    # are regex metacharacters.
    for letter in '#.][!XR':
        df['text'] = df['text'].astype(str).str.replace(letter, '', regex=False)
        inputt[0] = inputt[0].astype(str).str.replace(letter, '', regex=False)
# Characters stripped from every text before vectorization: Arabic-specific
# punctuation plus the ASCII punctuation from the standard library.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Deletion table built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return *text* with every character of ``punctuations_list`` removed.

    Characters are deleted, not replaced by spaces, so words separated only
    by punctuation are joined together.
    """
    return text.translate(_PUNCTUATION_TABLE)
# Single-pass character map: every source letter is one code point, so one
# translate() call replaces the four sequential regex substitutions.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    **dict.fromkeys("إأآا", "ا"),
    "ى": "ي",
    "ة": "ه",
    "گ": "ك",
})


def normalize_arabic(text):
    """Return *text* with common Arabic letter variants folded to one form.

    Alef variants (إ أ آ) become bare alef (ا), alef maqsura (ى) becomes
    ya (ي), ta marbuta (ة) becomes ha (ه), and gaf (گ) becomes kaf (ك).
    All other characters are left unchanged.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)
# Any character (except a newline) immediately followed by one or more
# copies of itself.
_REPEATED_CHAR_RUN = re.compile(r'(.)\1+')


def remove_repeating_char(text):
    """Collapse every run of an identical character in *text* to one copy.

    This applies to all characters the pattern's ``.`` matches, so digits and
    legitimately doubled letters are collapsed too (``"2000"`` -> ``"20"``).
    Newlines are not matched by ``.`` and are therefore left untouched.
    """
    return _REPEATED_CHAR_RUN.sub(r'\1', text)
def processPost(text):
    """Clean one headline or query string before TF-IDF vectorization.

    Steps, in order: replace @mentions with a space, replace URLs with a
    space, unwrap #hashtags (keep the word, drop the '#'), strip punctuation,
    normalize Arabic letter variants, and collapse repeated characters.

    Returns the cleaned string.
    """
    # Regex patterns are raw strings so that `\s` and `\.` reach the regex
    # engine as written instead of being invalid string escape sequences.
    # Replace @username with a single space.
    text = re.sub(r'@[^\s]+', ' ', text)
    # Replace www.* or http(s)://* URLs with a single space.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    # Replace #word with word.
    text = re.sub(r'#([^\s]+)', r'\1', text)
    # Remove punctuation.
    text = remove_punctuations(text)
    # Normalize Arabic letter variants.
    text = normalize_arabic(text)
    # Collapse repeated letters.
    text = remove_repeating_char(text)
    return text
# Score the query against the fetched headlines and report the verdict.
# Guarded on the same condition as the search step: `df` and `inputt` only
# exist once the user has entered some text.
if text_input:
    # Apply the same cleaning to the corpus and to the query.
    df['text'] = df['text'].apply(processPost)
    inputt[0] = inputt[0].apply(processPost)
    st.markdown(f"my input is : { inputt.iloc[0,0] }")

    # Fit TF-IDF on the headlines, then project the query into that space.
    # The sparse matrices are passed to cosine_similarity directly; there is
    # no need to densify the whole corpus first.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df['text'])
    traninput = vectorizer.transform(inputt[0])
    cosine_sim = cosine_similarity(traninput, vectors)

    # Only one query row, so row 0 holds its similarity to every headline.
    best_match = int(np.argmax(cosine_sim[0]))
    top = cosine_sim[0, best_match]

    # Verdict thresholds on the best similarity: >= 0.85, >= 0.6, otherwise.
    # The final `else` guarantees `prediction` is always bound.
    if top >= .85:
        prediction = 'الخبر صحيح'
    elif top >= .6:
        prediction = 'الخبر مظلل '
    else:
        prediction = 'الخبر كاذب '

    st.markdown(f"most similar news is: { df['text'].iloc[best_match] }")
    st.markdown(f"Source url : {df['link'].iloc[best_match]}")
    st.markdown(f"Credibility rate : { top}")
    st.markdown(f"system prediction: { prediction}")

    # Persist the cleaned corpus of this run as a tab-separated file.
    df.to_csv('Students.csv', sep ='\t')
# Sidebar: a heading line followed by markdown links to three news outlets,
# rendered in this order.
for sidebar_line in (
    'مواقع اخباريه معتمده ',
    "[العربية](https://www.alarabiya.net/)",
    "[الجزيرة نت](https://www.aljazeera.net/news/)",
    "[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)",
):
    st.sidebar.markdown(sidebar_line)
#st.markdown('test')