SaiedAlshahrani committed on
Commit 45110eb • 1 Parent(s): c607808

Upload 5 files

README.md CHANGED
@@ -1,11 +1,11 @@
 ---
 title: Egyptian Wikipedia Scanner
-emoji: 📊
-colorFrom: yellow
-colorTo: pink
+emoji: 🔍
+colorFrom: gray
+colorTo: red
 sdk: streamlit
-sdk_version: 1.32.2
-app_file: app.py
+sdk_version: 1.31.1
+app_file: scanner.py
 pinned: false
 license: mit
 ---
XGBoost_metadata+camelbert_embeddings.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9c5af4de2e308394a520a60a309755964f33554bf53b4b1942c5d3b5aa8e1b7
+size 21531
requirements.txt ADDED
@@ -0,0 +1,13 @@
+numpy
+torch
+typing
+pandas
+xgboost
+requests
+wikipedia
+streamlit
+torchvision
+scikit-learn
+transformers
+beautifulsoup4
+streamlit-searchbox
scanner.py ADDED
@@ -0,0 +1,91 @@
+import logging
+import warnings
+import wikipedia
+import streamlit as st
+from typing import List
+from scanner_utils import *
+from xgboost import XGBClassifier
+from streamlit_searchbox import st_searchbox
+from transformers import logging as hflogging
+
+
+logging.disable(logging.WARNING)
+hflogging.set_verbosity_warning()
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+warnings.simplefilter(action='ignore', category=DeprecationWarning)
+
+st.set_page_config(layout="centered", page_title="Egyptian Wikipedia Scanner", page_icon="🇪🇬")
+
+wikipedia.set_lang("arz")
+
+
+with open('.streamlit/style.css') as f:
+    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+
+
+st.markdown("""
+<h1 style='text-align: center;'>Egyptian Arabic Wikipedia Scanner</h1>
+<h5 style='text-align: center;'>Automatic Detection of Template-translated Articles in the Egyptian Wikipedia</h5>
+""", unsafe_allow_html=True)
+
+
+st.markdown("", unsafe_allow_html=True)
+
+
+def search_wikipedia(searchterm: str) -> List[str]:
+    return wikipedia.search(searchterm) if searchterm else []
+
+
+@st.cache_resource
+def load_xgb_model(model):
+    loaded_xgb_classifier = XGBClassifier()
+    loaded_xgb_classifier.load_model(model)
+    return loaded_xgb_classifier
+
+
+selected_title = st_searchbox(search_wikipedia, label="Search for an article in Egyptian Arabic Wikipedia:",
+                              placeholder="Search for an article", rerun_on_update=True, clear_on_submit=False, key="wiki_searchbox")
+
+if selected_title:
+    X, article, dataframe, selected_title = prepare_features(selected_title)
+
+    st.write(f':black_small_square: Collected Metadata of **{selected_title}**')
+
+    st.dataframe(dataframe, hide_index=True, use_container_width=True)
+
+    loaded_xgb_classifier = load_xgb_model("XGBoost_metadata+camelbert_embeddings.model")
+
+    id2label = {0: 'Human-generated Article', 1: 'Template-translated Article'}
+
+    result = id2label[int(loaded_xgb_classifier.predict(X))]
+
+    if result == 'Human-generated Article':
+        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
+        st.success(result, icon="✅")
+
+    else:
+        st.write(f":black_small_square: Automatic Classification of **{selected_title}**")
+        st.error(result, icon="🚨")
+
+    st.write(f":black_small_square: Full Summary of **{selected_title}**")
+
+    with st.expander(f'**{selected_title}**', expanded=True):
+        st.markdown('<style>p {text-align: justify;}</style>', unsafe_allow_html=True)
+        try:
+            article_text = wikipedia.summary(selected_title)
+
+        except wikipedia.exceptions.DisambiguationError as e:
+            article_text = wikipedia.summary(e.options[0])
+        st.write(article_text)
+        st.write(f'> :globe_with_meridians: Read Full Text of **{selected_title}**: <br>{article.url}', unsafe_allow_html=True)
+
+
+st.markdown('<br><br>', unsafe_allow_html=True)
+
+
+footer = """
+<div class="footer"> <p class="p1">Copyright © 2024 by *****************<br>Hosted with Hugging Face Spaces 🤗</p> </div>
+"""
+st.markdown(footer, unsafe_allow_html=True)
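For reference, the sketch below (illustration only, not one of the committed files) runs the same prediction pipeline outside the Streamlit UI. It assumes the committed model file and scanner_utils.py are in the working directory, the dependencies from requirements.txt are installed, and ARTICLE_TITLE is a placeholder for a real Egyptian Arabic Wikipedia title:

# Minimal sketch mirroring what scanner.py does for one search result.
import wikipedia
from xgboost import XGBClassifier
from scanner_utils import prepare_features

wikipedia.set_lang("arz")  # scanner.py sets the same language before querying articles

clf = XGBClassifier()
clf.load_model("XGBoost_metadata+camelbert_embeddings.model")

# prepare_features() returns the feature rows plus the resolved article objects.
X, article, dataframe, title = prepare_features("ARTICLE_TITLE")
label = int(clf.predict(X)[0])  # 0 = human-generated, 1 = template-translated
print(title, "->", "Template-translated Article" if label else "Human-generated Article")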
scanner_utils.py ADDED
@@ -0,0 +1,95 @@
+import re
+import requests
+import wikipedia
+import numpy as np
+import pandas as pd
+import streamlit as st
+from bs4 import BeautifulSoup
+from transformers import AutoModel
+from transformers import BertTokenizer
+
+
+def clean_page_text(text):
+    text = re.sub(r'[^\w\s]', ' ', text)  # Replaces non-alphanumeric characters with spaces.
+    text = re.sub(r'[^\u0600-\u06FF]', ' ', text)  # Replaces non-Arabic characters with spaces.
+    text = re.sub(r'\s+', ' ', text)  # Collapses extra spaces into a single space.
+    return text
+
+
+@st.cache_resource
+def encode_page_text(page_text):
+    tokenizer = BertTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
+    model = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
+
+    tokenized_page_text = tokenizer(page_text, return_tensors='pt', max_length=512, truncation=True)
+    encoded_page_text = model(**tokenized_page_text)[0][0][0].tolist()  # [CLS] embedding of the last hidden state.
+
+    return encoded_page_text
+
+
+@st.cache_data
+def get_page_info(title):
+    page_info = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"
+
+    creation_date = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['created_at']
+    creator_name = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['author']
+    total_edits = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['revisions']
+    total_editors = eval(str(BeautifulSoup(requests.get(page_info).content, "html.parser")).replace('null', 'None'))['editors']
+
+    return creation_date, creator_name, total_edits, total_editors
+
+
+@st.cache_data
+def get_page_prose(title):
+    page_prose = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"
+
+    total_bytes = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['bytes']
+    total_words = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['words']
+    total_chars = eval(str(BeautifulSoup(requests.get(page_prose).content, "html.parser")).replace('null', 'None'))['characters']
+
+    return total_bytes, total_words, total_chars
+
+
+@st.cache_data
+def prepare_features(selected_title):
+    dataframe = get_metadata_features(selected_title)
+
+    try:
+        article = wikipedia.page(selected_title)
+        full_article_text = clean_page_text(article.content)
+
+    except wikipedia.exceptions.DisambiguationError as e:
+        selected_title = e.options[0]
+        article = wikipedia.page(selected_title)
+        full_article_text = clean_page_text(article.content)
+
+    encode_full_article_text = encode_page_text(full_article_text)
+
+    X = []
+
+    for i in range(dataframe.shape[0]):
+        x = []
+        x.append(dataframe['Total Edits'][i])
+        x.append(dataframe['Total Editors'][i])
+        x.append(dataframe['Total Bytes'][i])
+        x.append(dataframe['Total Characters'][i])
+        x.append(dataframe['Total Words'][i])
+
+        # Both page metadata + page text embeddings.
+        X.append(np.hstack([x, list(encode_full_article_text)]))
+
+    return X, article, dataframe, selected_title
+
+
+@st.cache_data
+def get_metadata_features(selected_title):
+    creation_date, creator_name, total_edits, total_editors = get_page_info(selected_title)
+    total_bytes, total_words, total_chars = get_page_prose(selected_title)
+
+    data = {'Total Edits': [total_edits], 'Total Editors': [total_editors], 'Total Bytes': [total_bytes],
+            'Total Characters': [total_chars], 'Total Words': [total_words], 'Creator Name': [creator_name],
+            'Creation Date': [creation_date]}
+
+    dataframe = pd.DataFrame(data)
+
+    return dataframe
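The two XTools helpers above issue one HTTP request per field and parse each response by eval()-ing the page body. A more defensive variant (a sketch only, using the same endpoints and field names that appear above) would fetch each endpoint once and decode it as JSON:

# Sketch: same XTools endpoints, parsed with requests' built-in JSON decoder.
import requests

def get_page_info_json(title, timeout=30):
    url = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=timeout).json()  # single request, no eval()
    return data['created_at'], data['author'], data['revisions'], data['editors']

def get_page_prose_json(title, timeout=30):
    url = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"
    data = requests.get(url, timeout=timeout).json()
    return data['bytes'], data['words'], data['characters']

Besides avoiding eval() on remote content, this cuts the three or four requests per helper down to one.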