import re

import numpy as np
import pandas as pd
import requests
import streamlit as st
import wikipedia
from transformers import AutoModel, BertTokenizer


def clean_page_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)           # Replace punctuation and symbols with spaces.
    text = re.sub(r'[^\u0600-\u06FF]', ' ', text)  # Replace non-Arabic characters with spaces.
    text = re.sub(r'\s+', ' ', text)               # Collapse runs of whitespace into a single space.
    return text


@st.cache_resource
def encode_page_text(page_text):
    # Embed the article text with CAMeLBERT and keep the [CLS] token vector of the
    # last hidden state as a fixed-size representation of the whole page.
    tokenizer = BertTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
    model = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-mix-pos-egy')
    tokenized_page_text = tokenizer(page_text, return_tensors='pt', max_length=512, truncation=True)
    encoded_page_text = model(**tokenized_page_text)[0][0][0].tolist()
    return encoded_page_text


@st.cache_data
def get_page_info(title):
    # XTools article-info endpoint for the Egyptian Arabic Wikipedia.
    page_info = f"https://xtools.wmcloud.org/api/page/articleinfo/arz.wikipedia.org/{title}?format=json"
    info = requests.get(page_info).json()
    creation_date = info['created_at']
    creator_name = info['author']
    total_edits = info['revisions']
    total_editors = info['editors']
    return creation_date, creator_name, total_edits, total_editors


@st.cache_data
def get_page_prose(title):
    # XTools prose endpoint: size statistics of the readable text of the page.
    page_prose = f"https://xtools.wmcloud.org/api/page/prose/arz.wikipedia.org/{title}?format=json"
    prose = requests.get(page_prose).json()
    total_bytes = prose['bytes']
    total_words = prose['words']
    total_chars = prose['characters']
    return total_bytes, total_words, total_chars


@st.cache_data
def prepare_features(selected_title):
    dataframe = get_metadata_features(selected_title)
    try:
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)
    except wikipedia.exceptions.DisambiguationError as e:
        # Fall back to the first suggested title when the requested one is ambiguous.
        selected_title = e.options[0]
        article = wikipedia.page(selected_title)
        full_article_text = clean_page_text(article.content)
    encode_full_article_text = encode_page_text(full_article_text)

    X = []
    for i in range(dataframe.shape[0]):
        x = []
        x.append(dataframe['Total Edits'][i])
        x.append(dataframe['Total Editors'][i])
        x.append(dataframe['Total Bytes'][i])
        x.append(dataframe['Total Characters'][i])
        x.append(dataframe['Total Words'][i])
        # Concatenate page metadata with the page text embeddings.
        X.append(np.hstack([x, list(encode_full_article_text)]))
    return X, article, dataframe, selected_title


@st.cache_data
def get_metadata_features(selected_title):
    creation_date, creator_name, total_edits, total_editors = get_page_info(selected_title)
    total_bytes, total_words, total_chars = get_page_prose(selected_title)
    data = {'Total Edits': [total_edits],
            'Total Editors': [total_editors],
            'Total Bytes': [total_bytes],
            'Total Characters': [total_chars],
            'Total Words': [total_words],
            'Creator Name': [creator_name],
            'Creation Date': [creation_date]}
    dataframe = pd.DataFrame(data)
    return dataframe
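

# A minimal sketch of how these helpers could be wired into a Streamlit page.
# The widget labels, the 'arz' language setting, and the layout below are
# assumptions for illustration; the surrounding app may configure these elsewhere.
if __name__ == '__main__':
    wikipedia.set_lang('arz')  # assumed: match the arz.wikipedia.org XTools endpoints used above

    st.title('Egyptian Arabic Wikipedia article features')
    selected_title = st.text_input('Article title')

    if selected_title:
        X, article, dataframe, resolved_title = prepare_features(selected_title)
        st.write(f'Resolved title: {resolved_title}')
        st.write(article.url)
        st.dataframe(dataframe)                           # page metadata
        st.write(f'Feature vector length: {len(X[0])}')   # metadata + [CLS] embedding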