Commit 55af729
Parent(s): c61f36b
Upload 12 files

- aljazeera.py +28 -0
- css.py +61 -0
- hespress.py +32 -0
- inference.py +63 -0
- main.py +131 -0
- main_scraping.py +21 -0
- mwn.py +60 -0
- requirements.txt +10 -0
- scraping_needs.py +11 -0
- trends_aljazeera.py +40 -0
- trends_hespress.py +68 -0
- trends_mwn.py +37 -0
aljazeera.py
ADDED
@@ -0,0 +1,28 @@
+import requests
+from bs4 import BeautifulSoup
+import regex as re
+import streamlit as st
+
+@st.cache_data(ttl=7200)
+def scrape_aljaz(my_url):
+    #print(my_url)
+    codehtml = requests.get(my_url)
+    page_soup = BeautifulSoup(codehtml.content, "html.parser")
+    # print("The HTML code is:", page_soup)
+    article = page_soup.find("div", {"class": "wysiwyg wysiwyg--all-content css-ibbk12"})
+    try:
+        paragraphe = article.find_all("p")
+    except AttributeError:
+        return 'This is not a valid article, please choose another.'
+    fullArticle = ""
+    i = 0
+    for news in paragraphe:
+        if i == 0:  # skip first iteration
+            i = 1
+            fullArticle = fullArticle + news.text.strip()  # no newline before the first paragraph
+            continue
+        fullArticle = fullArticle + "\n" + news.text.strip()
+
+    # remove blank lines between paragraphs
+    fullArticle = re.sub(r'\n[\t\n\s]+\n*', r"\n", fullArticle)
+    return fullArticle
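The scraper above collects every <p> inside Al Jazeera's article body container and joins them into one cleaned string. A minimal standalone sketch of calling it, assuming a valid Al Jazeera article URL (the URL below is only a placeholder):

from aljazeera import scrape_aljaz

article_url = "https://www.aljazeera.com/news/example-story"  # placeholder URL, for illustration only
text = scrape_aljaz(article_url)
print(text[:300])  # first 300 characters of the cleaned article text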
css.py
ADDED
@@ -0,0 +1,61 @@
+
+ext_css = """
+<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
+"""
+
+
+
+main_css = """
+<style>
+.navbar-light .navbar-brand:focus, .navbar-light .navbar-brand:hover {
+    color: rgba(0,0,0,.9);
+    text-decoration: none;
+    background-color: #6f42c1;
+}
+.navbar.navbar-expand-lg.navbar-light {
+    background-color: red;
+}
+.title {
+    color: black;
+    font-weight: bold;
+    text-align: center;
+    text-decoration: none;
+}
+.title:hover {
+    text-decoration: none;
+}
+div.stButton > button:first-child {
+    background-color: rgb(207, 87, 87);
+}
+div.block-container.css-z5fcl4.egzxvld4 {
+    padding-top: 46px;
+}
+img {
+    width: 100px;
+    height: 100px;
+    margin: 0;
+}
+</style>
+"""
+
+nav_bar_css = """
+<nav class="navbar bg-light navbar-expand-lg navbar-light">
+  <div class="container-fluid">
+    <a class="navbar-brand" href="#">
+      <img src="https://assets.lightfunnels.com/account-3085/images_library/54068d2e-eb65-4140-aa77-7acb8a31b779.presum.pmg.png">
+      <span class="title text-decoration-none">PreSum</span>
+    </a>
+    <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
+      <span class="navbar-toggler-icon"></span>
+    </button>
+    <div class="collapse navbar-collapse" id="navbarNav">
+      <ul class="navbar-nav">
+        <li class="nav-item">
+          <a class="nav-link active" aria-current="page" href="#">About</a>
+        </li>
+      </ul>
+    </div>
+  </div>
+</nav>
+<br>
+"""
hespress.py
ADDED
@@ -0,0 +1,32 @@
+from bs4 import BeautifulSoup
+import requests
+import re
+
+def scrape_hes(url):
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(url, headers=HEADERS)
+
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+    Matches_Details = []
+
+    # find the div carrying the article body class
+    article_content = soup.find("div", {'article-content'})
+
+    all_paragraphes = article_content.find_all("p")  # get all <p> tags
+    # matches_number = len(all_matches)
+    article_text = ""
+    i = 0
+    for x in all_paragraphes:
+        if i == 0:
+            i = 1
+            x = x.text.strip()
+            article_text = article_text + '\n' + x
+            continue
+        x = x.text.strip()
+        article_text = article_text + '\n' + x
+    # remove blank lines
+    article_text = re.sub(r'\n[\t\n\s]+\n*', r"\n", article_text)
+    return article_text.strip()
inference.py
ADDED
@@ -0,0 +1,63 @@
+import streamlit as st
+from transformers import pipeline
+import spacy
+from difflib import SequenceMatcher
+
+nlp = spacy.load("en_core_web_sm")
+
+def get_n_first_sent(text, n=1):  # extract the first sentences of text
+    doc = nlp(text)
+    sentences = [sent.text for sent in doc.sents]
+    if n == -1:  # return all sentences
+        return sentences
+    return sentences[0:n-1]
+
+def rem_similiar(list_sent_text, list_sent_sum, treshhold=0.9):  # uses SequenceMatcher to drop sentences that are too similar
+    for i, sent_sum in enumerate(list_sent_sum):
+        if i == len(list_sent_text):
+            break
+        for sent_text in list_sent_text:  # compute the similarity with all the other sentences
+            score_similarité = SequenceMatcher(None, sent_sum, sent_text).ratio()
+            if score_similarité >= treshhold:
+                list_sent_text.pop(i)
+
+@st.cache_resource
+def load_model():
+    return pipeline("summarization", model="Yahiael1/mymodel_final_v2")
+
+def summary_wrapper(sum_obj, text, min_len, max_len):
+    return sum_obj(text, max_length=max_len,
+                   min_length=min_len,
+                   early_stopping=True,
+                   clean_up_tokenization_spaces=True,
+                   truncation=True,  # max token number = 1024
+                   num_beams=8,  # number of candidate tokens kept after each word; the model then picks one of them; tied to do_sample
+                   #do_sample=True,  # paired with num_beams, uses a non-greedy algorithm to choose the next token
+                   repetition_penalty=1.1,  # penalizes redundant words by lowering their score
+                   temperature=1.3,  # randomly rescales candidate token scores to raise or lower the model's "creativity"
+                   num_beam_groups=4  # must be a divisor of num_beams; adds a mechanism promoting diversity of generated tokens; cannot be used with do_sample
+                   )[0]["summary_text"]
+
+
+def summarize(summarizer_object, desired_length, text):
+    if desired_length == 'long':
+        max_len = 128
+        min_len = 100
+    elif desired_length == 'medium':
+        max_len = 90
+        min_len = 50
+    elif desired_length == 'short':
+        max_len = 40
+        min_len = 10
+
+    first_summary = summary_wrapper(summarizer_object, text, min_len, max_len)
+
+    sent_text = get_n_first_sent(text, 2)  # get the first sentence of the text
+    sent_sum = get_n_first_sent(first_summary, -1)  # get all sentences of the summary
+
+    rem_similiar(sent_text, sent_sum)  # remove the extracted sentences
+    new_text = '\n'.join(sent_text)
+
+    return summary_wrapper(summarizer_object, new_text, min_len, max_len)
+
+
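summarize runs two passes: it summarizes the full text, uses rem_similiar (SequenceMatcher ratios) to drop input sentences nearly identical to the first summary, and then summarizes the filtered text again. A minimal sketch of calling it from another script, assuming the Hub model downloads successfully; the sample text is invented:

from inference import load_model, summarize

summarizer = load_model()  # cached summarization pipeline
sample = "The first sentence states the news. The second adds context. The third gives a quote."  # made-up text
print(summarize(summarizer, 'short', sample))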
main.py
ADDED
@@ -0,0 +1,131 @@
+import streamlit as st
+st.set_page_config(layout="wide")
+
+
+from scraping_needs import list_user_agents  # list of user agents for scraping
+import random
+import streamlit.components.v1 as components
+from streamlit_image_select import image_select
+from main_scraping import main_scrape
+from trends_aljazeera import trends_aljazeera
+from trends_mwn import trends_mwn
+from trends_hespress import trends_hespress
+from css import main_css, nav_bar_css, ext_css
+from inference import *
+
+# load the model
+resumeur = load_model()
+
+mwn_trends = trends_mwn(list_user_agents[19])[0:6]
+hes_tends = trends_hespress()
+aljaz_tends = trends_aljazeera(list_user_agents[11])[0:6]
+
+image_list = []
+caption_list = []
+
+def get_link_from_image(image_link):
+    for i in mwn_trends + hes_tends + aljaz_tends:
+        if image_link == i["image_link"]:
+            return i["article_link"]
+
+def link_to_text(user_input):
+    return main_scrape(user_input)
+
+
+st.markdown(ext_css, unsafe_allow_html=True)
+st.markdown(main_css, unsafe_allow_html=True)
+st.markdown(nav_bar_css, unsafe_allow_html=True)
+
+
+with st.expander(r":balloon: ${\huge \text{Quick grab}}$", expanded=0):
+    option_site = st.selectbox("Choose a news outlet", options=("Morocco World News", "Hespress", "Aljazeera"))
+
+    if option_site == "Morocco World News":
+        for i in range(len(mwn_trends)):
+            image_list.append(mwn_trends[i]["image_link"])
+            caption_list.append(mwn_trends[i]["title"])
+    elif option_site == "Hespress":
+        for i in range(len(hes_tends)):
+            image_list.append(hes_tends[i]["image_link"])
+            caption_list.append(hes_tends[i]["title"])
+    elif option_site == "Aljazeera":
+        for i in range(5):
+            image_list.append(aljaz_tends[i]["image_link"])
+            caption_list.append(aljaz_tends[i]["title"])
+
+
+    img = image_select(
+        label="Choose an article to summarize: ",
+        images=image_list,
+        captions=caption_list,
+        #use_container_width=False
+    )
+
+    sum_from_img = st.button("Get text of article!")
+
+
+col1, col2, col3, col3_5, col4, col5 = st.columns([1.5, 0.2, 1, 0.2, 0.75, 0.8])  # do not change these values; they are tuned for compact mode
+
+with col1:
+    st.markdown("<br>", unsafe_allow_html=True)
+    option = st.selectbox("Choose your input option", options=("Text", "Link"), help="""
+    For link inputs, you can choose any article from these 3 websites:
+    * [Morocco World News](https://www.moroccoworldnews.com/)
+    * [Hespress english](https://en.hespress.com/)
+    * [Aljazeera english](https://www.aljazeera.com/)
+    """
+    )
+
+
+
+with col3:
+    #summary_length = st.radio("Desired summary length:",
+    #                          ["short", "medium", "long"])
+    summary_length = st.select_slider(
+        'Desired summary length:',
+        options=['short', 'medium', 'long'])
+
+with col4:
+    st.markdown("<br>", unsafe_allow_html=True)
+    sum_button = st.button("Summarize")
+
+
+
+flag = 1
+if sum_from_img:
+    chosen_link = str(get_link_from_image(img))
+    option = "Text"
+    text_from_link = st.text_area("Your article's text : ", value=main_scrape(chosen_link))
+    st.session_state.user_text = text_from_link
+    flag = 0
+
+
+if flag:
+    if (option == "Text"):  # make sure lines 118 and 110 are not both executed (if the user chose an image, we shouldn't add another text area)
+        if 'user_text' in st.session_state:
+            text_from_link = st.text_area("Put the text of the article here", value=st.session_state.user_text)
+        else:
+            text_from_link = st.text_area("Put the text of the article here")
+    elif option == "Link":
+        user_input_link = st.text_input("Paste your link here")
+
+
+
+if (sum_button):  # user clicked summarize
+    if (flag == 0):  # article text area already generated
+        st.divider()
+        # model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        # tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        # inputs = tokenizer([user_input], return_tensors='pt')
+        # summary = model.generate(inputs['input_ids'], max_length=summary_lenth, min_length=30, do_sample=False)
+        # st.write([tokenizer.decode(g, skip_special_tokens=True) for g in summary])
+
+    else:  # no text area created yet
+        if (option == "Link"):
+            text_from_link = st.text_area("Your article's text : ", value=link_to_text(user_input_link))
+
+    #summary = summarizer(text_from_link, max_length=100, min_length=30, do_sample=False)
+
+    #st.write(summary[0]['summary_text'])
+    summary = st.text_area('Article summary:', value=summarize(resumeur, summary_length, text_from_link), disabled=True)
main_scraping.py
ADDED
@@ -0,0 +1,21 @@
+from urllib.parse import urlparse
+from hespress import scrape_hes
+from aljazeera import scrape_aljaz
+from mwn import scrape_mwn
+import streamlit as st
+import validators
+
+@st.cache_data
+def main_scrape(url):
+    if not validators.url(url):
+        return "We're sorry, but the link you provided seems to be invalid. Please double-check that the URL is correct and properly formatted."
+
+    domain = urlparse(url).netloc
+    if "moroccoworldnews.com" in domain:
+        return scrape_mwn(url)
+    if "en.hespress.com" in domain:
+        return scrape_hes(url)
+    if "aljazeera.com" in domain:
+        return scrape_aljaz(url)
+    else:
+        return "We're sorry, this website is not currently supported. To see a list of supported websites, click the '?' next to the input option."
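main_scrape first validates the URL, then dispatches on the domain to the matching site scraper, falling back to an "unsupported website" message. A small sketch of that behaviour using only placeholder inputs (no supported-site request is made here):

from main_scraping import main_scrape

print(main_scrape("not a url"))                   # invalid-link message
print(main_scrape("https://example.com/post/1"))  # valid URL but unsupported domain, so the fallback message is returned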
mwn.py
ADDED
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+import re
+
+def scrape_mwn(url):
+    # fetch the HTML
+    html_content = requests.get(url)
+    # parse the HTML
+    soup = BeautifulSoup(html_content.text, 'html.parser')
+    # extract the article's <p> paragraphs
+    app_div = soup.find('div', id='app')
+    p = app_div.attrs["data-page"]
+    p = json.loads(p)  # turn the content into a JSON dict
+    p = p["props"]["post"]["post_content"]  # path to the article's paragraphs
+    # print(json_object)  # debug
+    # parse the text's HTML for cleaning
+    main_text_soup = BeautifulSoup(p, 'html.parser')
+    # extract the <p> tags
+    main_text_soup.find_all('p')
+
+    # remove the copyright notice
+    main_text_soup.find("p", {"class": "article_copyright"}).decompose()
+
+    # remove the "Read also" block
+    for p in main_text_soup.find_all("p"):  # iterate over the paragraphs; 'read also' sits near the end of the text
+        for link_tag in p.find_all("a"):  # look for links
+            joint_list = filter(lambda x: (x != None), [link_tag.find_previous_sibling(string=True), link_tag.find_previous_sibling("strong")])  # elements before the link, with None filtered out
+            for prev in joint_list:  # 'read also' always comes before the link, so only the preceding tags are checked
+                if ('read also:' in prev.string.lower()):  # check whether this is the wanted tag
+                    p.decompose()  # remove the parent tag
+
+
+
+    # remove the paragraphs' subheadings
+    try:
+        for sub_title_tag in filter(lambda x: x != None, (main_text_soup.find_all("h3") + main_text_soup.find_all("strong"))):  # look for strong and h3 tags and iterate
+            if ((sub_title_tag.parent.name != "a") & (len(sub_title_tag.findChildren("a")) == 0)):  # so as not to remove links marked strong
+                if (sub_title_tag.parent.name == "[document]"):  # if the tag is at the top level, i.e. parent = document
+                    #print("self")
+                    sub_title_tag.decompose()  # destroy the tag
+                else:  # if the tag is nested inside another tag
+                    #print("parent decomp")
+                    sub_title_tag.parent.decompose()  # destroy the parent tag
+    except Exception as e:
+        pass
+
+
+
+    text = main_text_soup.text.strip()  # text
+
+    # remove the city name at the beginning
+    if (re.search(r"\s-\s", text[:20])):  # look for a city name in the first 20 chars
+        city_endpos = re.search(r"\s-\s", text[:20]).end()
+        text = text[city_endpos:]  # remove the city at the start of the text
+
+    # remove blank lines between paragraphs
+    text = re.sub(r'\n[\t\n\s]+\n*', r"\n", text)
+
+    return text
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+requests
+regex
+streamlit
+Transformers
+bs4
+streamlit_image_select
+validators
+torch
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
scraping_needs.py
ADDED
@@ -0,0 +1,11 @@
+import requests
+
+list_user_agents = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"]
+
+def get_content(url, user_agent, bool_text):
+    r = requests.get(url, headers={'User-Agent': user_agent})
+    if r.status_code == 200:
+        if bool_text:
+            return str(r.text)
+        return r.content
+    return 0
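get_content returns the response body as text or bytes depending on bool_text, and 0 when the status code is not 200, so callers need to handle that sentinel. A hedged usage sketch with a placeholder URL:

import random
from scraping_needs import list_user_agents, get_content

ua = random.choice(list_user_agents)               # any user agent from the list
body = get_content("https://example.com/", ua, 0)  # bytes on success (placeholder URL)
if body == 0:
    print("request failed: non-200 status code")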
trends_aljazeera.py
ADDED
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup as soup
+import streamlit as st
+import re
+from scraping_needs import get_content
+
+
+@st.cache_data(ttl=7800)  # cache clears after 7800s
+def trends_aljazeera(user_agent):
+    main_url = "https://www.aljazeera.com/"
+    contenaire = soup(get_content(main_url, user_agent, 0), "html.parser")
+
+    headers_link = contenaire.find_all("li", {"class": "fte-featured-articles-list__item"})
+
+
+
+    H3 = []
+    for i in headers_link:
+        Live = i.find(class_="post-label__text")
+        if (Live != None):
+            if (Live.text == "Live updates"):
+                continue
+
+
+
+        pic = {}
+        title = i.find(class_="fte-article__title").find('span').text
+        picture = i.find("img")
+        image_url = main_url + picture.attrs["src"]
+        image_url = re.sub(r"\?(.*)", "", image_url)
+        image_url2 = image_url + "?resize=900%2C500%"
+
+        link = i.find("a")
+        article_link = main_url + link.attrs["href"]
+
+        pic["title"] = title
+        pic["image_link"] = image_url2
+        pic["article_link"] = article_link
+
+        H3.append(pic)
+    return H3
trends_hespress.py
ADDED
@@ -0,0 +1,68 @@
+from bs4 import BeautifulSoup
+import requests
+import streamlit as st
+
+
+
+def get_trends_image(page):
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(
+        page,
+        headers=HEADERS)
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+
+    image = soup.find("img")  # first <img> on the article page
+
+
+
+    image_link = image.get('src')
+    return image_link
+
+
+
+@st.cache_data(ttl=7200)  # cache clears after 7200s
+def trends_hespress():
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(
+        'https://en.hespress.com/',
+        headers=HEADERS)
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+
+    page = soup.find("div", {'left-side heading-box col'})  # find the div carrying this class
+    all_trends = page.find_all("a", {'wpp-post-title'})  # get all <a> tags
+    article_text = ""
+    trends_list = []
+    for x in all_trends:
+
+        trend_link = x.get('href')
+        trend_title = x.text
+        trend_image = get_trends_image(trend_link)
+
+        subdict = {'title': trend_title,
+                   'image_link': trend_image,
+                   'article_link': trend_link}
+        trends_list.append(subdict)
+    extra_page = soup.find("div", {'group-item col-sm-12 col-md-6 col-xl-4 category-society bloc_col'})
+    extra_trend = extra_page.find("div", {'ratio-medium'})
+    image = extra_trend.find("img")
+    trend_image = image.get('src')
+    trend_title = image.get('alt')
+    trend_link = extra_page.find_all('a')
+    trend_link = trend_link[1].get('href')
+
+
+    trend_4 = {'title': trend_title,
+               'image_link': trend_image,
+               'article_link': trend_link}
+    trends_list.append(trend_4)
+    return trends_list
+
+
+
+
trends_mwn.py
ADDED
@@ -0,0 +1,37 @@
+import re
+import json
+import streamlit as st
+from scraping_needs import get_content
+
+@st.cache_data(ttl=7800)  # cache clears after 7800s
+def trends_mwn(user_agent):
+
+    # the page uses javascript
+    str_main_list = get_content("https://www.moroccoworldnews.com/home/post/zheadlines", user_agent, 1)  # we want text, not HTML
+
+    main_list = json.loads(str_main_list)
+
+
+    cards_content = []
+
+    for card in main_list:
+        content_dict = {}
+
+        content_dict["image_link"] = card["thumb"]
+        content_dict["title"] = card["post_title"]
+
+        try:
+            tmp = card["tsize"]
+        except KeyError:
+            try:
+                tmp = card["msize"]
+            except KeyError:
+                tmp = card["lsize"]
+        year = re.search('20\d\d(?=[\\\/])', tmp).group()
+        month = re.search('(?<=[\\\/])\d{1,2}(?=[\\\/])', tmp).group()
+
+        content_dict["article_link"] = "https://www.moroccoworldnews.com/" + str(year) + "/" + str(month) + "/" + str(card["ID"]) + "/" + card["post_name"]
+
+        cards_content.append(content_dict)
+
+    return cards_content
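Each trends_* function is expected to return entries with the same three keys, which is what get_link_from_image and the image picker in main.py rely on. A sketch of the assumed shape, with invented values:

example_card = {
    "title": "Example headline",                    # caption shown under the thumbnail
    "image_link": "https://example.com/thumb.jpg",  # image passed to image_select
    "article_link": "https://example.com/article",  # link recovered by get_link_from_image
}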