Python-proje committed
Commit 55af729 · 1 Parent(s): c61f36b

Upload 12 files

Files changed (12)
  1. aljazeera.py +28 -0
  2. css.py +61 -0
  3. hespress.py +32 -0
  4. inference.py +63 -0
  5. main.py +131 -0
  6. main_scraping.py +21 -0
  7. mwn.py +60 -0
  8. requirements.txt +10 -0
  9. scraping_needs.py +11 -0
  10. trends_aljazeera.py +40 -0
  11. trends_hespress.py +68 -0
  12. trends_mwn.py +37 -0
aljazeera.py ADDED
@@ -0,0 +1,28 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import regex as re
+ import streamlit as st
+
+ @st.cache_data(ttl=7200)  # cache scraped articles for two hours
+ def scrape_aljaz(my_url):
+     codehtml = requests.get(my_url)
+     page_soup = BeautifulSoup(codehtml.content, "html.parser")
+     article = page_soup.find("div", {"class": "wysiwyg wysiwyg--all-content css-ibbk12"})
+     try:
+         paragraphe = article.find_all("p")
+     except AttributeError:  # article is None: the page has no article body
+         return 'This is not a valid article, please choose another.'
+     fullArticle = ""
+     i = 0
+     for news in paragraphe:
+         if i == 0:  # no newline before the first paragraph
+             i = 1
+             fullArticle = fullArticle + news.text.strip()
+             continue
+         fullArticle = fullArticle + "\n" + news.text.strip()
+
+     # collapse runs of blank lines between paragraphs
+     fullArticle = re.sub(r'\n[\t\n\s]+\n*', r"\n", fullArticle)
+     return fullArticle
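A quick sanity check of this scraper, as a minimal sketch; the URL below is a placeholder, but any www.aljazeera.com article link goes through the same path:

    from aljazeera import scrape_aljaz

    # hypothetical article link, shown only for illustration
    article = scrape_aljaz("https://www.aljazeera.com/news/2023/1/1/example-article")
    print(article[:200])  # first 200 characters of the cleaned text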
css.py ADDED
@@ -0,0 +1,61 @@
+ ext_css = """
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
+ """
+
+ main_css = """
+ <style>
+ .navbar-light .navbar-brand:focus, .navbar-light .navbar-brand:hover {
+     color: rgba(0,0,0,.9);
+     text-decoration: none;
+     background-color: #6f42c1;
+ }
+ .navbar.navbar-expand-lg.navbar-light {
+     background-color: red;
+ }
+ .title {
+     color: black;
+     font-weight: bold;
+     text-align: center;
+     text-decoration: none;
+ }
+ .title:hover {
+     text-decoration: none;
+ }
+ div.stButton > button:first-child {
+     background-color: rgb(207, 87, 87);
+ }
+ div.block-container.css-z5fcl4.egzxvld4 {
+     padding-top: 46px;
+ }
+ img {
+     width: 100px;
+     height: 100px;
+     margin: 0;
+ }
+ </style>
+ """
+
+ nav_bar_css = """
+ <nav class="navbar bg-light navbar-expand-lg navbar-light">
+     <div class="container-fluid">
+         <a class="navbar-brand" href="#">
+             <img src="https://assets.lightfunnels.com/account-3085/images_library/54068d2e-eb65-4140-aa77-7acb8a31b779.presum.pmg.png">
+             <span class="title text-decoration-none">PreSum</span>
+         </a>
+         <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
+             <span class="navbar-toggler-icon"></span>
+         </button>
+         <div class="collapse navbar-collapse" id="navbarNav">
+             <ul class="navbar-nav">
+                 <li class="nav-item">
+                     <a class="nav-link active" aria-current="page" href="#">About</a>
+                 </li>
+             </ul>
+         </div>
+     </div>
+ </nav>
+ <br>
+ """
hespress.py ADDED
@@ -0,0 +1,32 @@
+ from bs4 import BeautifulSoup
+ import requests
+ import re
+
+ def scrape_hes(url):
+     HEADERS = {
+         'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+     page = requests.get(url, headers=HEADERS)
+
+     src = page.content  # raw page content
+     soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+
+     # the div carrying the article-content class holds the article body
+     article_content = soup.find("div", {'article-content'})
+
+     all_paragraphes = article_content.find_all("p")  # all <p> tags
+     article_text = ""
+     for x in all_paragraphes:
+         article_text = article_text + '\n' + x.text.strip()
+     # collapse runs of blank lines; the final strip removes the leading newline
+     article_text = re.sub(r'\n[\t\n\s]+\n*', r"\n", article_text)
+     return article_text.strip()
inference.py ADDED
@@ -0,0 +1,63 @@
+ import streamlit as st
+ from transformers import pipeline
+ import spacy
+ from difflib import SequenceMatcher
+
+ nlp = spacy.load("en_core_web_sm")
+
+ def get_n_first_sent(text, n=1):  # extract the first n sentences of text
+     doc = nlp(text)
+     sentences = [sent.text for sent in doc.sents]
+     if n == -1:  # return all sentences
+         return sentences
+     return sentences[:n]
+
+ def rem_similar(list_sent_text, list_sent_sum, threshold=0.9):
+     # remove from list_sent_text every sentence that is near-identical
+     # (SequenceMatcher ratio >= threshold) to some summary sentence;
+     # the slice assignment mutates the list in place for the caller
+     for sent_sum in list_sent_sum:
+         list_sent_text[:] = [sent_text for sent_text in list_sent_text
+                              if SequenceMatcher(None, sent_sum, sent_text).ratio() < threshold]
+
+ @st.cache_resource
+ def load_model():
+     return pipeline("summarization", model="Yahiael1/mymodel_final_v2")
+
+ def summary_wrapper(sum_obj, text, min_len, max_len):
+     return sum_obj(text, max_length=max_len,
+                    min_length=min_len,
+                    early_stopping=True,
+                    clean_up_tokenization_spaces=True,
+                    truncation=True,  # max token count = 1024
+                    num_beams=8,  # number of candidate sequences explored at each step; interacts with do_sample
+                    #do_sample=True,  # paired with num_beams, picks the next token non-greedily
+                    repetition_penalty=1.1,  # penalizes repeated words by lowering their scores
+                    temperature=1.3,  # rescales token scores to raise or lower the model's "creativity"
+                    num_beam_groups=4  # must divide num_beams; promotes diversity among beams; incompatible with do_sample
+                    )[0]["summary_text"]
+
+ def summarize(summarizer_object, desired_length, text):
+     if desired_length == 'long':
+         max_len = 128
+         min_len = 100
+     elif desired_length == 'medium':
+         max_len = 90
+         min_len = 50
+     elif desired_length == 'short':
+         max_len = 40
+         min_len = 10
+
+     first_summary = summary_wrapper(summarizer_object, text, min_len, max_len)
+
+     sent_text = get_n_first_sent(text, 2)  # first 2 sentences of the text
+     sent_sum = get_n_first_sent(first_summary, -1)  # all sentences of the summary
+
+     rem_similar(sent_text, sent_sum)  # drop leading sentences already covered by the summary
+     new_text = '\n'.join(sent_text)
+
+     return summary_wrapper(summarizer_object, new_text, min_len, max_len)
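A minimal sketch of driving the two-pass flow above, assuming the model checkpoint downloads successfully; the article string is a stand-in:

    from inference import load_model, summarize

    summarizer = load_model()  # cached across Streamlit reruns
    article = "..."  # stand-in: paste any English news article text here
    for length in ("short", "medium", "long"):
        print(length, "->", summarize(summarizer, length, article))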
main.py ADDED
@@ -0,0 +1,131 @@
+ import streamlit as st
+ st.set_page_config(layout="wide")
+
+
+ from scraping_needs import list_user_agents  # list of user agents for scraping
+ import random
+ import streamlit.components.v1 as components
+ from streamlit_image_select import image_select
+ from main_scraping import main_scrape
+ from trends_aljazeera import trends_aljazeera
+ from trends_mwn import trends_mwn
+ from trends_hespress import trends_hespress
+ from css import main_css, nav_bar_css, ext_css
+ from inference import *
+
+ # load the summarization model
+ resumeur = load_model()
+
+ mwn_trends = trends_mwn(list_user_agents[19])[0:6]
+ hes_tends = trends_hespress()
+ aljaz_tends = trends_aljazeera(list_user_agents[11])[0:6]
+
+ image_list = []
+ caption_list = []
+
+ def get_link_from_image(image_link):
+     for i in mwn_trends + hes_tends + aljaz_tends:
+         if image_link == i["image_link"]:
+             return i["article_link"]
+
+ def link_to_text(user_input):
+     return main_scrape(user_input)
+
+
+ st.markdown(ext_css, unsafe_allow_html=True)
+ st.markdown(main_css, unsafe_allow_html=True)
+ st.markdown(nav_bar_css, unsafe_allow_html=True)
+
+
+ with st.expander(r":balloon: ${\huge \text{Quick grab}}$", expanded=False):
+     option_site = st.selectbox("Choose a news outlet", options=("Morocco World News", "Hespress", "Aljazeera"))
+
+     if option_site == "Morocco World News":
+         for i in range(len(mwn_trends)):
+             image_list.append(mwn_trends[i]["image_link"])
+             caption_list.append(mwn_trends[i]["title"])
+     elif option_site == "Hespress":
+         for i in range(len(hes_tends)):
+             image_list.append(hes_tends[i]["image_link"])
+             caption_list.append(hes_tends[i]["title"])
+     elif option_site == "Aljazeera":
+         for i in range(5):
+             image_list.append(aljaz_tends[i]["image_link"])
+             caption_list.append(aljaz_tends[i]["title"])
+
+     img = image_select(
+         label="Choose an article to summarize: ",
+         images=image_list,
+         captions=caption_list,
+         #use_container_width=False
+     )
+
+     sum_from_img = st.button("Get text of article!")
+
+
+ col1, col2, col3, col3_5, col4, col5 = st.columns([1.5, 0.2, 1, 0.2, 0.75, 0.8])  # don't change these ratios; they're tuned for compact mode
+
+ with col1:
+     st.markdown("<br>", unsafe_allow_html=True)
+     option = st.selectbox("Choose your input option", options=("Text", "Link"), help="""
+     For link inputs, you can choose any article from these 3 websites:
+     * [Morocco World News](https://www.moroccoworldnews.com/)
+     * [Hespress english](https://en.hespress.com/)
+     * [Aljazeera english](https://www.aljazeera.com/)
+     """
+     )
+
+
+ with col3:
+     summary_length = st.select_slider(
+         'Desired summary length:',
+         options=['short', 'medium', 'long'])
+
+ with col4:
+     st.markdown("<br>", unsafe_allow_html=True)
+     sum_button = st.button("Summarize")
+
+
+ flag = 1  # 1: no text area created yet; 0: the image flow already created one
+ if sum_from_img:
+     chosen_link = str(get_link_from_image(img))
+     option = "Text"
+     text_from_link = st.text_area("Your article's text : ", value=main_scrape(chosen_link))
+     st.session_state.user_text = text_from_link
+     flag = 0
+
+
+ if flag:
+     # if the user grabbed an article by image, don't add a second text area
+     if option == "Text":
+         if 'user_text' in st.session_state:
+             text_from_link = st.text_area("Put the text of the article here", value=st.session_state.user_text)
+         else:
+             text_from_link = st.text_area("Put the text of the article here")
+     elif option == "Link":
+         user_input_link = st.text_input("Paste your link here")
+
+
+ if sum_button:  # user clicked Summarize
+     if flag == 0:  # the article text area was already generated above
+         st.divider()
+     else:  # no text area created yet
+         if option == "Link":
+             text_from_link = st.text_area("Your article's text : ", value=link_to_text(user_input_link))
+
+     summary = st.text_area('Article summary:', value=summarize(resumeur, summary_length, text_from_link), disabled=True)
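With the dependencies from requirements.txt below installed, the app launches the usual Streamlit way:

    streamlit run main.py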
main_scraping.py ADDED
@@ -0,0 +1,21 @@
+ from urllib.parse import urlparse
+ from hespress import scrape_hes
+ from aljazeera import scrape_aljaz
+ from mwn import scrape_mwn
+ import streamlit as st
+ import validators
+
+ @st.cache_data
+ def main_scrape(url):
+     if not validators.url(url):
+         return "We're sorry, but the link you provided seems to be invalid. Please double-check that the URL is correct and properly formatted."
+
+     # dispatch to the right scraper based on the link's domain
+     domain = urlparse(url).netloc
+     if "moroccoworldnews.com" in domain:
+         return scrape_mwn(url)
+     elif "en.hespress.com" in domain:
+         return scrape_hes(url)
+     elif "aljazeera.com" in domain:
+         return scrape_aljaz(url)
+     else:
+         return "We're sorry, this website is not currently supported. To see a list of supported websites, click the '?' next to the input option."
mwn.py ADDED
@@ -0,0 +1,60 @@
+ from bs4 import BeautifulSoup
+ import requests
+ import json
+ import re
+
+ def scrape_mwn(url):
+     # fetch the HTML
+     html_content = requests.get(url)
+     # parse the HTML
+     soup = BeautifulSoup(html_content.text, 'html.parser')
+     # the article lives in a JSON blob on the app div (the page is rendered with JavaScript)
+     app_div = soup.find('div', id='app')
+     p = app_div.attrs["data-page"]
+     p = json.loads(p)  # decode the JSON payload into a dict
+     p = p["props"]["post"]["post_content"]  # path to the article's paragraphs
+     # re-parse the article HTML for cleaning
+     main_text_soup = BeautifulSoup(p, 'html.parser')
+
+     # remove the copyright notice
+     main_text_soup.find("p", {"class": "article_copyright"}).decompose()
+
+     # remove the "Read also" paragraphs
+     for p in main_text_soup.find_all("p"):  # iterate over paragraphs; 'read also' sits near the end of the text
+         for link_tag in p.find_all("a"):  # look for links
+             # elements just before the link, with None entries dropped
+             joint_list = filter(lambda x: x is not None, [link_tag.find_previous_sibling(string=True), link_tag.find_previous_sibling("strong")])
+             for prev in joint_list:  # 'read also' always precedes the link, so only preceding tags are checked
+                 if 'read also:' in prev.string.lower():  # is this the tag we want?
+                     p.decompose()  # remove the parent paragraph
+
+     # remove paragraph subheadings
+     try:
+         for sub_title_tag in filter(lambda x: x is not None, (main_text_soup.find_all("h3") + main_text_soup.find_all("strong"))):  # iterate over h3 and strong tags
+             if (sub_title_tag.parent.name != "a") and (len(sub_title_tag.findChildren("a")) == 0):  # don't remove links marked strong
+                 if sub_title_tag.parent.name == "[document]":  # the tag is top-level, i.e. its parent is the document
+                     sub_title_tag.decompose()  # destroy the tag itself
+                 else:  # the tag is nested inside another element
+                     sub_title_tag.parent.decompose()  # destroy the parent tag
+     except Exception:
+         pass
+
+     text = main_text_soup.text.strip()  # plain text
+
+     # strip the city name at the start of the article
+     if re.search(r"\s-\s", text[:20]):  # look for a city marker in the first 20 characters
+         city_endpos = re.search(r"\s-\s", text[:20]).end()
+         text = text[city_endpos:]
+
+     # collapse runs of blank lines between paragraphs
+     text = re.sub(r'\n[\t\n\s]+\n*', r"\n", text)
+
+     return text
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ requests
+ regex
+ streamlit
+ transformers
+ bs4
+ streamlit_image_select
+ validators
+ torch
+ spacy
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
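Everything, including the pinned spaCy English model wheel on the last line, installs in one step:

    pip install -r requirements.txt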
scraping_needs.py ADDED
@@ -0,0 +1,11 @@
+ import requests
+
+ # pool of desktop user-agent strings, rotated so scraping requests look like ordinary browsers
+ list_user_agents = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"]
+
+ def get_content(url, user_agent, bool_text):
+     r = requests.get(url, headers={'User-Agent': user_agent})
+     if r.status_code == 200:
+         if bool_text:
+             return str(r.text)  # decoded text (e.g. for JSON endpoints)
+         return r.content  # raw bytes (for HTML parsing)
+     return 0  # signals a failed request
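A sketch of fetching a page with a randomly rotated user agent; the callers in this repo pass fixed indices instead:

    import random
    from scraping_needs import get_content, list_user_agents

    # bool_text=0 returns raw bytes for BeautifulSoup; 1 returns decoded text for json.loads
    html = get_content("https://www.aljazeera.com/", random.choice(list_user_agents), 0)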
trends_aljazeera.py ADDED
@@ -0,0 +1,40 @@
+ from bs4 import BeautifulSoup as soup
+ import streamlit as st
+ import re
+ from scraping_needs import get_content
+
+
+ @st.cache_data(ttl=7800)  # cache clears after 7800 s
+ def trends_aljazeera(user_agent):
+     main_url = "https://www.aljazeera.com/"
+     contenaire = soup(get_content(main_url, user_agent, 0), "html.parser")
+
+     headers_link = contenaire.find_all("li", {"class": "fte-featured-articles-list__item"})
+
+     H3 = []
+     for i in headers_link:
+         Live = i.find(class_="post-label__text")
+         if Live is not None:
+             if Live.text == "Live updates":  # skip live blogs
+                 continue
+
+         pic = {}
+         title = i.find(class_="fte-article__title").find('span').text
+         picture = i.find("img")
+         image_url = main_url + picture.attrs["src"]
+         image_url = re.sub(r"\?(.*)", "", image_url)  # drop the query string
+         image_url2 = image_url + "?resize=900%2C500%"  # request a 900x500 rendition
+
+         link = i.find("a")
+         article_link = main_url + link.attrs["href"]
+
+         pic["title"] = title
+         pic["image_link"] = image_url2
+         pic["article_link"] = article_link
+
+         H3.append(pic)
+     return H3
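Each entry the three trends functions return carries the same three keys, which is what main.py relies on. A consuming sketch (the user-agent index is arbitrary):

    from scraping_needs import list_user_agents
    from trends_aljazeera import trends_aljazeera

    for trend in trends_aljazeera(list_user_agents[0])[:3]:
        print(trend["title"], trend["article_link"], trend["image_link"])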
trends_hespress.py ADDED
@@ -0,0 +1,68 @@
+ from bs4 import BeautifulSoup
+ import requests
+ import streamlit as st
+
+
+ def get_trends_image(page):
+     HEADERS = {
+         'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+     page = requests.get(
+         page,
+         headers=HEADERS)
+     src = page.content  # raw page content
+     soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+
+     image = soup.find("img")  # the first image on the article page is its thumbnail
+     image_link = image.get('src')
+     return image_link
+
+
+ @st.cache_data(ttl=7200)  # cache clears after 7200 s
+ def trends_hespress():
+     HEADERS = {
+         'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+     page = requests.get(
+         'https://en.hespress.com/',
+         headers=HEADERS)
+     src = page.content  # raw page content
+     soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+
+     page = soup.find("div", {'left-side heading-box col'})  # box listing the trending posts
+     all_trends = page.find_all("a", {'wpp-post-title'})  # all trend links
+     trends_list = []
+     for x in all_trends:
+         trend_link = x.get('href')
+         trend_title = x.text
+         trend_image = get_trends_image(trend_link)
+
+         subdict = {'title': trend_title,
+                    'image_link': trend_image,
+                    'article_link': trend_link}
+         trends_list.append(subdict)
+
+     # grab one extra trend from the society block to fill out the grid
+     extra_page = soup.find("div", {'group-item col-sm-12 col-md-6 col-xl-4 category-society bloc_col'})
+     extra_trend = extra_page.find("div", {'ratio-medium'})
+     image = extra_trend.find("img")
+     trend_image = image.get('src')
+     trend_title = image.get('alt')
+     trend_link = extra_page.find_all('a')
+     trend_link = trend_link[1].get('href')
+
+     trend_4 = {'title': trend_title,
+                'image_link': trend_image,
+                'article_link': trend_link}
+     trends_list.append(trend_4)
+     return trends_list
trends_mwn.py ADDED
@@ -0,0 +1,37 @@
+ import re
+ import json
+ import streamlit as st
+ from scraping_needs import get_content
+
+ @st.cache_data(ttl=7800)  # cache clears after 7800 s
+ def trends_mwn(user_agent):
+
+     # the page itself is rendered with JavaScript, so query the JSON headlines endpoint instead
+     str_main_list = get_content("https://www.moroccoworldnews.com/home/post/zheadlines", user_agent, 1)  # we want text, not HTML
+
+     main_list = json.loads(str_main_list)
+
+     cards_content = []
+
+     for card in main_list:
+         content_dict = {}
+
+         content_dict["image_link"] = card["thumb"]
+         content_dict["title"] = card["post_title"]
+
+         # the year and month live in whichever thumbnail-size path the card carries
+         try:
+             tmp = card["tsize"]
+         except KeyError:
+             try:
+                 tmp = card["msize"]
+             except KeyError:
+                 tmp = card["lsize"]
+         year = re.search(r'20\d\d(?=[\\/])', tmp).group()
+         month = re.search(r'(?<=[\\/])\d{1,2}(?=[\\/])', tmp).group()
+
+         content_dict["article_link"] = "https://www.moroccoworldnews.com/" + str(year) + "/" + str(month) + "/" + str(card["ID"]) + "/" + card["post_name"]
+
+         cards_content.append(content_dict)
+
+     return cards_content