Commit 55af729
Parent(s): c61f36b
Upload 12 files

- aljazeera.py +28 -0
- css.py +61 -0
- hespress.py +32 -0
- inference.py +63 -0
- main.py +131 -0
- main_scraping.py +21 -0
- mwn.py +60 -0
- requirements.txt +10 -0
- scraping_needs.py +11 -0
- trends_aljazeera.py +40 -0
- trends_hespress.py +68 -0
- trends_mwn.py +37 -0
aljazeera.py
ADDED
@@ -0,0 +1,28 @@
+import requests
+from bs4 import BeautifulSoup
+import regex as re
+import streamlit as st
+
+@st.cache_data(ttl=7200)
+def scrape_aljaz(my_url):
+    #print(my_url)
+    codehtml = requests.get(my_url)
+    page_soup = BeautifulSoup(codehtml.content, "html.parser")
+    # print("The HTML code is:", page_soup)
+    article = page_soup.find("div", {"class": "wysiwyg wysiwyg--all-content css-ibbk12"})
+    try:
+        paragraphe = article.find_all("p")
+    except AttributeError:
+        return 'This is not a valid article, please choose another.'
+    fullArticle = ""
+    i = 0
+    for news in paragraphe:
+        if i == 0:  # skip first iteration
+            i = 1
+            fullArticle = fullArticle + news.text.strip()  # no newline before the first paragraph
+            continue
+        fullArticle = fullArticle + "\n" + news.text.strip()
+
+    # remove blank lines between paragraphs
+    fullArticle = re.sub(r'\n[\t\n\s]+\n*', r"\n", fullArticle)
+    return fullArticle
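The scraper above collects every <p> inside Al Jazeera's article body container and joins them into one cleaned string. A minimal standalone sketch of calling it, assuming a valid Al Jazeera article URL (the URL below is only a placeholder):

from aljazeera import scrape_aljaz

article_url = "https://www.aljazeera.com/news/example-story"  # placeholder URL, for illustration only
text = scrape_aljaz(article_url)
print(text[:300])  # first 300 characters of the cleaned article text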
css.py
ADDED
@@ -0,0 +1,61 @@
+
+ext_css = """
+<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
+"""
+
+
+
+main_css = """
+<style>
+.navbar-light .navbar-brand:focus, .navbar-light .navbar-brand:hover {
+    color: rgba(0,0,0,.9);
+    text-decoration: none;
+    background-color: #6f42c1;
+}
+.navbar.navbar-expand-lg.navbar-light {
+    background-color: red;
+}
+.title {
+    color: black;
+    font-weight: bold;
+    text-align: center;
+    text-decoration: none;
+}
+.title:hover {
+    text-decoration: none;
+}
+div.stButton > button:first-child {
+    background-color: rgb(207, 87, 87);
+}
+div.block-container.css-z5fcl4.egzxvld4 {
+    padding-top: 46px;
+}
+img {
+    width: 100px;
+    height: 100px;
+    margin: 0;
+}
+</style>
+"""
+
+nav_bar_css = """
+<nav class="navbar bg-light navbar-expand-lg navbar-light">
+  <div class="container-fluid">
+    <a class="navbar-brand" href="#">
+      <img src="https://assets.lightfunnels.com/account-3085/images_library/54068d2e-eb65-4140-aa77-7acb8a31b779.presum.pmg.png">
+      <span class="title text-decoration-none">PreSum</span>
+    </a>
+    <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
+      <span class="navbar-toggler-icon"></span>
+    </button>
+    <div class="collapse navbar-collapse" id="navbarNav">
+      <ul class="navbar-nav">
+        <li class="nav-item">
+          <a class="nav-link active" aria-current="page" href="#">About</a>
+        </li>
+      </ul>
+    </div>
+  </div>
+</nav>
+<br>
+"""
hespress.py
ADDED
@@ -0,0 +1,32 @@
+from bs4 import BeautifulSoup
+import requests
+import re
+
+def scrape_hes(url):
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(url, headers=HEADERS)
+
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+    Matches_Details = []
+
+    # find the div carrying the article body class
+    article_content = soup.find("div", {'article-content'})
+
+    all_paragraphes = article_content.find_all("p")  # get all <p> tags
+    # matches_number = len(all_matches)
+    article_text = ""
+    i = 0
+    for x in all_paragraphes:
+        if i == 0:
+            i = 1
+            x = x.text.strip()
+            article_text = article_text + '\n' + x
+            continue
+        x = x.text.strip()
+        article_text = article_text + '\n' + x
+    # remove blank lines
+    article_text = re.sub(r'\n[\t\n\s]+\n*', r"\n", article_text)
+    return article_text.strip()
inference.py
ADDED
@@ -0,0 +1,63 @@
+import streamlit as st
+from transformers import pipeline
+import spacy
+from difflib import SequenceMatcher
+
+nlp = spacy.load("en_core_web_sm")
+
+def get_n_first_sent(text, n=1):  # extract the first sentences of text
+    doc = nlp(text)
+    sentences = [sent.text for sent in doc.sents]
+    if n == -1:  # return all sentences
+        return sentences
+    return sentences[0:n-1]
+
+def rem_similiar(list_sent_text, list_sent_sum, treshhold=0.9):  # uses SequenceMatcher to drop sentences that are too similar
+    for i, sent_sum in enumerate(list_sent_sum):
+        if i == len(list_sent_text):
+            break
+        for sent_text in list_sent_text:  # compute the similarity with all the other sentences
+            score_similarité = SequenceMatcher(None, sent_sum, sent_text).ratio()
+            if score_similarité >= treshhold:
+                list_sent_text.pop(i)
+
+@st.cache_resource
+def load_model():
+    return pipeline("summarization", model="Yahiael1/mymodel_final_v2")
+
+def summary_wrapper(sum_obj, text, min_len, max_len):
+    return sum_obj(text, max_length=max_len,
+                   min_length=min_len,
+                   early_stopping=True,
+                   clean_up_tokenization_spaces=True,
+                   truncation=True,  # max token number = 1024
+                   num_beams=8,  # number of candidate tokens kept after each word; the model then picks one of them; tied to do_sample
+                   #do_sample=True,  # paired with num_beams, uses a non-greedy algorithm to choose the next token
+                   repetition_penalty=1.1,  # penalizes redundant words by lowering their score
+                   temperature=1.3,  # randomly rescales candidate token scores to raise or lower the model's "creativity"
+                   num_beam_groups=4  # must be a divisor of num_beams; adds a mechanism promoting diversity of generated tokens; cannot be used with do_sample
+                   )[0]["summary_text"]
+
+
+def summarize(summarizer_object, desired_length, text):
+    if desired_length == 'long':
+        max_len = 128
+        min_len = 100
+    elif desired_length == 'medium':
+        max_len = 90
+        min_len = 50
+    elif desired_length == 'short':
+        max_len = 40
+        min_len = 10
+
+    first_summary = summary_wrapper(summarizer_object, text, min_len, max_len)
+
+    sent_text = get_n_first_sent(text, 2)  # get the first sentence of the text
+    sent_sum = get_n_first_sent(first_summary, -1)  # get all sentences of the summary
+
+    rem_similiar(sent_text, sent_sum)  # remove the extracted sentences
+    new_text = '\n'.join(sent_text)
+
+    return summary_wrapper(summarizer_object, new_text, min_len, max_len)
+
+
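summarize runs two passes: it summarizes the full text, uses rem_similiar (SequenceMatcher ratios) to drop input sentences nearly identical to the first summary, and then summarizes the filtered text again. A minimal sketch of calling it from another script, assuming the Hub model downloads successfully; the sample text is invented:

from inference import load_model, summarize

summarizer = load_model()  # cached summarization pipeline
sample = "The first sentence states the news. The second adds context. The third gives a quote."  # made-up text
print(summarize(summarizer, 'short', sample))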
main.py
ADDED
@@ -0,0 +1,131 @@
+import streamlit as st
+st.set_page_config(layout="wide")
+
+
+from scraping_needs import list_user_agents  # list of user agents for scraping
+import random
+import streamlit.components.v1 as components
+from streamlit_image_select import image_select
+from main_scraping import main_scrape
+from trends_aljazeera import trends_aljazeera
+from trends_mwn import trends_mwn
+from trends_hespress import trends_hespress
+from css import main_css, nav_bar_css, ext_css
+from inference import *
+
+# load the model
+resumeur = load_model()
+
+mwn_trends = trends_mwn(list_user_agents[19])[0:6]
+hes_tends = trends_hespress()
+aljaz_tends = trends_aljazeera(list_user_agents[11])[0:6]
+
+image_list = []
+caption_list = []
+
+def get_link_from_image(image_link):
+    for i in mwn_trends + hes_tends + aljaz_tends:
+        if image_link == i["image_link"]:
+            return i["article_link"]
+
+def link_to_text(user_input):
+    return main_scrape(user_input)
+
+
+st.markdown(ext_css, unsafe_allow_html=True)
+st.markdown(main_css, unsafe_allow_html=True)
+st.markdown(nav_bar_css, unsafe_allow_html=True)
+
+
+with st.expander(r":balloon: ${\huge \text{Quick grab}}$", expanded=0):
+    option_site = st.selectbox("Choose a news outlet", options=("Morocco World News", "Hespress", "Aljazeera"))
+
+    if option_site == "Morocco World News":
+        for i in range(len(mwn_trends)):
+            image_list.append(mwn_trends[i]["image_link"])
+            caption_list.append(mwn_trends[i]["title"])
+    elif option_site == "Hespress":
+        for i in range(len(hes_tends)):
+            image_list.append(hes_tends[i]["image_link"])
+            caption_list.append(hes_tends[i]["title"])
+    elif option_site == "Aljazeera":
+        for i in range(5):
+            image_list.append(aljaz_tends[i]["image_link"])
+            caption_list.append(aljaz_tends[i]["title"])
+
+
+    img = image_select(
+        label="Choose an article to summarize: ",
+        images=image_list,
+        captions=caption_list,
+        #use_container_width=False
+    )
+
+    sum_from_img = st.button("Get text of article!")
+
+
+col1, col2, col3, col3_5, col4, col5 = st.columns([1.5, 0.2, 1, 0.2, 0.75, 0.8])  # do not change these values; they are tuned for compact mode
+
+with col1:
+    st.markdown("<br>", unsafe_allow_html=True)
+    option = st.selectbox("Choose your input option", options=("Text", "Link"), help="""
+    For link inputs, you can choose any article from these 3 websites:
+    * [Morocco World News](https://www.moroccoworldnews.com/)
+    * [Hespress english](https://en.hespress.com/)
+    * [Aljazeera english](https://www.aljazeera.com/)
+    """
+    )
+
+
+
+with col3:
+    #summary_length = st.radio("Desired summary length:",
+    #                          ["short", "medium", "long"])
+    summary_length = st.select_slider(
+        'Desired summary length:',
+        options=['short', 'medium', 'long'])
+
+with col4:
+    st.markdown("<br>", unsafe_allow_html=True)
+    sum_button = st.button("Summarize")
+
+
+
+flag = 1
+if sum_from_img:
+    chosen_link = str(get_link_from_image(img))
+    option = "Text"
+    text_from_link = st.text_area("Your article's text : ", value=main_scrape(chosen_link))
+    st.session_state.user_text = text_from_link
+    flag = 0
+
+
+if flag:
+    if (option == "Text"):  # make sure lines 118 and 110 are not both executed (if the user chose an image, we shouldn't add another text area)
+        if 'user_text' in st.session_state:
+            text_from_link = st.text_area("Put the text of the article here", value=st.session_state.user_text)
+        else:
+            text_from_link = st.text_area("Put the text of the article here")
+    elif option == "Link":
+        user_input_link = st.text_input("Paste your link here")
+
+
+
+if (sum_button):  # user clicked summarize
+    if (flag == 0):  # article text area already generated
+        st.divider()
+        # model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        # tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        # inputs = tokenizer([user_input], return_tensors='pt')
+        # summary = model.generate(inputs['input_ids'], max_length=summary_lenth, min_length=30, do_sample=False)
+        # st.write([tokenizer.decode(g, skip_special_tokens=True) for g in summary])
+
+    else:  # no text area created yet
+        if (option == "Link"):
+            text_from_link = st.text_area("Your article's text : ", value=link_to_text(user_input_link))
+
+    #summary = summarizer(text_from_link, max_length=100, min_length=30, do_sample=False)
+
+    #st.write(summary[0]['summary_text'])
+    summary = st.text_area('Article summary:', value=summarize(resumeur, summary_length, text_from_link), disabled=True)
main_scraping.py
ADDED
@@ -0,0 +1,21 @@
+from urllib.parse import urlparse
+from hespress import scrape_hes
+from aljazeera import scrape_aljaz
+from mwn import scrape_mwn
+import streamlit as st
+import validators
+
+@st.cache_data
+def main_scrape(url):
+    if not validators.url(url):
+        return "We're sorry, but the link you provided seems to be invalid. Please double-check that the URL is correct and properly formatted."
+
+    domain = urlparse(url).netloc
+    if "moroccoworldnews.com" in domain:
+        return scrape_mwn(url)
+    if "en.hespress.com" in domain:
+        return scrape_hes(url)
+    if "aljazeera.com" in domain:
+        return scrape_aljaz(url)
+    else:
+        return "We're sorry, this website is not currently supported. To see a list of supported websites, click the '?' next to the input option."
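main_scrape first validates the URL, then dispatches on the domain to the matching site scraper, falling back to an "unsupported website" message. A small sketch of that behaviour using only placeholder inputs (no supported-site request is made here):

from main_scraping import main_scrape

print(main_scrape("not a url"))                   # invalid-link message
print(main_scrape("https://example.com/post/1"))  # valid URL but unsupported domain, so the fallback message is returned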
mwn.py
ADDED
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+import re
+
+def scrape_mwn(url):
+    # fetch the HTML
+    html_content = requests.get(url)
+    # parse the HTML
+    soup = BeautifulSoup(html_content.text, 'html.parser')
+    # extract the article's <p> paragraphs
+    app_div = soup.find('div', id='app')
+    p = app_div.attrs["data-page"]
+    p = json.loads(p)  # turn the content into a JSON dict
+    p = p["props"]["post"]["post_content"]  # path to the article's paragraphs
+    # print(json_object)  # debug
+    # parse the text's HTML for cleaning
+    main_text_soup = BeautifulSoup(p, 'html.parser')
+    # extract the <p> tags
+    main_text_soup.find_all('p')
+
+    # remove the copyright notice
+    main_text_soup.find("p", {"class": "article_copyright"}).decompose()
+
+    # remove the "Read also" block
+    for p in main_text_soup.find_all("p"):  # iterate over the paragraphs; 'read also' sits near the end of the text
+        for link_tag in p.find_all("a"):  # look for links
+            joint_list = filter(lambda x: (x != None), [link_tag.find_previous_sibling(string=True), link_tag.find_previous_sibling("strong")])  # elements before the link, with None filtered out
+            for prev in joint_list:  # 'read also' always comes before the link, so only the preceding tags are checked
+                if ('read also:' in prev.string.lower()):  # check whether this is the wanted tag
+                    p.decompose()  # remove the parent tag
+
+
+
+    # remove the paragraphs' subheadings
+    try:
+        for sub_title_tag in filter(lambda x: x != None, (main_text_soup.find_all("h3") + main_text_soup.find_all("strong"))):  # look for strong and h3 tags and iterate
+            if ((sub_title_tag.parent.name != "a") & (len(sub_title_tag.findChildren("a")) == 0)):  # so as not to remove links marked strong
+                if (sub_title_tag.parent.name == "[document]"):  # if the tag is at the top level, i.e. parent = document
+                    #print("self")
+                    sub_title_tag.decompose()  # destroy the tag
+                else:  # if the tag is nested inside another tag
+                    #print("parent decomp")
+                    sub_title_tag.parent.decompose()  # destroy the parent tag
+    except Exception as e:
+        pass
+
+
+
+    text = main_text_soup.text.strip()  # text
+
+    # remove the city name at the beginning
+    if (re.search(r"\s-\s", text[:20])):  # look for a city name in the first 20 chars
+        city_endpos = re.search(r"\s-\s", text[:20]).end()
+        text = text[city_endpos:]  # remove the city at the start of the text
+
+    # remove blank lines between paragraphs
+    text = re.sub(r'\n[\t\n\s]+\n*', r"\n", text)
+
+    return text
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+requests
+regex
+streamlit
+Transformers
+bs4
+streamlit_image_select
+validators
+torch
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
scraping_needs.py
ADDED
@@ -0,0 +1,11 @@
+import requests
+
+list_user_agents = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"]
+
+def get_content(url, user_agent, bool_text):
+    r = requests.get(url, headers={'User-Agent': user_agent})
+    if r.status_code == 200:
+        if bool_text:
+            return str(r.text)
+        return r.content
+    return 0
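get_content returns the response body as text or bytes depending on bool_text, and 0 when the status code is not 200, so callers need to handle that sentinel. A hedged usage sketch with a placeholder URL:

import random
from scraping_needs import list_user_agents, get_content

ua = random.choice(list_user_agents)               # any user agent from the list
body = get_content("https://example.com/", ua, 0)  # bytes on success (placeholder URL)
if body == 0:
    print("request failed: non-200 status code")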
trends_aljazeera.py
ADDED
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup as soup
+import streamlit as st
+import re
+from scraping_needs import get_content
+
+
+@st.cache_data(ttl=7800)  # cache clears after 7800s
+def trends_aljazeera(user_agent):
+    main_url = "https://www.aljazeera.com/"
+    contenaire = soup(get_content(main_url, user_agent, 0), "html.parser")
+
+    headers_link = contenaire.find_all("li", {"class": "fte-featured-articles-list__item"})
+
+
+
+    H3 = []
+    for i in headers_link:
+        Live = i.find(class_="post-label__text")
+        if (Live != None):
+            if (Live.text == "Live updates"):
+                continue
+
+
+
+        pic = {}
+        title = i.find(class_="fte-article__title").find('span').text
+        picture = i.find("img")
+        image_url = main_url + picture.attrs["src"]
+        image_url = re.sub(r"\?(.*)", "", image_url)
+        image_url2 = image_url + "?resize=900%2C500%"
+
+        link = i.find("a")
+        article_link = main_url + link.attrs["href"]
+
+        pic["title"] = title
+        pic["image_link"] = image_url2
+        pic["article_link"] = article_link
+
+        H3.append(pic)
+    return H3
trends_hespress.py
ADDED
@@ -0,0 +1,68 @@
+from bs4 import BeautifulSoup
+import requests
+import streamlit as st
+
+
+
+def get_trends_image(page):
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(
+        page,
+        headers=HEADERS)
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+
+    image = soup.find("img")  # first <img> on the article page
+
+
+
+    image_link = image.get('src')
+    return image_link
+
+
+
+@st.cache_data(ttl=7200)  # cache clears after 7200s
+def trends_hespress():
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
+    page = requests.get(
+        'https://en.hespress.com/',
+        headers=HEADERS)
+    src = page.content  # page content
+    soup = BeautifulSoup(src, "html.parser")  # parse the HTML
+    # print(soup)
+
+    page = soup.find("div", {'left-side heading-box col'})  # find the div carrying this class
+    all_trends = page.find_all("a", {'wpp-post-title'})  # get all <a> tags
+    article_text = ""
+    trends_list = []
+    for x in all_trends:
+
+        trend_link = x.get('href')
+        trend_title = x.text
+        trend_image = get_trends_image(trend_link)
+
+        subdict = {'title': trend_title,
+                   'image_link': trend_image,
+                   'article_link': trend_link}
+        trends_list.append(subdict)
+    extra_page = soup.find("div", {'group-item col-sm-12 col-md-6 col-xl-4 category-society bloc_col'})
+    extra_trend = extra_page.find("div", {'ratio-medium'})
+    image = extra_trend.find("img")
+    trend_image = image.get('src')
+    trend_title = image.get('alt')
+    trend_link = extra_page.find_all('a')
+    trend_link = trend_link[1].get('href')
+
+
+    trend_4 = {'title': trend_title,
+               'image_link': trend_image,
+               'article_link': trend_link}
+    trends_list.append(trend_4)
+    return trends_list
+
+
+
+
trends_mwn.py
ADDED
@@ -0,0 +1,37 @@
+import re
+import json
+import streamlit as st
+from scraping_needs import get_content
+
+@st.cache_data(ttl=7800)  # cache clears after 7800s
+def trends_mwn(user_agent):
+
+    # the page uses javascript
+    str_main_list = get_content("https://www.moroccoworldnews.com/home/post/zheadlines", user_agent, 1)  # we want text, not HTML
+
+    main_list = json.loads(str_main_list)
+
+
+    cards_content = []
+
+    for card in main_list:
+        content_dict = {}
+
+        content_dict["image_link"] = card["thumb"]
+        content_dict["title"] = card["post_title"]
+
+        try:
+            tmp = card["tsize"]
+        except KeyError:
+            try:
+                tmp = card["msize"]
+            except KeyError:
+                tmp = card["lsize"]
+        year = re.search('20\d\d(?=[\\\/])', tmp).group()
+        month = re.search('(?<=[\\\/])\d{1,2}(?=[\\\/])', tmp).group()
+
+        content_dict["article_link"] = "https://www.moroccoworldnews.com/" + str(year) + "/" + str(month) + "/" + str(card["ID"]) + "/" + card["post_name"]
+
+        cards_content.append(content_dict)
+
+    return cards_content
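Each trends_* function is expected to return entries with the same three keys, which is what get_link_from_image and the image picker in main.py rely on. A sketch of the assumed shape, with invented values:

example_card = {
    "title": "Example headline",                    # caption shown under the thumbnail
    "image_link": "https://example.com/thumb.jpg",  # image passed to image_select
    "article_link": "https://example.com/article",  # link recovered by get_link_from_image
}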