In [1]:
# URL substrings of the publications of interest. read_process_urls() passes
# this list to filter_strings() to select the relevant links from the feed.
feed_keywords = ['a16z.com/',
             'sequoiacap.com/article',
             'zettavp.com/playbook/',
             'atomico.com/insights/',
             'nt-z.ro/',
             'accel.com/noteworthy',
             'felicis.com/',
             'scalevp.com/blog/',
             'redpoint.com/start/',
             '83north.com/',
             'bvp.com/atlas/']
# Maximum age of the feed items to consider. read_feeds() only uses the leading
# integer and interprets it as a number of days.
feed_age = '28 days'

In [2]:
# Topics of interest. They are encoded into `keyword_embeddings` with the
# sentence-embedding model and serve as the reference vectors for
# get_similarity_scores().
keywords = ["Electro mobility",
            "Batteries ",
            "Battery Management systems",
            "Lidars",
            "RADARS",
            "AI",
            "Industrial AI",
            "Transportation",
            "Mobility",
            "Climate Tech",
            "Sustainable grid",
            "Sensor fusion",
            "Computer vision",
            "Data Analytics",
            "Digital Twins",
            "Automotive Cybersecurity",
            "Logistics",
            "Ports",
            "Construction sites",
            "Mines",
            "Quarries",
            "Trucks",
            "Power train",
            "Software defined vehicle"]

# RSS feed that is handed to read_feeds() as the source of links.
feed = "https://www.rssground.com/p/Newsletter"

In [3]:
#!pip install keybert
#!pip install feedparser
#!pip install keyphrase_vectorizers
#!pip install sentence-transformers

In [4]:
from keybert import KeyBERT
import pandas as pd
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import feedparser
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
import pickle
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
#from functools import lru_cache

# Extract keyphrases from a text with the supplied keyword model (a KeyBERT
# instance in this notebook) and embed them for similarity scoring
def extract_keyphrases(text, kw_model, vectorizer, embedding_model):
    """Return the keyphrases found in `text` together with their embeddings.

    Args:
        text: The document to extract keyphrases from.
        kw_model: Object exposing `extract_keywords(...)` that yields
            (keyphrase, score) pairs.
        vectorizer: Vectorizer forwarded to `kw_model.extract_keywords`.
        embedding_model: Object exposing `encode(list_of_strings)`.

    Returns:
        tuple: `(keyphrases, embeddings)` where `keyphrases` is the list of
        extracted phrases (scores dropped) and `embeddings` is whatever
        `embedding_model.encode` returns for that list.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        vectorizer=vectorizer,
        use_mmr=True,
    )
    # Only the phrases are needed downstream; the extractor's scores are discarded.
    kph = [phrase for phrase, _ in scored_phrases]
    return kph, embedding_model.encode(kph)

def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):
    """Score every keyphrase by its best cosine similarity to the keywords.

    Args:
        keyword_embeddings: Embeddings of the reference keywords
            (one row per keyword).
        keyphrase_embeddings: Embeddings of the extracted keyphrases
            (one row per keyphrase).

    Returns:
        tuple: `(similarity_scores, similarity_max)` where `similarity_scores`
        is a list of strings holding, per keyphrase, its highest similarity to
        any keyword, and `similarity_max` is the overall highest similarity as
        a string. When either input is empty (e.g. no keyphrases could be
        extracted from an empty page) `([], '')` is returned instead of
        raising.
    """
    # Nothing to compare: cosine_similarity would raise on empty input.
    if np.asarray(keyphrase_embeddings).size == 0 or np.asarray(keyword_embeddings).size == 0:
        return [], ''

    # Compute the similarity matrix once and derive both results from it.
    similarity_matrix = cosine_similarity(keyphrase_embeddings, keyword_embeddings)
    best_per_keyphrase = similarity_matrix.max(axis=1)
    similarity_scores = best_per_keyphrase.astype(str).tolist()
    # The maximum of the row maxima equals the maximum of the whole matrix.
    similarity_max = best_per_keyphrase.max().astype(str)
    return similarity_scores, similarity_max

# Define function to get the redirected URL (if any) for a given URL
def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):
    """Replace `url_record['url']` with its redirect target, if there is one.

    A HEAD request is sent to the recorded URL. When the response status is in
    `expected_codes` and carries a `Location` header, the record's URL is
    replaced by that location (resolved against the original URL, so relative
    redirects yield an absolute URL). A 200 response leaves the record
    untouched; any other status is reported on stdout. The lookup is
    best-effort: request failures are reported and the record is returned
    unchanged.

    Args:
        url_record (dict): Record with at least a 'url' entry; updated in place.
        headers (dict): HTTP headers sent with the request.
        expected_codes (tuple): Status codes treated as redirects.
        timeout (int | float): Request timeout in seconds.

    Returns:
        dict: The same `url_record` object.
    """
    from urllib.parse import urljoin

    original_url = url_record.get('url')
    try:
        res = requests.head(original_url, headers=headers, timeout=timeout)
        if res.status_code in expected_codes:
            location = res.headers.get('location')
            if location:
                # Location may be relative; resolve it against the requested URL.
                url_record['url'] = urljoin(original_url, location)
        elif res.status_code != 200:
            print(f"Retrieving {original_url} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}")
    except requests.exceptions.Timeout:
        print(f"\nRequest timed out for {original_url}")
    except Exception as exc:
        # Best-effort: keep the original URL, but say why the check failed.
        # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
        print(f"\nCould not check {original_url}: {exc}")

    return url_record

# Define function to get the HTML body of a given URL
def get_html_body(url, headers):
    """Download `url` and return the text content of its <body> element.

    Args:
        url (str): Address of the page to fetch.
        headers (dict): HTTP headers sent with the request.

    Returns:
        str: The text of the page body, or '' when the page cannot be
        retrieved or parsed, or has no <body> element.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        if soup.body is None:
            # Documents without a <body> (e.g. non-HTML payloads) have no text to return.
            return ''
        return soup.body.get_text()
    except Exception:
        # Best-effort fetch: any request/parsing failure yields an empty body.
        # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
        return ''

# Define function to write data to the Excel sheet
def write_data_to_excel(url_dict, filename):
    """Write the collected URL records to a new Excel workbook.

    The sheet 'RSS Feeds' gets one header row (kept frozen) followed by one
    row per URL.

    Args:
        url_dict (dict): Maps each URL to a record with the entries
            'feed_name', 'updated', 'keyphrases', 'similarity',
            'similarity_max' and 'html_body'.
        filename (str): Path of the workbook to create (overwritten if present).
    """
    # Create a new Excel workbook and worksheet
    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = 'RSS Feeds'

    # Write the headers for the Excel sheet
    column_titles = (
        'Feed Name',
        'URL',
        'Updated',
        'Keyphrases',
        'Similarity to supplied keywords',
        'Similarity (max)',
        'HTML Body',
    )
    for column, title in enumerate(column_titles, start=1):
        worksheet.cell(row=1, column=column, value=title)

    updated_column = column_titles.index('Updated') + 1

    # Loop over the unique URLs and write them to the Excel sheet
    for row_num, (url, data) in enumerate(url_dict.items(), start=2):
        updated = data['updated']
        # Excel has no notion of time zones and openpyxl rejects tz-aware
        # datetimes, so keep the wall-clock time and drop the tzinfo.
        if getattr(updated, 'tzinfo', None) is not None:
            updated = updated.replace(tzinfo=None)

        row_values = (
            data['feed_name'],
            url,
            updated,
            data['keyphrases'],
            data['similarity'],
            data['similarity_max'],
            data['html_body'],
        )
        for column, value in enumerate(row_values, start=1):
            cell = worksheet.cell(row=row_num, column=column, value=value)
            if column == updated_column:
                # Date format for the 'Updated' column (column C), data rows only
                cell.number_format = 'mm/dd/yyyy hh:mm:ss'

    worksheet.freeze_panes = 'A2'

    # Save the Excel workbook
    workbook.save(filename)

    # Print confirmation message
    #print(f'RSS output written to excel sheet: {filename}')

def remaining_entries_from_dict(filename, dictionary):
    """List the keys of `dictionary` that are not yet stored in a pickle file.

    Args:
        filename (str): Path of a pickle file holding a dict of already
            processed entries. A missing file counts as "nothing stored".
        dictionary (dict): The full set of entries.

    Returns:
        list: Keys of `dictionary` absent from the pickled dict, in no
        particular order.
    """
    already_stored = {}
    if os.path.exists(filename):
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # this on cache files written by this notebook.
        with open(filename, 'rb') as handle:
            already_stored = pickle.load(handle)
    pending = set(dictionary.keys()).difference(already_stored.keys())
    return list(pending)

def process_url(url):
    """Fill in the analysis fields of the global `url_dict` record for `url`.

    The page download, keyphrase extraction and similarity scoring are
    currently disabled (kept below as commented-out code), so the record only
    receives placeholder values.

    Args:
        url: Key of an existing entry in the global `url_dict`.
    """
    global url_dict

    # Disabled full pipeline:
    #body = get_html_body(url, headers)
    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)
    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)

    #url_dict[url]['keyphrases'] = ', '.join(kph)
    #url_dict[url]['similarity'] = ', '.join(similarity)
    #url_dict[url]['similarity_max'] = similarity_max
    #url_dict[url]['html_body'] = body

    # Placeholder values written while the pipeline above is switched off.
    url_dict[url].update(
        keyphrases='',
        similarity='',
        similarity_max='',
        html_body="Skipping this part, to speed up the process",
    )

    # Store temporary results to disk
    #with open("retrieved_urls.pkl", 'wb') as f:
    #    pickle.dump(url_dict, f)

In [5]:
import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datetime import datetime
import nltk


# Initialize the keyphrase extractor, its vectorizer and the sentence-embedding model.
# In this notebook kw_model and vectorizer are only referenced from the
# commented-out pipeline in process_url().
# NOTE(review): KeyBERT and SentenceTransformer are both given the same model
# name, so the model is presumably loaded twice — confirm whether one instance
# could be shared.
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
vectorizer = KeyphraseCountVectorizer()
embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Fetch the NLTK stopword corpus without console output.
nltk.download('stopwords', quiet=True)

# Initialize variables
# HTTP headers sent with every request issued by get_redirected_url() / get_html_body().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
keyword_embeddings = embedding_model.encode(keywords) # Encode keywords using the embedding model

def filter_strings(lst1, lst2):
    """
    Keep the elements of `lst2` that contain at least one element of `lst1`.

    Containment is tested with the `in` operator, i.e. as a substring test
    when the elements of `lst2` are strings. The order of `lst2` is preserved.

    Args:
        lst1 (list): The substrings to look for.
        lst2 (list): The strings to filter.

    Returns:
        list: A new list with the matching elements of `lst2`.

    Examples:
        >>> lst1 = ['apple', 'banana', 'orange']
        >>> lst2 = ['apple pie', 'banana bread', 'cherry pie', 'orange juice']
        >>> filter_strings(lst1, lst2)
        ['apple pie', 'banana bread', 'orange juice']
    """
    matches = []
    for candidate in lst2:
        for needle in lst1:
            if needle in candidate:
                matches.append(candidate)
                # One hit is enough; do not add the same element twice.
                break
    return matches


def read_feeds(rss_feed, how_old):
    """Collect the links of all recent entries of an RSS feed.

    Every entry published within the last `how_old` days is scanned for
    anchors in its summary HTML. Each link becomes a record
    `{'url', 'updated', 'feed_name'}`; redirects are then resolved in parallel
    via get_redirected_url(). The records are stored in the global `urls`.

    Everything printed while the function runs (progress bars, request
    warnings) is captured instead of shown; `sys.stdout` is restored even when
    an error occurs.

    Args:
        rss_feed (str): URL (or anything feedparser accepts) of the feed.
        how_old (str): Maximum entry age; the leading integer is taken as a
            number of days, e.g. '28 days'.

    Returns:
        str: The captured console output.
    """
    global urls
    import sys
    import io
    import re
    from datetime import datetime, timedelta
    import pytz

    old_stdout = sys.stdout
    sys.stdout = mystdout = io.StringIO()
    try:
        urls_temp = []
        urls = []

        # Reference "now" as a timezone-aware value so it can be compared with
        # the timezone-aware publication dates parsed below.
        timezone = pytz.timezone('Europe/Stockholm')
        feed_item_age_minimum = datetime.now(timezone) - timedelta(days=int(how_old.split()[0]))

        feed = feedparser.parse(rss_feed)
        for entry in tqdm(feed.entries, total=len(feed.entries),  file=sys.stdout, bar_format='\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):
            updated = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')
            if updated <= feed_item_age_minimum:
                continue  # too old

            # The feed name is the token following '@' in the entry title, if any.
            name_match = re.search(r'@([^ ]+)', entry.title)
            feed_name = name_match.group(1) if name_match else ''

            soup = BeautifulSoup(entry.summary, 'html.parser')
            # href=True skips anchors without a target, which would otherwise
            # produce records whose URL is None.
            urls_temp.extend(
                {'url': link.get('href'), 'updated': updated, 'feed_name': feed_name}
                for link in soup.find_all('a', href=True)
            )

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]
            for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):
                urls.append(future.result())
    finally:
        # Never leave the notebook with a hijacked stdout.
        sys.stdout = old_stdout

    return mystdout.getvalue()

def read_process_urls():
    """Select the relevant links from the global `urls` and export them.

    A link record is relevant when its URL contains one of the substrings in
    `feed_keywords`. Relevant records are de-duplicated by URL into the global
    `url_dict` (keeping the earliest 'updated' value per URL), each URL is
    passed to process_url() in a thread pool, and the result is written to
    'newsletter_results.xlsx'.

    Everything printed while the function runs is captured instead of shown;
    `sys.stdout` is restored even when an error occurs.

    Returns:
        str: The captured console output.
    """
    global urls
    global url_dict

    import sys
    import io
    import pprint

    old_stdout = sys.stdout
    sys.stdout = mystdout = io.StringIO()
    try:
        # filter_strings() does substring matching on strings, so it has to be
        # fed the URL strings, not the record dicts (membership on a dict tests
        # its keys and would never match a feed keyword).
        candidate_links = [item['url'] for item in urls if isinstance(item.get('url'), str)]
        relevant_links = set(filter_strings(feed_keywords, candidate_links))

        #print(f"Urls: {urls}")
        url_dict = {}
        for item in urls:
            url = item['url']
            if url not in relevant_links:
                continue
            feed_name = item['feed_name']
            updated = item['updated']

            pprint.pprint(url)
            if url not in url_dict:
                url_dict[url] = {'updated': updated, 'feed_name': feed_name}
            elif url_dict[url]['updated'] > updated:
                # Same link seen again: keep the earliest timestamp.
                url_dict[url]['updated'] = updated

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict("retrieved_urls.pkl", url_dict)]
            for future in tqdm(as_completed(futures), total=len(futures),  file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):
                # result() re-raises any exception from the worker thread.
                future.result()
        print(f"Total links processed: {len(url_dict.keys())}")

        #with open("retrieved_urls.pkl", 'wb') as f:
        #    pickle.dump(url_dict, f)

        # Write dataset to the Excel sheet
        write_data_to_excel(url_dict, 'newsletter_results.xlsx')
    finally:
        # Never leave the notebook with a hijacked stdout.
        sys.stdout = old_stdout

    return mystdout.getvalue()

In [6]:
from ipywidgets import HTML

from html import escape

# Step 1: harvest all links from the recent feed entries (fills the global `urls`).
read_feeds(feed, feed_age)
display(HTML(f"Total links examined: {len(urls)}"))

# Step 2: keep the links matching `feed_keywords` (fills the global `url_dict`)
# and write them to the Excel sheet.
read_process_urls()
display(HTML(f"Relevant links found: {len(url_dict.keys())}"))
display(HTML("------------------------------"))

for url in url_dict.keys():
    #print(url)
    # The URLs come from an external feed and the widget renders its value as
    # HTML, so escape them to show characters such as '&' and '<' literally.
    display(HTML(escape(f"{url}")))
