Spaces:

georgiad
/

newsletter_helper

Sleeping

File size: 18,543 Bytes

{
  "cells": [
    {
      "cell_type": "raw",
      "metadata": {
        "id": "uIxcPJeuGGAF"
      },
      "source": [
        "---\n",
        "title: Newsletter Helper\n",
        "description: Follow the instructions on screen\n",
        "show-code: false\n",
        "params:\n",
        "    feed_keywords:\n",
        "        label: Sources\n",
        "        input: select\n",
        "        value: ['a16z.com/',\n",
        "             'sequoiacap.com/article',\n",
        "             'zettavp.com/playbook/',\n",
        "             'atomico.com/insights/',\n",
        "             'nt-z.ro/',\n",
        "             'accel.com/noteworthy',\n",
        "             'felicis.com/',\n",
        "             'scalevp.com/blog/',\n",
        "             'redpoint.com/start/',\n",
        "             '83north.com/',\n",
        "             'bvp.com/atlas/']\n",
        "        choices: ['a16z.com/',\n",
        "             'sequoiacap.com/article',\n",
        "             'zettavp.com/playbook/',\n",
        "             'atomico.com/insights/',\n",
        "             'nt-z.ro/',\n",
        "             'accel.com/noteworthy',\n",
        "             'felicis.com/',\n",
        "             'scalevp.com/blog/',\n",
        "             'redpoint.com/start/',\n",
        "             '83north.com/',\n",
        "             'bvp.com/atlas/']\n",
        "        multi: True\n",
        "    feed_age:\n",
        "        label: How old?\n",
        "        input: select\n",
        "        value: '7 days'\n",
        "        choices: ['7 days', '14 days', '30 days']\n",
        "        multi: False\n",
        "---"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "pfJ5NpqjCT1U"
      },
      "outputs": [],
      "source": [
        "feed_keywords = ['a16z.com/',\n",
        "             'sequoiacap.com/article',\n",
        "             'zettavp.com/playbook/',\n",
        "             'atomico.com/insights/',\n",
        "             'nt-z.ro/',\n",
        "             'accel.com/noteworthy',\n",
        "             'felicis.com/',\n",
        "             'scalevp.com/blog/',\n",
        "             'redpoint.com/start/',\n",
        "             '83north.com/',\n",
        "             'bvp.com/atlas/']\n",
        "feed_age = '28 days'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "mEOS4asyGGAI"
      },
      "outputs": [],
      "source": [
        "keywords = [\"Electro mobility\",\n",
        "            \"Batteries \",\n",
        "            \"Battery Management systems\",\n",
        "            \"Lidars\",\n",
        "            \"RADARS\",\n",
        "            \"AI\",\n",
        "            \"Industrial AI\",\n",
        "            \"Transportation\",\n",
        "            \"Mobility\",\n",
        "            \"Climate Tech\",\n",
        "            \"Sustainable grid\",\n",
        "            \"Sensor fusion\",\n",
        "            \"Computer vision\",\n",
        "            \"Data Analytics\",\n",
        "            \"Digital Twins\",\n",
        "            \"Automotive Cybersecurity\",\n",
        "            \"Logistics\",\n",
        "            \"Ports\",\n",
        "            \"Construction sites\",\n",
        "            \"Mines\",\n",
        "            \"Quarries\",\n",
        "            \"Trucks\",\n",
        "            \"Power train\",\n",
        "            \"Software defined vehicle\"]\n",
        "\n",
        "feed = \"https://www.rssground.com/p/Newsletter\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "WMswc6FCGR9T"
      },
      "outputs": [],
      "source": [
        "#!pip install keybert\n",
        "#!pip install feedparser\n",
        "#!pip install keyphrase_vectorizers\n",
        "#!pip install sentence-transformers"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "Ig5nSCbI6yuL"
      },
      "outputs": [],
      "source": [
        "from keybert import KeyBERT\n",
        "import pandas as pd\n",
        "from keyphrase_vectorizers import KeyphraseCountVectorizer\n",
        "from sentence_transformers import SentenceTransformer\n",
        "import numpy as np\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "\n",
        "import feedparser\n",
        "import requests\n",
        "from bs4 import BeautifulSoup\n",
        "from openpyxl import Workbook\n",
        "import time\n",
        "import pickle\n",
        "import os\n",
        "from tqdm import tqdm\n",
        "from concurrent.futures import ThreadPoolExecutor\n",
        "#from functools import lru_cache\n",
        "\n",
        "# Define function to extract keywords from the HTML body using the YAKE keyword extractor\n",
        "def extract_keyphrases(text, kw_model, vectorizer, embedding_model):\n",
        "    kph = [kw for kw, score in kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', vectorizer=vectorizer, use_mmr=True)]\n",
        "    keyphrase_embeddings = embedding_model.encode(kph)\n",
        "    return kph, keyphrase_embeddings\n",
        "\n",
        "def get_similarity_scores(keyword_embeddings, keyphrase_embeddings):\n",
        "    similarity_scores = cosine_similarity(keyphrase_embeddings, keyword_embeddings).max(axis=1).astype(str).tolist()\n",
        "    similarity_max = cosine_similarity(keyphrase_embeddings, keyword_embeddings).flatten().max().astype(str)\n",
        "    return similarity_scores, similarity_max\n",
        "\n",
        "# Define function to get the redirected URL (if any) for a given URL\n",
        "def get_redirected_url(url_record, headers, expected_codes=(301, 302, 303, 307), timeout=60):\n",
        "  try:\n",
        "    res = requests.head(url_record['url'], headers=headers, timeout=timeout)\n",
        "    if res.status_code in expected_codes:\n",
        "      url_record['url'] = res.headers['location']\n",
        "    elif res.status_code == 200:\n",
        "      url_record['url'] = url_record['url']\n",
        "    else:\n",
        "      print(f\"Retrieving {url_record['url']} failed: Expected {expected_codes}, but received {res.status_code}: {res.reason}\")\n",
        "  except requests.exceptions.Timeout:\n",
        "    print(f\"\\nRequest timed out for {url_record['url']}\")\n",
        "    return url_record\n",
        "  except:\n",
        "    return url_record\n",
        "\n",
        "  return url_record\n",
        "\n",
        "# Define function to get the HTML body of a given URL\n",
        "def get_html_body(url, headers):\n",
        "    try:\n",
        "        response = requests.get(url, headers=headers, timeout=10)\n",
        "        html = response.content\n",
        "        soup = BeautifulSoup(html, 'html.parser')\n",
        "        return soup.body.get_text()\n",
        "    except:\n",
        "        return ''\n",
        "\n",
        "# Define function to write data to the Excel sheet\n",
        "def write_data_to_excel(url_dict, filename):\n",
        "    # Create a new Excel workbook and worksheet\n",
        "    workbook = Workbook()\n",
        "    worksheet = workbook.active\n",
        "    worksheet.title = 'RSS Feeds'\n",
        "\n",
        "    # Write the headers for the Excel sheet\n",
        "    worksheet.cell(row=1, column=1, value='Feed Name')\n",
        "    worksheet.cell(row=1, column=2, value='URL')\n",
        "    worksheet.cell(row=1, column=3, value='Updated')\n",
        "    worksheet.cell(row=1, column=4, value='Keyphrases')\n",
        "    worksheet.cell(row=1, column=5, value='Similarity to supplied keywords')\n",
        "    worksheet.cell(row=1, column=6, value='Similarity (max)')\n",
        "    worksheet.cell(row=1, column=7, value='HTML Body')\n",
        "\n",
        "    # Loop over the unique URLs and write them to the Excel sheet\n",
        "    row_num = 2\n",
        "    for url, data in url_dict.items():\n",
        "        worksheet.cell(row=row_num, column=1, value=data['feed_name'])\n",
        "        worksheet.cell(row=row_num, column=2, value=url)\n",
        "        worksheet.cell(row=row_num, column=3, value=data['updated'])\n",
        "        worksheet.cell(row=row_num, column=4, value=data['keyphrases'])\n",
        "        worksheet.cell(row=row_num, column=5, value=data['similarity'])\n",
        "        worksheet.cell(row=row_num, column=6, value=data['similarity_max'])\n",
        "        worksheet.cell(row=row_num, column=7, value=data['html_body'])\n",
        "\n",
        "        row_num += 1\n",
        "\n",
        "    worksheet.freeze_panes = 'A2'\n",
        "\n",
        "    # Set the number format for column A, except the first row\n",
        "    for row in worksheet.iter_rows(min_row=2, min_col=3, max_col=3):\n",
        "        for cell in row:\n",
        "            cell.number_format = 'mm/dd/yyyy hh:mm:ss'\n",
        "\n",
        "    # Save the Excel workbook\n",
        "    workbook.save(filename)\n",
        "\n",
        "    # Print confirmation message\n",
        "    #print(f'RSS output written to excel sheet: {filename}')\n",
        "\n",
        "def remaining_entries_from_dict(filename, dictionary):\n",
        "    pickle_data = {}\n",
        "    if os.path.exists(filename):\n",
        "        with open(filename, 'rb') as f:\n",
        "            pickle_data = pickle.load(f)\n",
        "    return list(set(dictionary.keys()) - set(pickle_data.keys()))\n",
        "\n",
        "def process_url(url):\n",
        "    global url_dict\n",
        "    \n",
        "    #body = get_html_body(url, headers)\n",
        "    #kph,keyphrase_embeddings = extract_keyphrases(body, kw_model, vectorizer, embedding_model)\n",
        "    #similarity, similarity_max = get_similarity_scores(keyword_embeddings, keyphrase_embeddings)\n",
        "\n",
        "    #url_dict[url]['keyphrases'] = ', '.join(kph)\n",
        "    #url_dict[url]['similarity'] = ', '.join(similarity)\n",
        "    #url_dict[url]['similarity_max'] = similarity_max\n",
        "    #url_dict[url]['html_body'] = body\n",
        "    \n",
        "    url_dict[url]['keyphrases'] = ''\n",
        "    url_dict[url]['similarity'] = ''\n",
        "    url_dict[url]['similarity_max'] = ''\n",
        "    url_dict[url]['html_body'] = \"Skipping this part, to speed up the process\"\n",
        "\n",
        "    # Store temporary results to disk\n",
        "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
        "    #    pickle.dump(url_dict, f)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "5cHnJQDSDy1Q"
      },
      "outputs": [],
      "source": [
        "import pprint\n",
        "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
        "from tqdm import tqdm\n",
        "from datetime import datetime\n",
        "import nltk\n",
        "\n",
        "\n",
        "# Initialize the SentenceTransformer model\n",
        "kw_model = KeyBERT('distilbert-base-nli-mean-tokens')\n",
        "vectorizer = KeyphraseCountVectorizer()\n",
        "embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')\n",
        "nltk.download('stopwords', quiet=True)\n",
        "\n",
        "# Initialize variables\n",
        "headers = {\n",
        "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'\n",
        "}\n",
        "keyword_embeddings = embedding_model.encode(keywords) # Encode keywords using the embedding model\n",
        "\n",
        "def filter_strings(lst1, lst2):\n",
        "    \"\"\"\n",
        "    Filters the list `lst2` and returns only the elements that have any of the elements of `lst1` as a substring.\n",
        "  \n",
        "    Args:\n",
        "        lst1 (list): The list of substrings to match against.\n",
        "        lst2 (list): The list of strings to filter.\n",
        "\n",
        "    Returns:\n",
        "        list: A new list containing the filtered elements from `lst2`.\n",
        "\n",
        "    Examples:\n",
        "        >>> lst1 = ['apple', 'banana', 'orange']\n",
        "        >>> lst2 = ['apple pie', 'banana bread', 'cherry pie', 'orange juice']\n",
        "        >>> filter_strings(lst1, lst2)\n",
        "        ['apple pie', 'banana bread', 'orange juice']\n",
        "    \"\"\"\n",
        "    filtered_lst2 = [s for s in lst2 if any(substring in s for substring in lst1)]\n",
        "    return filtered_lst2\n",
        "\n",
        "\n",
        "def read_feeds(rss_feed, how_old):\n",
        "    global urls\n",
        "    import sys\n",
        "    import io\n",
        "    import re\n",
        "    from datetime import datetime, timedelta\n",
        "    import pytz\n",
        "\n",
        "    old_stdout = sys.stdout\n",
        "    sys.stdout = mystdout = io.StringIO()\n",
        "\n",
        "    # Loop over the RSS feeds and keywords\n",
        "    urls_temp = []\n",
        "    urls = []\n",
        "\n",
        "    # Get the desired timezone\n",
        "    timezone = pytz.timezone('Europe/Stockholm')  # Replace 'Your_Timezone_Here' with the desired timezone\n",
        "\n",
        "    # Calculate the age with timezone\n",
        "    feed_item_age_minimum = datetime.now(timezone) - timedelta(days=int(how_old.split()[0]))\n",
        "\n",
        "    feed = feedparser.parse(rss_feed)\n",
        "    for entry in tqdm(feed.entries, total=len(feed.entries),  file=sys.stdout, bar_format='\\tReading feed entries: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
        "        soup = BeautifulSoup(entry.summary, 'html.parser')\n",
        "        updated = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')\n",
        "        if re.search(r'@([^ ]+)', entry.title):\n",
        "            feed_name = re.search(r'@([^ ]+)', entry.title).group(1)\n",
        "        else:\n",
        "            feed_name = ''\n",
        "        if updated > feed_item_age_minimum:\n",
        "            urls_temp.extend([{'url': link.get('href'), 'updated': updated, 'feed_name': feed_name} for link in soup.find_all('a')])\n",
        "\n",
        "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
        "        futures = [executor.submit(get_redirected_url, url, headers) for url in urls_temp]\n",
        "        for future in tqdm(as_completed(futures), total=len(futures), file=sys.stdout, bar_format='Checking URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
        "            urls.append(future.result())\n",
        "\n",
        "    sys.stdout = old_stdout\n",
        "    return mystdout.getvalue()\n",
        "\n",
        "def read_process_urls():\n",
        "    import sys\n",
        "    import io\n",
        "    from datetime import datetime, timedelta\n",
        "    old_stdout = sys.stdout\n",
        "    sys.stdout = mystdout = io.StringIO()\n",
        "\n",
        "    global urls\n",
        "    global url_dict\n",
        "\n",
        "    #print(f\"Urls: {urls}\")\n",
        "    url_dict = {}\n",
        "    for item in filter_strings(feed_keywords, urls):\n",
        "        feed_name = item['feed_name']\n",
        "        updated = item['updated']\n",
        "        url = item['url']\n",
        "\n",
        "        import pprint\n",
        "        pprint.pprint(url)\n",
        "        if url not in url_dict.keys():\n",
        "            url_dict[url] = {'updated': updated, 'feed_name': feed_name}\n",
        "        else:\n",
        "            if url_dict[url]['updated'] > updated:\n",
        "                url_dict[url]['updated'] = updated\n",
        "\n",
        "    start_parallel_loop_time = time.time()\n",
        "    results = []\n",
        "    with ThreadPoolExecutor(max_workers=4) as executor:\n",
        "        futures = [executor.submit(process_url, url) for url in url_dict.keys()]#remaining_entries_from_dict(\"retrieved_urls.pkl\", url_dict)]\n",
        "        for future in tqdm(as_completed(futures), total=len(futures),  file=sys.stdout, bar_format='Reading URLs: {n}/{total} ({percentage:.0f}%), time elapsed: {elapsed}'):\n",
        "            results.append(future.result())\n",
        "    #print(f\"Parallel URL processing: {time.time() - start_parallel_loop_time:.3f} seconds\")\n",
        "    print(f\"Total links processed: {len(url_dict.keys())}\")\n",
        "\n",
        "    #with open(\"retrieved_urls.pkl\", 'wb') as f:\n",
        "    #    pickle.dump(url_dict, f)\n",
        "\n",
        "    # Write dataset to the Excel sheet\n",
        "    write_data_to_excel(url_dict, 'newsletter_results.xlsx')\n",
        "\n",
        "    sys.stdout = old_stdout\n",
        "    return mystdout.getvalue()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "FNR1jfm-jsgb"
      },
      "outputs": [],
      "source": [
        "from ipywidgets import HTML\n",
        "\n",
        "read_feeds(feed, feed_age)\n",
        "display(HTML(f\"Total links examined: {len(urls)}\"))\n",
        "\n",
        "read_process_urls()\n",
        "display(HTML(f\"Relevant links found: {len(url_dict.keys())}\"))\n",
        "display(HTML(f\"------------------------------\"))\n",
        "\n",
        "for url in url_dict.keys():\n",
        "    #print(url)\n",
        "    display(HTML(f\"{url}\"))\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.7"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}