Spaces:

drift-ai
/

faq-website

Runtime error

vincentclaes committed on Mar 31, 2023

Commit

1fdb555

•

1 Parent(s): 679ff32

process 1 page only

Files changed (1) hide show

scrape_website.py CHANGED Viewed

@@ -37,16 +37,30 @@ def process_webpage(url:str):
         text_list.append(text_content)
     # Get the text content of the landing page
-    get_child_pages(url)
-    # make main page as first item
-    text_list.reverse()
-    text_list_cut_off = text_list[:TOKEN_CUT_OFF]
-    page_content = "\n".join(text_list_cut_off)
-    # Print the text content of the landing page and all child pages
-    print(page_content)
-    return page_content
 if __name__ == '__main__':
     process_webpage(url="https://www.meet-drift.ai/")

         text_list.append(text_content)
     # Get the text content of the landing page
+    # get_child_pages(url)
+    # Make a GET request to the page and get the HTML content
+    response = requests.get(url)
+    html_content = response.content
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Get all the text content from the relevant HTML tags
+    text_content = ""
+    for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
+        for element in soup.find_all(tag):
+            text_content += element.get_text() + " "
+    # # make main page as first item
+    # text_list.reverse()
+    # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
+    # page_content = "\n".join(text_list_cut_off)
+    # # Print the text content of the landing page and all child pages
+    # print(page_content)
+    # return page_content
+    print(text_content)
+    return text_content
 if __name__ == '__main__':
     process_webpage(url="https://www.meet-drift.ai/")