import os
import concurrent.futures

import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
# from langchain_experimental.agents.agent_toolkits import create_csv_agent
from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage

# os.environ["OPENAI_API_KEY"]

# URL of the page to scrape
base_url = 'https://help.storemate.cloud/docs/reports/'


def fetch_web_data(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the page title
        title = soup.find('h1').get_text()

        # Locate the main content block that follows the title
        section = soup.find('h1').find_next('div').find_next('div')

        # Extract the text content from the section and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nMore detail link: {url}"

        # Save the data into a text file
        with open(f"user_guide/{title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")


def get_base_links():
    # Send a GET request to the base URL
    response = requests.get(base_url)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all anchor tags with href attributes
    links = soup.find_all('a', href=True)

    # Collect all valid documentation links
    valid_links = []
    for link in links:
        href = link['href']
        if href.startswith("https://help.storemate.cloud/docs/"):
            valid_links.append(href)

    print("Base links collected")

    # Make sure the output directory exists before scraping
    os.makedirs("user_guide", exist_ok=True)

    # Use ThreadPoolExecutor to fetch web data in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_web_data, valid_links)


def update_user_guide():
    get_base_links()

    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except:
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"
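

# A minimal sketch of how the persisted index could be queried once
# update_user_guide() has run. The function name query_user_guide is
# illustrative, not part of the original script; it assumes the
# "llama_index" persist directory exists and that OPENAI_API_KEY is set
# in the environment.
def query_user_guide(question: str) -> str:
    # Reload the index that update_user_guide() persisted to disk
    storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    index = load_index_from_storage(storage_context=storage_context)

    # Turn the index into a query engine and answer the question
    query_engine = index.as_query_engine()
    response = query_engine.query(question)
    return str(response)


# Example usage (hypothetical question shown for illustration):
# update_user_guide()
# print(query_user_guide("How do I view the sales report?"))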