"""Gradio app: scrape two sites, index the text with llama_index, and serve an
access-gated query interface that logs every interaction to a HF dataset repo."""

import csv
import os
from datetime import datetime
from urllib.parse import urljoin

import gradio as gr
import huggingface_hub
import requests
from bs4 import BeautifulSoup
from huggingface_hub import Repository
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader

DATASET_REPO_URL = os.environ.get("repo")
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)
HF_TOKEN = os.environ.get("hf")
print("is none?", HF_TOKEN is None)
print("hfh", huggingface_hub.__version__)

# Clone the logging dataset repo into ./data so interactions can be appended and pushed back.
repo = Repository(
    local_dir="data",
    clone_from=DATASET_REPO_URL,
    use_auth_token=HF_TOKEN,
    repo_type="dataset",
)
print("done cloning repo")

access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")

os.makedirs("/home/user/app/data1", exist_ok=True)
os.makedirs("/home/user/app/data2", exist_ok=True)

base_url_1 = os.environ.get("base_url_1")

visited_urls = []
counter = 0
limit = 10000  # hard cap on pages per crawl; note the recursion below can get deep


def scrape_page(url):
    """Recursively scrape <p> text from `url` and every unvisited link under
    base_url_1, writing each page to its own file for SimpleDirectoryReader."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print("-" * 100)
        print("counter: ", counter)
        print("reference url: ", url)
        print("text: ")
        for paragraph in paragraphs:
            print(paragraph.text)
        print("-" * 100)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_1_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            if (
                absolute_url not in visited_urls
                and absolute_url.startswith(base_url_1)
                and "tel" not in absolute_url
                and counter <= limit
            ):
                scrape_page(absolute_url)
        return ""
    except requests.exceptions.RequestException:
        # Skip bad links (e.g. tel: schemes, unreachable pages) without aborting the crawl.
        print(f"Ignoring invalid URL: {url}")
        return ""


result = scrape_page(base_url_1)

base_url_2 = os.environ.get("base_url_2")
# The second site archives its pages by year, e.g. <base_url_2>2023/ back through 2011.
date_urls = [base_url_2 + str(year) + "/" for year in range(2023, 2010, -1)]

visited_urls = []
counter = 0
limit = 10000


def scrape_page_dated(url):
    """Same as scrape_page, but only follows links under one of the per-year archive URLs."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print("-" * 100)
        print("counter: ", counter)
        print("reference url: ", url)
        print("text: ")
        for paragraph in paragraphs:
            print(paragraph.text)
        print("-" * 100)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_2_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            starts_with = any(absolute_url.startswith(date_url) for date_url in date_urls)
            if (
                absolute_url not in visited_urls
                and starts_with
                and "tel" not in absolute_url
                and counter <= limit
            ):
                scrape_page_dated(absolute_url)
        return ""
    except requests.exceptions.RequestException:
        print(f"Ignoring invalid URL: {url}")
        return ""


result = scrape_page_dated(base_url_2)
# Build the vector index over everything the crawls wrote to data2/.
documents = SimpleDirectoryReader("/home/user/app/data2/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()


def generate_text(input_text):
    """Answer a query if it is prefixed with the access code; log every interaction."""
    output_text = "Access is limited to specific users"
    words = input_text.split(" ")
    if words[0] == access:
        # Strip the access code and run the rest of the text through the index.
        input_text = " ".join(words[1:])
        output_text = query_engine.query(input_text).response
    # Append the interaction to the dataset repo and push it to the Hub.
    with open(DATA_FILE, mode="a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    repo.push_to_hub()
    return output_text


interface = gr.Interface(
    fn=generate_text,
    inputs=gr.inputs.Textbox(lines=10, label="Input Text"),
    outputs="text",
    title="OpenAI Test 1",
    description="By: Navid Moghaddam ------------- Notice: This app may produce inaccurate information. All interactions are logged.",
    theme="default",
    allow_flagging="auto",
)

interface.launch(debug=True)