# Standard library
import csv
import os
from datetime import datetime
from urllib.parse import urljoin

# Third-party
import gradio as gr
import requests
from bs4 import BeautifulSoup
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
import huggingface_hub
from huggingface_hub import Repository
# Secrets and config pulled from the Space's environment variables.
DATASET_REPO_URL = os.environ.get("repo")  # HF dataset repo used to log interactions
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)
HF_TOKEN = os.environ.get("hf")
print("is none?", HF_TOKEN is None)
print("hfh", huggingface_hub.__version__)

# Clone the logging dataset repo into ./data so data.csv can be appended to and pushed back.
repo = Repository(
    local_dir="data",
    clone_from=DATASET_REPO_URL,
    use_auth_token=HF_TOKEN,
    repo_type="dataset",
)
print('done cloning repo')
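# Note: newer huggingface_hub releases deprecate the git-based Repository class in
# favor of the HTTP API (e.g. HfApi.upload_file); Repository still works here.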
# Shared access code that gates the query endpoint, plus the OpenAI key llama_index needs.
access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")

os.makedirs("/home/user/app/data1", exist_ok=True)
os.makedirs("/home/user/app/data2", exist_ok=True)

# Crawl state for the first site.
base_url_1 = os.environ.get("base_url_1")
visited_urls = []
counter = 0
limit = 10000
def scrape_page(url):
    # Recursively crawl pages under base_url_1, saving each page's <p> text to disk.
    global counter
    counter += 1
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print('-' * 102)
        print('counter: ', counter)
        print('reference url: ', url)
        print('text: ')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('-' * 102)
        # Prefix the saved text with its source URL so answers can cite it.
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_1_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        # Follow unvisited links that stay under base_url_1 (skipping tel: links),
        # up to `limit` pages in total.
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            if (absolute_url not in visited_urls
                    and absolute_url.startswith(base_url_1)
                    and 'tel' not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")
# Crawl the first site.
scrape_page(base_url_1)

# Second site: a date-organized archive with one section per year
# (base_url_2 + "2023/", base_url_2 + "2022/", ...).
base_url_2 = os.environ.get("base_url_2")
date_urls = [base_url_2 + str(year) + "/" for year in range(2023, 2010, -1)]

# Reset crawl state before reusing the same pattern on the second site.
visited_urls = []
counter = 0
limit = 10000
def scrape_page(url):
    # Same crawler, restricted to the per-year archive URLs of the second site.
    global counter
    counter += 1
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print('-' * 102)
        print('counter: ', counter)
        print('reference url: ', url)
        print('text: ')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('-' * 102)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_2_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            # Only follow links that live under one of the year-archive prefixes.
            starts_with = any(absolute_url.startswith(date_url) for date_url in date_urls)
            if (absolute_url not in visited_urls
                    and starts_with
                    and 'tel' not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")
# Crawl the second site.
scrape_page(base_url_2)

# Build a vector index over everything scraped into data2/ and expose a query engine.
documents = SimpleDirectoryReader("/home/user/app/data2/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
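# Optional sketch (assumes this llama_index version exposes StorageContext):
# persisting the index would avoid re-scraping and re-embedding on every restart.
#
#   index.storage_context.persist(persist_dir="/home/user/app/index_storage")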
def generate_text(input_text):
    # Answer the query only if the first word matches the access code; log every interaction.
    output_text = 'Access is limited to specific users'
    words = input_text.split(" ")
    if words[0] == access:
        # Strip the access code before querying the index.
        input_text = " ".join(words[1:])
        output_text = query_engine.query(input_text).response
    # Append the interaction to data.csv and push it to the logging dataset repo.
    with open(DATA_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    commit_url = repo.push_to_hub()
    return output_text
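# Note: push_to_hub() commits and pushes data.csv on every request; batching the
# log writes (pushing every N rows) would reduce per-request latency if needed.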
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=10, label="Input Text"),  # gr.inputs.Textbox is deprecated
    outputs="text",
    title="OpenAI Test 1",
    description="By: Navid Moghaddam ------------- Notice: This app may produce inaccurate information. All interactions are logged.",
    theme="default",
    allow_flagging='auto',
)
interface.launch(debug=True)