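# Hugging Face Space app: scrapes two websites, builds a llama_index vector
# index over the collected text, and serves a gated Gradio query interface
# that logs every interaction to a Hugging Face dataset repository.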
import gradio as gr
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
# Legacy llama_index API (newer releases renamed GPTVectorStoreIndex to
# VectorStoreIndex and moved imports to llama_index.core).
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
import huggingface_hub
from huggingface_hub import Repository
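# Configuration comes from Space secrets: "repo" (dataset repo URL), "hf"
# (Hugging Face token), "access" (query access word), "openai-1" (OpenAI API
# key), and "base_url_1"/"base_url_2" (the sites to scrape).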
DATASET_REPO_URL = os.environ.get("repo")
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)
HF_TOKEN = os.environ.get("hf")

print("is none?", HF_TOKEN is None)
print("hfh", huggingface_hub.__version__)

# Clone the dataset repository that stores the interaction log.
repo = Repository(
    local_dir="data",
    clone_from=DATASET_REPO_URL,
    use_auth_token=HF_TOKEN,
    repo_type="dataset",
)
print("done cloning repo")
access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")

# Working directories for the scraped text (data1 is created but never used).
os.makedirs("/home/user/app/data1", exist_ok=True)
os.makedirs("/home/user/app/data2", exist_ok=True)

base_url_1 = os.environ.get("base_url_1")
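# First crawl: everything under base_url_1. scrape_page writes each page's
# paragraph text to its own file and recurses into same-site links. Note the
# crawl is recursive, so a very deep site could hit Python's recursion limit.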
visited_urls = set()  # set for O(1) membership checks
counter = 0
limit = 10000

def scrape_page(url):
    """Recursively crawl pages under base_url_1, saving each page's text.

    Works purely by side effect: every visited page is written to its own
    file under /home/user/app/data2/.
    """
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print("-" * 102)
        print("counter:", counter)
        print("reference url:", url)
        print("text:")
        for paragraph in paragraphs:
            print(paragraph.text)
        print("-" * 102)
        # Save the page text, prefixed with its source URL.
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_1_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.add(url)
        # Follow same-site links, skipping visited pages, tel: links, and
        # anything past the page limit.
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            if (absolute_url not in visited_urls
                    and absolute_url.startswith(base_url_1)
                    and "tel" not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")

scrape_page(base_url_1)
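# Second crawl: same approach, but restricted to the year-archive sections of
# base_url_2 (base_url_2 + "2023/" down to "2011/").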
base_url_2 = os.environ.get("base_url_2")
date_urls = [base_url_2 + str(year) + "/" for year in range(2023, 2010, -1)]

visited_urls = set()
counter = 0
limit = 10000

def scrape_page(url):
    """Recursively crawl the year-archive pages under base_url_2."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print("-" * 102)
        print("counter:", counter)
        print("reference url:", url)
        print("text:")
        for paragraph in paragraphs:
            print(paragraph.text)
        print("-" * 102)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_2_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.add(url)
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])
            # Only follow links that fall under one of the year archives.
            starts_with = any(absolute_url.startswith(date_url) for date_url in date_urls)
            if (absolute_url not in visited_urls
                    and starts_with
                    and "tel" not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")

scrape_page(base_url_2)
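# Build a vector index over all scraped text files; queries are answered by
# retrieving the most relevant chunks and passing them to the OpenAI model.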
documents = SimpleDirectoryReader("/home/user/app/data2/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
def generate_text(input_text):
    """Answer a query if it starts with the access word; log every call."""
    output_text = "Access is limited to specific users"
    words = input_text.split(" ")
    if words[0] == access:
        # Strip the access word, then query the index.
        input_text = " ".join(words[1:])
        output_text = query_engine.query(input_text).response
    # Append the interaction to the CSV log and push it to the dataset repo.
    with open(DATA_FILE, mode="a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    repo.push_to_hub()
    return output_text
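# Usage note: a query must start with the access word, e.g.
# "<access-word> What services are offered?" (the placeholder stands for
# whatever the "access" secret holds); anything else returns the
# access-denied message.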
interface = gr.Interface(
    fn=generate_text,
    # gr.inputs.Textbox is deprecated and was removed in Gradio 4;
    # gr.Textbox works across recent versions.
    inputs=gr.Textbox(lines=10, label="Input Text"),
    outputs="text",
    title="OpenAI Test 1",
    description="By: Navid Moghaddam ------------- Notice: This app may produce inaccurate information. All interactions are logged.",
    theme="default",
    allow_flagging="auto",
)

interface.launch(debug=True)