# OpenAI-test1 / app.py
import csv
import os
from datetime import datetime
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Older llama_index releases expose these at the top level; newer versions
# provide VectorStoreIndex / SimpleDirectoryReader under llama_index.core.
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader

import huggingface_hub
from huggingface_hub import Repository
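
# --- Configuration -----------------------------------------------------------
# Secrets are read from environment variables (set as Hugging Face Space
# secrets, judging by the /home/user/app paths used below):
#   repo     -> URL of the dataset repo used as a persistent query log
#   hf       -> Hub token with write access to that repo
#   access   -> passphrase that must prefix every query (see generate_text)
#   openai-1 -> OpenAI API key consumed by llama_index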
DATASET_REPO_URL = os.environ.get("repo")
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)
HF_TOKEN = os.environ.get("hf")
print("is none?", HF_TOKEN is None)
print("hfh", huggingface_hub.__version__)

# Clone the dataset repo into ./data; every query/response pair is appended
# to data/data.csv and pushed back to the Hub.
repo = Repository(
    local_dir="data",
    clone_from=DATASET_REPO_URL,
    use_auth_token=HF_TOKEN,
    repo_type="dataset",
)
print("done cloning repo")

access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")
os.makedirs("/home/user/app/data1", exist_ok=True)
os.makedirs("/home/user/app/data2", exist_ok=True)
base_url_1 = os.environ.get("base_url_1")

# Crawl state shared with scrape_page via globals.
visited_urls = []
counter = 0
limit = 10000  # hard cap on the number of pages fetched
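
# --- Crawl 1: recursive scrape of every page under base_url_1 ----------------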
def scrape_page(url):
    """Scrape <p> text from url, save it to data2, and recurse into in-site links."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print('------------------------------------------------------------------------------------------------------')
        print('counter: ', counter)
        print('reference url: ', url)
        print('text: ')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('------------------------------------------------------------------------------------------------------')
        # Save each page as its own text file so SimpleDirectoryReader can load it later.
        result = "reference url: " + url + "\n"
        content = "\n".join([paragraph.text for paragraph in paragraphs])
        result += content
        with open("/home/user/app/data2/base_url_1_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        # Follow only unvisited links that stay under base_url_1, skipping
        # tel: links, until the page counter passes the limit.
        links = soup.find_all("a", href=True)
        for link in links:
            absolute_url = urljoin(url, link["href"])
            if absolute_url not in visited_urls and absolute_url.startswith(base_url_1) and 'tel' not in absolute_url and counter <= limit:
                scrape_page(absolute_url)  # return value is always "", so nothing to accumulate
        return ""
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")
        return ""
scrape_page(base_url_1)  # kick off the first crawl
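
# --- Crawl 2: scrape base_url_2, following only its per-year archive pages ---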
base_url_2 = os.environ.get("base_url_2")
# Year-archive URL prefixes, e.g. <base_url_2>2023/ down to <base_url_2>2011/.
date_urls = [base_url_2 + str(year) + "/" for year in range(2023, 2010, -1)]

# Reset crawl state for the second site.
visited_urls = []
counter = 0
limit = 10000
def scrape_page(url):  # redefined: same crawl, restricted to the year-archive URLs
    """Scrape <p> text from url, save it to data2, and recurse into date_urls links."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print('------------------------------------------------------------------------------------------------------')
        print('counter: ', counter)
        print('reference url: ', url)
        print('text: ')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('------------------------------------------------------------------------------------------------------')
        result = "reference url: " + url + "\n"
        content = "\n".join([paragraph.text for paragraph in paragraphs])
        result += content
        with open("/home/user/app/data2/base_url_2_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        links = soup.find_all("a", href=True)
        for link in links:
            absolute_url = urljoin(url, link["href"])
            # Follow the link only if it sits under one of the year-archive prefixes.
            starts_with = any(absolute_url.startswith(date_url) for date_url in date_urls)
            if absolute_url not in visited_urls and starts_with and 'tel' not in absolute_url and counter <= limit:
                scrape_page(absolute_url)  # return value is always "", so nothing to accumulate
        return ""
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")
        return ""
scrape_page(base_url_2)  # kick off the second crawl
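
# --- Index the scraped text and expose it through a query engine -------------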
documents = SimpleDirectoryReader("/home/user/app/data2/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
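
# --- Query handler: the first word of the input must match the passphrase ----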
def generate_text(input_text):
    """Answer a query if it starts with the access passphrase; log every interaction."""
    output_text = 'Access is limited to specific users'
    words = input_text.split(" ")
    if words[0] == access:
        # Strip the passphrase, then query the index.
        input_text = " ".join(words[1:])
        output_text = query_engine.query(input_text).response
    # Append the (query, answer, timestamp) row and push the log to the Hub.
    with open(DATA_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    repo.push_to_hub()
    return output_text
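
# --- Gradio UI ----------------------------------------------------------------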
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=10, label="Input Text"),
    outputs="text",
    title="OpenAI Test 1",
    description="By: Navid Moghaddam ------------- Notice: This app may produce inaccurate information. All interactions are logged.",
    theme="default",
    allow_flagging="auto",
)
interface.launch(debug=True)