# OpenAI-test1 / app.py
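# Gradio Space that scrapes the Wikipedia article on Artificial intelligence (and links within its
# scope), builds a llama_index vector index over the scraped text, and answers user queries with it.
# Each query and response is appended to a CSV that is pushed to a Hugging Face dataset repo.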
import gradio as gr
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
import huggingface_hub
from huggingface_hub import Repository
DATASET_REPO_URL = "https://huggingface.co/datasets/for876543/OpenAI-test1-logs"
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)
FLAG_FILENAME = "flags.csv"
FLAG_FILE = os.path.join("data", FLAG_FILENAME)
HF_TOKEN = os.environ.get("hf")
print("is none?", HF_TOKEN is None)
print("hfh", huggingface_hub.__version__)
repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN, repo_type="dataset"
)
print('done cloning repo')
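# Access passphrase and OpenAI API key are read from the Space's environment secrets.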
access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")
os.makedirs("/home/user/app/data1", exist_ok=True)
base_url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
visited_urls = []
counter = 0
limit = 10
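# Recursively scrape a page: print its paragraphs, write them to a numbered text file under
# /home/user/app/data1, then follow links that stay under base_url until `limit` pages are visited.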
def scrape_page(url):
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        print('------------------------------------------------------------------------------------------------------')
        print('counter: ', counter)
        print('reference url: ', url)
        print('text: ')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('------------------------------------------------------------------------------------------------------')
        # Save the page text, prefixed with its source URL, as a numbered document for indexing.
        result = "reference url: " + url + "\n"
        content = "\n".join([paragraph.text for paragraph in paragraphs])
        result += content
        with open("/home/user/app/data1/test_" + str(counter) + ".txt", "w") as file:
            file.write(result)
        visited_urls.append(url)
        # Follow unvisited links that stay under base_url until the page limit is reached.
        # Every visited page is written to its own file, so the return value is unused.
        links = soup.find_all("a", href=True)
        for link in links:
            absolute_url = urljoin(url, link["href"])
            if absolute_url not in visited_urls and absolute_url.startswith(base_url) and counter <= limit:
                content += "\n" + scrape_page(absolute_url)
        return ""
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")
        return ""
result = scrape_page(base_url)
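# Build a GPTVectorStoreIndex over the scraped documents and expose it as a query engine.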
documents = SimpleDirectoryReader("/home/user/app/data1/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
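# Answer a query with the index only if its first word matches the access passphrase; log every attempt.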
def generate_text(input_text):
    output_text = 'Access is limited to specific users'
    # The first word of the input must match the access passphrase; otherwise the query
    # is not run, but the attempt is still logged below.
    if input_text.split(' ')[0] == access:
        output_text = query_engine.query(input_text).response
    # Append query, response and timestamp to the CSV log and push it to the dataset repo.
    with open(DATA_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    commit_url = repo.push_to_hub()
    return output_text
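# Simple Gradio UI: a 10-line textbox in, plain text out.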
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.inputs.Textbox(lines=10, label="Input Text"),
    outputs="text",
    title="Text Processing",
    description="Enter a text and see the processed output.",
    theme="default",
    allow_flagging="auto",
)
interface.launch(debug=True)