# Gradio app for a Hugging Face Space: scrapes two websites into text files,
# indexes them with llama_index, and serves a keyword-gated Q&A interface
# whose interactions are logged to a Hugging Face dataset repo.
import gradio as gr
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
# Pre-0.10 llama_index API; newer releases expose this as VectorStoreIndex.
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
import huggingface_hub
from huggingface_hub import Repository
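
# Configuration comes from Space secrets / environment variables:
#   repo        - URL of the HF dataset repo used for logging
#   hf          - HF token with write access to that repo
#   access      - keyword that gates queries (first word of the input)
#   openai-1    - OpenAI API key consumed by llama_index
#   base_url_1  - root URL of the first site to scrape
#   base_url_2  - root URL of the second site (per-year archives)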

# Hugging Face dataset repo where interaction logs are persisted.
DATASET_REPO_URL = os.environ.get("repo")
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)

# Token with write access to the dataset repo.
HF_TOKEN = os.environ.get("hf")
print("HF_TOKEN is None?", HF_TOKEN is None)

print("hfh", huggingface_hub.__version__)

repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN, repo_type="dataset"
)

print('done cloning repo')

# Keyword that gates queries: the first word of each input must match it.
access = os.environ.get("access")
os.environ["OPENAI_API_KEY"] = os.environ.get("openai-1")

# data1 is created but unused; all scraped pages are written to data2.
os.makedirs("/home/user/app/data1", exist_ok=True)
os.makedirs("/home/user/app/data2", exist_ok=True)

# Crawl state for the first site.
base_url_1 = os.environ.get("base_url_1")
visited_urls = []
counter = 0
limit = 10000

def scrape_page(url):
    """Recursively scrape paragraph text from `url` and every link under
    base_url_1, writing one text file per page into data2/."""
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Collect the page's paragraph text, prefixed with its URL so the
        # index can cite its source.
        paragraphs = soup.find_all("p")
        print('-' * 102)
        print('counter:', counter)
        print('reference url:', url)
        print('text:')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('-' * 102)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_1_" + str(counter) + ".txt", "w") as file:
            file.write(result)

        visited_urls.append(url)

        # Follow unvisited in-site links; 'tel' crudely filters tel: links
        # (and any URL containing "tel"). Note that very deep sites can hit
        # Python's default recursion limit (~1000 frames).
        links = soup.find_all("a", href=True)
        for link in links:
            absolute_url = urljoin(url, link["href"])
            if (absolute_url not in visited_urls
                    and absolute_url.startswith(base_url_1)
                    and 'tel' not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")

scrape_page(base_url_1)

# Crawl state for the second site: only links under the per-year archive
# URLs base_url_2/<year>/ (2011-2023) are followed.
base_url_2 = os.environ.get("base_url_2")
date_urls = [base_url_2 + str(year) + "/" for year in range(2023, 2010, -1)]
visited_urls = []
counter = 0
limit = 10000

def scrape_page(url):  # Redefined with the second site's link filter.
    global counter
    counter += 1
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        paragraphs = soup.find_all("p")
        print('-' * 102)
        print('counter:', counter)
        print('reference url:', url)
        print('text:')
        for paragraph in paragraphs:
            print(paragraph.text)
        print('-' * 102)
        result = "reference url: " + url + "\n"
        result += "\n".join(paragraph.text for paragraph in paragraphs)
        with open("/home/user/app/data2/base_url_2_" + str(counter) + ".txt", "w") as file:
            file.write(result)

        visited_urls.append(url)

        # Only follow links that fall under one of the per-year archive URLs.
        links = soup.find_all("a", href=True)
        for link in links:
            absolute_url = urljoin(url, link["href"])
            starts_with = any(absolute_url.startswith(date_url) for date_url in date_urls)
            if (absolute_url not in visited_urls
                    and starts_with
                    and 'tel' not in absolute_url
                    and counter <= limit):
                scrape_page(absolute_url)
    except requests.exceptions.InvalidSchema:
        print(f"Ignoring invalid URL: {url}")

scrape_page(base_url_2)

# Build a vector index over every scraped text file and expose a query engine.
documents = SimpleDirectoryReader("/home/user/app/data2/").load_data()
index = GPTVectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()
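
# Quick smoke test (hypothetical query, safe to remove): assuming the index
# built successfully, this prints one synthesized answer.
# print(query_engine.query("What topics do the scraped pages cover?").response)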

def generate_text(input_text):
    # Default response for requests that lack the access keyword.
    output_text = 'Access is limited to specific users'

    # The first word must match the access keyword; it is stripped before
    # the remainder of the input is sent to the query engine.
    words = input_text.split(" ")
    if words[0] == access:
        input_text = " ".join(words[1:])
        output_text = query_engine.query(input_text).response

    # Log every interaction (query, response, timestamp) and push the CSV
    # back to the dataset repo.
    with open(DATA_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_text, output_text, datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    repo.push_to_hub()

    return output_text

interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=10, label="Input Text"),  # gr.inputs.* is deprecated in Gradio 3+
    outputs="text",
    title="OpenAI Test 1",
    description="By: Navid Moghaddam      -------------       Notice: This app may produce inaccurate information. All interactions are logged.",
    theme="default",
    allow_flagging="auto",
)

interface.launch(debug=True)