Spaces:
Runtime error
Runtime error
update app
Browse files
app.py
CHANGED
@@ -9,13 +9,15 @@ from langchain.vectorstores import FAISS
|
|
9 |
from langchain.chains.question_answering import load_qa_chain
|
10 |
from langchain.chat_models import ChatOpenAI
|
11 |
from langchain.callbacks import get_openai_callback
|
12 |
-
from requests.exceptions import Timeout
|
13 |
import requests
|
14 |
from bs4 import BeautifulSoup
|
15 |
from urllib.parse import urlparse, urljoin
|
16 |
import time
|
17 |
import random
|
18 |
import os
|
|
|
|
|
19 |
|
20 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
21 |
|
@@ -30,6 +32,18 @@ headers = {
|
|
30 |
}
|
31 |
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def get_internal_links(url):
|
34 |
print('start get internal links')
|
35 |
internal_links = []
|
@@ -53,9 +67,10 @@ def get_internal_links(url):
|
|
53 |
|
54 |
|
55 |
def get_page_content(url):
|
|
|
56 |
response = requests.get(url, headers=headers, timeout=5)
|
57 |
soup = BeautifulSoup(response.content, 'html.parser')
|
58 |
-
content = soup.get_text()
|
59 |
|
60 |
|
61 |
time.sleep(random.randint(1, 3))
|
@@ -63,9 +78,8 @@ def get_page_content(url):
|
|
63 |
|
64 |
def crawl_site(url):
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
links_to_visit = get_internal_links(url)
|
69 |
|
70 |
content = ""
|
71 |
|
@@ -99,9 +113,27 @@ def get_pdf_response(file):
|
|
99 |
if file is not None:
|
100 |
text = decode_pdf(file)
|
101 |
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
def get_website_response(url):
|
|
|
|
|
105 |
content = crawl_site(url)
|
106 |
result = get_response(content)
|
107 |
|
@@ -110,9 +142,6 @@ def get_website_response(url):
|
|
110 |
|
111 |
def get_response(text):
|
112 |
|
113 |
-
|
114 |
-
print(text)
|
115 |
-
|
116 |
# split into chunks
|
117 |
text_splitter = CharacterTextSplitter(
|
118 |
separator="\n",
|
@@ -134,8 +163,12 @@ def get_response(text):
|
|
134 |
def ask_question(knowledge_base):
|
135 |
|
136 |
|
|
|
|
|
|
|
|
|
137 |
user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
|
138 |
-
{{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc.",contact_email":"project contact email","website":"project official website","twitter":"official twitter","github":"official github","telegram":"official telegram"
|
139 |
|
140 |
print("Question:", user_question)
|
141 |
|
@@ -185,9 +218,9 @@ def upload_file(file):
|
|
185 |
with gr.Blocks(title="Use AI boost your deal flow - Ventureflow") as demo:
|
186 |
gr.Markdown("# Use AI boost your deal flow")
|
187 |
with gr.Tab("Upload Deck"):
|
188 |
-
file_input = gr.File(file_types=[".pdf"])
|
|
|
189 |
json_output = gr.JSON()
|
190 |
-
upload_button = gr.UploadButton("Click to Upload a Deck(.pdf))")
|
191 |
upload_button.upload(upload_file, upload_button, json_output)
|
192 |
with gr.Tab("Enter Project website"):
|
193 |
text_input = gr.Textbox(label="Enter Project website")
|
|
|
9 |
from langchain.chains.question_answering import load_qa_chain
|
10 |
from langchain.chat_models import ChatOpenAI
|
11 |
from langchain.callbacks import get_openai_callback
|
12 |
+
# from requests.exceptions import Timeout
|
13 |
import requests
|
14 |
from bs4 import BeautifulSoup
|
15 |
from urllib.parse import urlparse, urljoin
|
16 |
import time
|
17 |
import random
|
18 |
import os
|
19 |
+
import mimetypes
|
20 |
+
from openai.error import Timeout
|
21 |
|
22 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
23 |
|
|
|
32 |
}
|
33 |
|
34 |
|
35 |
+
def is_webpage(url):
    """
    Determine whether a link points to an HTML web page.

    Sends a HEAD request (following redirects) and inspects the
    ``Content-Type`` response header. When the server omits that header,
    falls back to guessing the type from the URL's file extension.

    Args:
        url: Absolute URL to probe.

    Returns:
        True if the resource is reported (or guessed) to be ``text/html``;
        False otherwise, including when the request itself fails.
    """
    try:
        # Same 5 second timeout as get_page_content so one dead link cannot
        # hang the crawl.
        response = requests.head(
            url, headers=headers, timeout=5, allow_redirects=True
        )
    except requests.exceptions.RequestException:
        # An unreachable link cannot be crawled, so treat it as "not a page".
        return False

    content_type = response.headers.get('Content-Type')
    if content_type is not None:
        # The header may carry parameters ("text/html; charset=utf-8") and
        # arbitrary casing, so normalise before comparing.
        return content_type.strip().lower().startswith('text/html')

    # No header from the server: the URL extension is the only hint left.
    mimetype, _ = mimetypes.guess_type(url, strict=False)
    return mimetype is not None and mimetype.startswith('text/html')
|
45 |
+
|
46 |
+
|
47 |
def get_internal_links(url):
|
48 |
print('start get internal links')
|
49 |
internal_links = []
|
|
|
67 |
|
68 |
|
69 |
def get_page_content(url):
|
70 |
+
|
71 |
response = requests.get(url, headers=headers, timeout=5)
|
72 |
soup = BeautifulSoup(response.content, 'html.parser')
|
73 |
+
content = soup.get_text('\n')
|
74 |
|
75 |
|
76 |
time.sleep(random.randint(1, 3))
|
|
|
78 |
|
79 |
def crawl_site(url):
|
80 |
|
81 |
+
# links_to_visit = get_internal_links(url)
|
82 |
+
links_to_visit = [url]
|
|
|
83 |
|
84 |
content = ""
|
85 |
|
|
|
113 |
if file is not None:
|
114 |
text = decode_pdf(file)
|
115 |
|
116 |
+
print('pdf text:', text)
|
117 |
+
|
118 |
+
if text:
|
119 |
+
return get_response(text)
|
120 |
+
else:
|
121 |
+
return {"error": "covert pdf to text failed"}
|
122 |
+
|
123 |
+
def fix_url(url):
    """
    Ensure a user-supplied URL carries an explicit http(s) scheme.

    The check is purely textual. No network request is made here, so
    reachability problems surface later when the page is actually fetched.

    Args:
        url: URL as typed by the user, with or without a scheme.

    Returns:
        The URL unchanged (apart from surrounding whitespace) when it already
        starts with ``http://`` or ``https://``; otherwise the URL prefixed
        with ``https://``.
    """
    url = url.strip()

    # Already absolute: never prepend a second scheme.
    if url.lower().startswith(('http://', 'https://')):
        return url

    # Protocol-relative form ("//host/path") only lacks the scheme name.
    if url.startswith('//'):
        return "https:" + url

    return "https://" + url
|
132 |
+
|
133 |
|
134 |
def get_website_response(url):
|
135 |
+
|
136 |
+
url = fix_url(url)
|
137 |
content = crawl_site(url)
|
138 |
result = get_response(content)
|
139 |
|
|
|
142 |
|
143 |
def get_response(text):
|
144 |
|
|
|
|
|
|
|
145 |
# split into chunks
|
146 |
text_splitter = CharacterTextSplitter(
|
147 |
separator="\n",
|
|
|
163 |
def ask_question(knowledge_base):
|
164 |
|
165 |
|
166 |
+
# user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
|
167 |
+
# {{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc."}}"""
|
168 |
+
|
169 |
+
|
170 |
user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
|
171 |
+
{{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc.",contact_email":"project contact email","website":"project official website","twitter":"official twitter","github":"official github","telegram":"official telegram"}}"""
|
172 |
|
173 |
print("Question:", user_question)
|
174 |
|
|
|
218 |
with gr.Blocks(title="Use AI boost your deal flow - Ventureflow") as demo:
|
219 |
gr.Markdown("# Use AI boost your deal flow")
|
220 |
with gr.Tab("Upload Deck"):
|
221 |
+
# file_input = gr.File(file_types=[".pdf"])
|
222 |
+
upload_button = gr.UploadButton("Click to Upload a Deck(.pdf))", file_types=[".pdf"])
|
223 |
json_output = gr.JSON()
|
|
|
224 |
upload_button.upload(upload_file, upload_button, json_output)
|
225 |
with gr.Tab("Enter Project website"):
|
226 |
text_input = gr.Textbox(label="Enter Project website")
|