0xleec committed on
Commit
6a6f2fa
1 Parent(s): aea1676

update app

Browse files
Files changed (1) hide show
  1. app.py +45 -12
app.py CHANGED
@@ -9,13 +9,15 @@ from langchain.vectorstores import FAISS
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.chat_models import ChatOpenAI
11
  from langchain.callbacks import get_openai_callback
12
- from requests.exceptions import Timeout
13
  import requests
14
  from bs4 import BeautifulSoup
15
  from urllib.parse import urlparse, urljoin
16
  import time
17
  import random
18
  import os
 
 
19
 
20
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
21
 
@@ -30,6 +32,18 @@ headers = {
30
  }
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def get_internal_links(url):
34
  print('start get internal links')
35
  internal_links = []
@@ -53,9 +67,10 @@ def get_internal_links(url):
53
 
54
 
55
  def get_page_content(url):
 
56
  response = requests.get(url, headers=headers, timeout=5)
57
  soup = BeautifulSoup(response.content, 'html.parser')
58
- content = soup.get_text()
59
 
60
 
61
  time.sleep(random.randint(1, 3))
@@ -63,9 +78,8 @@ def get_page_content(url):
63
 
64
  def crawl_site(url):
65
 
66
-
67
-
68
- links_to_visit = get_internal_links(url)
69
 
70
  content = ""
71
 
@@ -99,9 +113,27 @@ def get_pdf_response(file):
99
  if file is not None:
100
  text = decode_pdf(file)
101
 
102
- return get_response(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  def get_website_response(url):
 
 
105
  content = crawl_site(url)
106
  result = get_response(content)
107
 
@@ -110,9 +142,6 @@ def get_website_response(url):
110
 
111
  def get_response(text):
112
 
113
-
114
- print(text)
115
-
116
  # split into chunks
117
  text_splitter = CharacterTextSplitter(
118
  separator="\n",
@@ -134,8 +163,12 @@ def get_response(text):
134
  def ask_question(knowledge_base):
135
 
136
 
 
 
 
 
137
  user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
138
- {{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc.",contact_email":"project contact email","website":"project official website","twitter":"official twitter","github":"official github","telegram":"official telegram","team_member":"team member list, include name, position, introduction, twitter, github, telegram, etc."}}"""
139
 
140
  print("Question:", user_question)
141
 
@@ -185,9 +218,9 @@ def upload_file(file):
185
  with gr.Blocks(title="Use AI boost your deal flow - Ventureflow") as demo:
186
  gr.Markdown("# Use AI boost your deal flow")
187
  with gr.Tab("Upload Deck"):
188
- file_input = gr.File(file_types=[".pdf"])
 
189
  json_output = gr.JSON()
190
- upload_button = gr.UploadButton("Click to Upload a Deck(.pdf))")
191
  upload_button.upload(upload_file, upload_button, json_output)
192
  with gr.Tab("Enter Project website"):
193
  text_input = gr.Textbox(label="Enter Project website")
 
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.chat_models import ChatOpenAI
11
  from langchain.callbacks import get_openai_callback
12
+ # from requests.exceptions import Timeout
13
  import requests
14
  from bs4 import BeautifulSoup
15
  from urllib.parse import urlparse, urljoin
16
  import time
17
  import random
18
  import os
19
+ import mimetypes
20
+ from openai.error import Timeout
21
 
22
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
23
 
 
32
  }
33
 
34
 
35
def is_webpage(url):
    """Return True when *url* is served as an HTML page.

    A HEAD request is sent (using the module-level ``headers``) and the
    decision is based on the ``Content-Type`` header of the response.

    Args:
        url: Absolute URL to probe.

    Returns:
        True if the response declares a ``text/html`` media type,
        False if the header is missing or names any other type.

    Raises:
        requests.exceptions.RequestException: if the HEAD request fails
            or does not complete within the timeout.
    """
    # Same timeout as get_page_content so one slow host cannot stall a crawl.
    response = requests.head(url, headers=headers, timeout=5)
    content_type = response.headers.get('Content-Type')
    if content_type is None:
        return False

    # Use the server-declared type rather than guessing from the URL's file
    # extension: typical page URLs ("/about", "/") have no extension at all.
    # Drop parameters such as "; charset=utf-8" before comparing.
    media_type = content_type.split(';', 1)[0].strip().lower()
    return media_type.startswith('text/html')
45
+
46
+
47
  def get_internal_links(url):
48
  print('start get internal links')
49
  internal_links = []
 
67
 
68
 
69
  def get_page_content(url):
70
+
71
  response = requests.get(url, headers=headers, timeout=5)
72
  soup = BeautifulSoup(response.content, 'html.parser')
73
+ content = soup.get_text('\n')
74
 
75
 
76
  time.sleep(random.randint(1, 3))
 
78
 
79
  def crawl_site(url):
80
 
81
+ # links_to_visit = get_internal_links(url)
82
+ links_to_visit = [url]
 
83
 
84
  content = ""
85
 
 
113
  if file is not None:
114
  text = decode_pdf(file)
115
 
116
+ print('pdf text:', text)
117
+
118
+ if text:
119
+ return get_response(text)
120
+ else:
121
+ return {"error": "covert pdf to text failed"}
122
+
123
def fix_url(url):
    """Return *url* with an ``https://`` scheme added when it has none.

    The check is purely textual, so no network request is made here; the
    caller's own fetch reports unreachable hosts.

    Args:
        url: Address as typed by the user, e.g. ``"example.com"`` or
            ``"https://example.com"``.

    Returns:
        The address unchanged (apart from surrounding whitespace) when it
        already starts with ``http://`` or ``https://``; otherwise the
        address prefixed with ``https://``.
    """
    # User-entered text frequently carries stray leading/trailing spaces.
    url = url.strip()

    # Already a full web URL: never prepend a second scheme.
    if url.lower().startswith(("http://", "https://")):
        return url

    # Protocol-relative form ("//host/path") only lacks the scheme name.
    if url.startswith("//"):
        return "https:" + url

    return "https://" + url
132
+
133
 
134
  def get_website_response(url):
135
+
136
+ url = fix_url(url)
137
  content = crawl_site(url)
138
  result = get_response(content)
139
 
 
142
 
143
  def get_response(text):
144
 
 
 
 
145
  # split into chunks
146
  text_splitter = CharacterTextSplitter(
147
  separator="\n",
 
163
  def ask_question(knowledge_base):
164
 
165
 
166
+ # user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
167
+ # {{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc."}}"""
168
+
169
+
170
  user_question = """this content is a web3 project pitch deck. return result as JSON format. Please use the following JSON format to return data. if some fields are incomplete or missing, use 'N/A' to replace it.
171
+ {{"project_name":"this project name","introduction":"project introduction, less than 200 words","slogan":"project slogan","features":"project features","description":"project description","roadmap":"g","fundraising":"fundraising target,round, valuation etc.",contact_email":"project contact email","website":"project official website","twitter":"official twitter","github":"official github","telegram":"official telegram"}}"""
172
 
173
  print("Question:", user_question)
174
 
 
218
  with gr.Blocks(title="Use AI boost your deal flow - Ventureflow") as demo:
219
  gr.Markdown("# Use AI boost your deal flow")
220
  with gr.Tab("Upload Deck"):
221
+ # file_input = gr.File(file_types=[".pdf"])
222
+ upload_button = gr.UploadButton("Click to Upload a Deck(.pdf))", file_types=[".pdf"])
223
  json_output = gr.JSON()
 
224
  upload_button.upload(upload_file, upload_button, json_output)
225
  with gr.Tab("Enter Project website"):
226
  text_input = gr.Textbox(label="Enter Project website")