0xleec committed on
Commit
52e3677
•
1 Parent(s): 80e7536
Files changed (3) hide show
  1. README.md +6 -8
  2. app.py +205 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: Ai Deal Demo
3
- emoji: 🦀
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 3.32.0
8
  app_file: app.py
9
  pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: AI Deal Source Loader
3
+ emoji: 🧐
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.30.0
8
  app_file: app.py
9
  pinned: false
10
+ ---
 
 
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from dotenv import load_dotenv
3
+ import gradio as gr
4
+ import os
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.chains.question_answering import load_qa_chain
10
+ from langchain.chat_models import ChatOpenAI
11
+ from langchain.callbacks import get_openai_callback
12
+ from requests.exceptions import Timeout
13
+ import requests
14
+ from bs4 import BeautifulSoup
15
+ from urllib.parse import urlparse, urljoin
16
+ import time
17
+ import random
18
+ import os
19
+
20
# Load variables from a local .env file BEFORE reading the environment,
# otherwise a key that is only defined in .env is not visible to os.getenv.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Module-level placeholder; nothing in the visible code assigns to it.
knowledge_base = None


# Browser-like User-Agent sent with every crawl request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
31
+
32
+
33
def get_internal_links(url):
    """Return the unique same-domain links found on the page at ``url``.

    Every ``<a href>`` on the page is resolved against ``url``. A link is
    kept when it uses http(s) and its host equals the host of ``url``.
    URL fragments are removed so ``/page#a`` and ``/page#b`` count as one
    page, and links are returned in the order they first appear.

    Args:
        url: Absolute URL of the page to scan.

    Returns:
        A list of absolute URLs without duplicates.

    Raises:
        Whatever ``requests.get`` raises (for example on timeout).
    """
    from urllib.parse import urldefrag

    print('start get internal links')
    domain = urlparse(url).netloc  # host of the site being crawled
    response = requests.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(response.content, 'html.parser')

    # A dict is used as an insertion-ordered set.
    seen = {}
    for a in soup.find_all('a', href=True):
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        link, _fragment = urldefrag(urljoin(url, a['href']))
        parsed = urlparse(link)
        if parsed.scheme in ('http', 'https') and parsed.netloc == domain:
            seen.setdefault(link, None)

    internal_links = list(seen)
    print(internal_links)
    return internal_links
51
+
52
+
53
+
54
+
55
def get_page_content(url):
    """Download ``url`` and return the text extracted from its HTML.

    After the download the function sleeps for a random whole number of
    seconds between 1 and 3 before returning, which spaces out successive
    requests made by a caller that loops over many pages.
    """
    page = requests.get(url, headers=headers, timeout=5)
    page_text = BeautifulSoup(page.content, 'html.parser').get_text()

    # Pause between requests.
    pause_seconds = random.randint(1, 3)
    time.sleep(pause_seconds)
    return page_text
63
+
64
def crawl_site(url):
    """Return the concatenated text of every internal link found on ``url``.

    Only the links discovered on the start page are visited; pages linked
    from those pages are not followed. A page that fails with a
    ``requests`` error is logged and skipped so that the text already
    collected is not lost.

    Args:
        url: Absolute URL of the start page.

    Returns:
        The text of all successfully fetched pages joined together.

    Raises:
        Whatever ``get_internal_links`` raises for the start page itself.
    """
    page_texts = []
    for link in get_internal_links(url):
        try:
            page_texts.append(get_page_content(link))
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f'Skipping {link}: {exc}')
            continue
        print(f'Page content for {link}:\n')
    return "".join(page_texts)
78
+
79
+
80
def decode_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF.

    ``extract_text`` already yields ``str``, so no per-encoding retry is
    needed. A single utf-8 round trip with ``errors='ignore'`` is kept to
    drop characters that cannot be encoded (such as lone surrogates), which
    would otherwise raise ``UnicodeEncodeError``.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of all pages joined in page order.
    """
    with open(file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        # Extract while the file object handed to the reader is still open.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    text = "".join(page_texts)
    return text.encode('utf-8', 'ignore').decode('utf-8')
96
+
97
+
98
def get_pdf_response(file):
    """Extract the text of a PDF and run the question-answering pipeline.

    Args:
        file: Path of the PDF file, or ``None`` when nothing was supplied.

    Returns:
        The result of ``get_response`` for the PDF text, or an
        ``{"error": ...}`` dict when ``file`` is ``None``.
    """
    if file is None:
        # Previously this path fell through to an unbound local variable.
        return {"error": "No file provided"}

    return get_response(decode_pdf(file))
103
+
104
def get_website_response(url):
    """Crawl a project website and run the question-answering pipeline.

    Args:
        url: Website address typed by the user. Surrounding whitespace is
            ignored, and ``https://`` is assumed when no scheme is given.

    Returns:
        The result of ``get_response`` for the crawled text, or an
        ``{"error": ...}`` dict when the URL is blank or the site cannot
        be fetched.
    """
    url = (url or "").strip()
    if not url:
        return {"error": "Please enter a website URL"}
    if "://" not in url:
        # requests rejects URLs without a scheme.
        url = "https://" + url

    try:
        content = crawl_site(url)
    except requests.RequestException as exc:
        return {"error": f"Could not fetch {url}: {exc}"}

    return get_response(content)
109
+
110
+
111
def get_response(text):
    """Index ``text`` in a vector store and ask the extraction question.

    The text is split on newlines into chunks of up to 1000 characters with
    a 200-character overlap, embedded with ``OpenAIEmbeddings``, stored in a
    FAISS index and handed to ``ask_question``.

    Args:
        text: Raw text extracted from a PDF or a website.

    Returns:
        The result of ``ask_question``, or an ``{"error": ...}`` dict when
        there is no text to analyze.
    """
    if not text or not text.strip():
        # Nothing to embed or query.
        return {"error": "No text content to analyze"}

    # Log the size rather than dumping the whole document to stdout.
    print(f"Text length: {len(text)} characters")

    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        return {"error": "No text content to analyze"}

    # create embeddings
    embeddings = OpenAIEmbeddings()

    # Local name chosen so the module-level ``knowledge_base`` is not shadowed.
    vector_store = FAISS.from_texts(chunks, embeddings)

    return ask_question(vector_store)
132
+
133
+
134
def ask_question(knowledge_base):
    """Ask the LLM to summarize the indexed pitch deck as JSON.

    The fixed extraction prompt is used both to retrieve the most similar
    chunks from ``knowledge_base`` and as the question sent to a "stuff"
    question-answering chain backed by ``gpt-3.5-turbo``.

    Args:
        knowledge_base: Vector store exposing ``similarity_search``.

    Returns:
        The parsed JSON answer of the model, or an ``{"error": ...}`` dict
        when the answer is not valid JSON or the request times out.
    """
    # The template is the question *value*, not a format string, so it uses
    # single braces, and every key is properly quoted so the example itself
    # is valid JSON.
    user_question = (
        "this content is a web3 project pitch deck. return result as JSON format. "
        "Please use the following JSON format to return data. "
        "if some fields are incomplete or missing, use 'N/A' to replace it.\n"
        '{"project_name":"this project name",'
        '"introduction":"project introduction, less than 200 words",'
        '"slogan":"project slogan",'
        '"features":"project features",'
        '"description":"project description",'
        '"roadmap":"project roadmap",'
        '"fundraising":"fundraising target,round, valuation etc.",'
        '"contact_email":"project contact email",'
        '"website":"project official website",'
        '"twitter":"official twitter",'
        '"github":"official github",'
        '"telegram":"official telegram",'
        '"team_member":"team member list, include name, position, '
        'introduction, twitter, github, telegram, etc."}'
    )

    print("Question:", user_question)

    docs = knowledge_base.similarity_search(user_question)

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
    chain = load_qa_chain(llm, chain_type="stuff")

    try:
        with get_openai_callback() as cb:
            answer = chain.run(input_documents=docs, question=user_question)
            print(f"Total Tokens: {cb.total_tokens}")
            print(f"Prompt Tokens: {cb.prompt_tokens}")
            print(f"Completion Tokens: {cb.completion_tokens}")
            print(f"Total Cost (USD): ${cb.total_cost}")

        print("Answer:", answer)

        # Return the parsed object so the success and error paths both
        # yield structured data instead of a raw string.
        response = json.loads(answer)
    except json.decoder.JSONDecodeError:
        response = {"error": "Data can't found"}
    except Timeout:
        # NOTE(review): this is requests' Timeout; confirm the OpenAI client
        # surfaces timeouts as this type.
        response = {"error": "Request timeout, please try again"}

    print(json.dumps(response, ensure_ascii=False))
    return response
169
+
170
+
171
+
172
+
173
def upload_file(file):
    """Handle an uploaded deck: log its size and return the analysis.

    ``file`` is expected to expose the on-disk location of the upload via
    its ``name`` attribute; that path is passed to ``get_pdf_response``.
    """
    path = file.name
    print("File size:", os.path.getsize(path))
    return get_pdf_response(path)
181
+
182
+
183
+
184
+
185
# Two-tab UI: analyze an uploaded PDF deck, or crawl a project website.
with gr.Blocks(title="Use AI boost your deal flow - Ventureflow") as demo:
    gr.Markdown("# Use AI boost your deal flow")
    with gr.Tab("Upload Deck"):
        # NOTE(review): this File component is displayed but not wired to
        # any event; uploads are handled by the button below.
        file_input = gr.File(file_types=[".pdf"])
        json_output = gr.JSON()
        # Restrict the picker to PDFs, matching what decode_pdf can read.
        upload_button = gr.UploadButton(
            "Click to Upload a Deck (.pdf)",
            file_types=[".pdf"],
        )
        upload_button.upload(upload_file, upload_button, json_output)
    with gr.Tab("Enter Project website"):
        text_input = gr.Textbox(label="Enter Project website")
        json_output = gr.JSON()
        submit_button = gr.Button("Click to Submit")
        submit_button.click(get_website_response, text_input, json_output)
    # Built from unindented pieces so the Markdown never picks up leading
    # whitespace from the surrounding code.
    gr.Markdown(
        "\n"
        "## Links\n"
        "- Website: [Ventureflow.xyz](https://ventureflow.xyz)\n"
        "- Twitter: [@VentureFlow_xyz](https://twitter.com/VentureFlow_xyz)\n"
        "- App: [app.ventureflow.xyz](https://app.ventureflow.xyz)\n"
        "- Docs: [docs.ventureflow.xyz](https://docs.ventureflow.xyz)\n"
    )

# Launch only when run as a script so importing the module has no side effect.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv==1.0.0
2
+ gradio==3.30.0
3
+ gradio_client==0.2.4
4
+ PyPDF2==3.0.1
5
+ langchain==0.0.137
6
+ requests==2.28.2
7
+ bs4==0.0.1
8
+ openai==0.27.4
9
+ tiktoken==0.3.3
10
+ faiss-cpu==1.7.4