Qifan Zhang committed on
Commit
f89a7d8
1 Parent(s): d487adb

add read web, text; update truncation

Browse files
Files changed (6) hide show
  1. .gitignore +2 -1
  2. app.py +35 -5
  3. requirements.txt +5 -1
  4. utils/chatgpt.py +0 -8
  5. utils/read_web.py +19 -0
  6. utils/truncate.py +9 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
- .idea/
 
 
1
+ .idea/
2
+ data/
app.py CHANGED
@@ -2,16 +2,46 @@ import gradio as gr
2
 
3
  from utils.chatgpt import ChatGPTAPI
4
  from utils.read_pdf import read_pdf
 
 
5
 
6
 
7
- def process(api_key: str = '', prompt: str = '', file=None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  chatgpt = ChatGPTAPI(api_key, max_input_length=1024)
9
 
10
- pdf_contents = read_pdf(file.name)
11
- pdf_str = '\n'.join(pdf_contents)
12
- content = prompt + '\n' + pdf_str
 
 
13
  response = chatgpt(content)
14
  return response
15
 
16
 
17
- gr.Interface(fn=process, inputs=["text", "text", "file"], outputs="text").launch()
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from utils.chatgpt import ChatGPTAPI
4
  from utils.read_pdf import read_pdf
5
+ from utils.read_web import read_web
6
+ from utils.truncate import truncate_string
7
 
8
 
9
def file2str(filepath: str) -> str:
    """Read an uploaded file and return its text, truncated for the prompt.

    Args:
        filepath: Path of the uploaded file. A falsy value means "no file".

    Returns:
        The file's text after ``truncate_string(..., max_length=1024)``,
        or ``''`` when no path is given.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.txt``.
    """
    if not filepath:
        return ''
    # Compare on a lower-cased copy so 'REPORT.PDF' is accepted as well.
    lowered = filepath.lower()
    if lowered.endswith('.pdf'):
        text = '\n'.join(read_pdf(filepath))
    elif lowered.endswith('.txt'):
        with open(filepath, 'r') as f:
            # read() yields a single str; readlines() would hand a list to
            # truncate_string, which expects a string to tokenize.
            text = f.read()
    else:
        raise ValueError('File type not supported')
    return truncate_string(text, max_length=1024)
22
+
23
+
24
def process(api_key: str = '', prompt: str = '', file=None, url='') -> str:
    """Build one prompt from the instruction, the file and the web page, and query ChatGPT.

    Args:
        api_key: Key handed to ``ChatGPTAPI``.
        prompt: Instruction text placed first in the request.
        file: Uploaded file object (its ``.name`` is read), or a falsy value.
        url: Address passed to ``read_web``.

    Returns:
        Whatever the ``ChatGPTAPI`` instance returns for the combined content.
    """
    chatgpt = ChatGPTAPI(api_key, max_input_length=1024)

    # Each source is truncated on its own before the pieces are joined.
    file_text = '' if not file else file2str(file.name)
    web_txt = truncate_string(read_web(url), max_length=1024)

    content = '\n'.join([prompt, file_text, web_txt])
    return chatgpt(content)
34
 
35
 
36
# Prompt box pre-filled with a default instruction
# (the Chinese text means "Summarize the following article in Chinese").
prompt_input = gr.components.Textbox(value='用中文总结下面的文章', lines=2, type="text")

# Inputs are positional and line up with process(api_key, prompt, file, url).
app = gr.Interface(fn=process, inputs=["text", prompt_input, "file", "text"], outputs="text")
app.launch()
requirements.txt CHANGED
@@ -1,5 +1,9 @@
1
  openai
 
 
2
  gradio
 
3
  pypdf
4
- tiktoken
 
5
 
 
1
  openai
2
+ tiktoken
3
+
4
  gradio
5
+
6
  pypdf
7
+ requests
8
+ bs4
9
 
utils/chatgpt.py CHANGED
@@ -1,5 +1,4 @@
1
  import openai
2
- import tiktoken
3
 
4
 
5
  class ChatGPTAPI:
@@ -12,17 +11,10 @@ class ChatGPTAPI:
12
 
13
  openai.api_key = api_key
14
  self.max_input_length = max_input_length
15
- self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
16
-
17
- def truncate_string(self, s):
18
- e = self.encoding.encode(s)[:self.max_input_length]
19
- s = self.encoding.decode(e)
20
- return s
21
 
22
  def __call__(self, content: str):
23
  assert isinstance(content, str), 'ChatGPT Error: content must be a string'
24
  content = content.strip()
25
- content = self.truncate_string(content)
26
  messages = [{'role': 'user', 'content': content}]
27
  try:
28
  resp = openai.ChatCompletion.create(
 
1
  import openai
 
2
 
3
 
4
  class ChatGPTAPI:
 
11
 
12
  openai.api_key = api_key
13
  self.max_input_length = max_input_length
 
 
 
 
 
 
14
 
15
  def __call__(self, content: str):
16
  assert isinstance(content, str), 'ChatGPT Error: content must be a string'
17
  content = content.strip()
 
18
  messages = [{'role': 'user', 'content': content}]
19
  try:
20
  resp = openai.ChatCompletion.create(
utils/read_web.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+
6
+
7
def read_web(url: str, timeout: float = 10.0) -> str:
    """Download a web page and return its visible text.

    Args:
        url: Address to fetch. A falsy value short-circuits to ``''``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the request may block indefinitely.

    Returns:
        The page text extracted by BeautifulSoup, with every run of three or
        more newlines collapsed to a single blank line.
    """
    if not url:
        return ''
    resp = requests.get(url, timeout=timeout)

    # requests falls back to ISO-8859-1 when the server declares no charset,
    # which garbles non-Latin pages. Trust the header only when it actually
    # names a charset; otherwise let BeautifulSoup sniff the raw bytes.
    content_type = resp.headers.get('Content-Type', '')
    declared = resp.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=declared)

    text = soup.get_text()
    return re.sub(r'\n{3,}', '\n\n', text)
15
+
16
+
17
if __name__ == '__main__':
    # Manual smoke test: fetch a sample page and print the extracted text.
    print(read_web('https://en.wikipedia.org/wiki/Wiki'))
utils/truncate.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
# Tokenizer for gpt-3.5-turbo, created once at import time and shared by every call.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def truncate_string(s, max_length=1024) -> str:
    """Return ``s`` cut down to at most ``max_length`` tokens.

    The string is encoded with the module-level tokenizer, the token list is
    sliced to its first ``max_length`` entries, and the slice is decoded back
    to text.
    """
    tokens = encoding.encode(s)
    kept = tokens[:max_length]
    return encoding.decode(kept)