hadxu committed on
Commit
4ca8caf
1 Parent(s): b37a59b
Files changed (3) hide show
  1. app.py +106 -0
  2. requirements.txt +7 -0
  3. util.py +82 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import gradio as gr
8
+ import os
9
+ import shutil
10
+ from pathlib import Path
11
+ from tempfile import NamedTemporaryFile
12
+ from sklearn.neighbors import NearestNeighbors
13
+ import huggingface_hub
14
+
15
# Credentials come from whatever token huggingface_hub resolves locally.
openai.api_key = huggingface_hub.get_token()
# Send all OpenAI-client traffic to the Hugging Face inference endpoint
# for the Mixtral model instead of the default OpenAI host.
openai.base_url = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1/v1/"
17
+
18
+ from util import pdf_to_text, text_to_chunks, SemanticSearch
19
+
20
# Single module-wide search index; load_recommender() refits it in place.
recommender = SemanticSearch()


def load_recommender(path, start_page=1):
    """Index the PDF at ``path`` into the shared ``recommender``.

    The PDF is converted to per-page text, split into page-labelled chunks,
    and the chunks are handed to ``recommender.fit``.

    Args:
        path: Filesystem path of the PDF to index.
        start_page: 1-based page to start reading from; also used as the
            page number of the first chunk label.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
27
+
28
+
29
def generate_text(prompt, model="gpt-3.5-turbo-16k-0613"):
    """Send ``prompt`` to the chat-completion endpoint and return the reply text.

    Uses the openai>=1.0 module-level client (``openai.chat.completions``),
    which is the API generation that honours the ``openai.base_url`` setting
    configured at the top of this file. The legacy
    ``openai.ChatCompletion.create`` call was removed in openai 1.0.

    Args:
        prompt: Full user prompt to send.
        model: Kept for backward compatibility only. The value is ignored
            because the configured endpoint URL targets the Mixtral model;
            the request always names that model.

    Returns:
        The content string of the first returned choice.
    """
    # NOTE(review): the caller-supplied model is deliberately overridden,
    # as in the original code; the base URL points at this specific model.
    model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "assistant", "content": "Here is some initial assistant message."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # v1 responses are typed objects: use attribute access, not ['content'].
    return response.choices[0].message.content
52
+
53
def generate_answer(question):
    """Answer ``question`` from the chunks retrieved by the shared recommender.

    The prompt lists the retrieved chunks under a "search results" header,
    appends the fixed answering instructions, then the query itself, and is
    passed to ``generate_text``.

    Args:
        question: The user's question.

    Returns:
        Whatever ``generate_text`` returns for the assembled prompt.
    """
    retrieved = recommender(question)

    pieces = ['search results:\n\n']
    pieces.extend(chunk + '\n\n' for chunk in retrieved)
    pieces.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation. "
        "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
    )
    pieces.append(f"{question}\nAnswer:")

    return generate_text(''.join(pieces))
66
+
67
+ import google.generativeai as genai
68
+
69
def question_anwser(chat_history, file, question):
    """Gradio callback: index the uploaded PDF, answer, and extend the chat.

    The upload is copied to a private temporary file, indexed with
    ``load_recommender``, and the question is answered with
    ``generate_answer``. The temporary copy is always removed afterwards,
    including when indexing or answering raises.

    Args:
        chat_history: List of ``[question, answer]`` pairs shown in the
            chatbot; ``None`` is treated as an empty history.
        file: Uploaded file; either an object exposing ``.name`` (a path)
            or a plain path string.
        question: The user's question.

    Returns:
        ``chat_history`` with the new ``[question, answer]`` pair appended.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file first.")
    if chat_history is None:
        chat_history = []

    # Accept both file-like wrappers (with .name) and bare path strings.
    source_path = getattr(file, "name", file)
    suffix = Path(source_path).suffix

    # Reserve a unique path, then close the handle before writing to it so
    # the copy and the later open by the PDF reader do not fight over it.
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp_path = Path(tmp.name)

    try:
        shutil.copyfile(source_path, tmp_path)
        load_recommender(str(tmp_path))
        answer = generate_answer(question)
    finally:
        # delete=False means nobody else cleans this up.
        tmp_path.unlink(missing_ok=True)

    chat_history.append([question, answer])
    return chat_history
79
+
80
title = 'PDF GPT '
description = """ PDF GPT """

# NOTE(review): block nesting below is reconstructed from a diff view that
# lost indentation — confirm the layout against the running app.
# The CSS length needs a unit: a bare ``min-height: 1200`` is invalid and
# is dropped by browsers, so it is written as ``1200px``.
with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200px; }""") as demo:

    gr.Markdown(f'<center><h3>{title}</h3></center>')
    gr.Markdown(description)

    with gr.Row():

        # Input side: PDF upload, question box and submit button.
        with gr.Group():
            with gr.Accordion("URL or pdf file"):
                file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
            question = gr.Textbox(label='Enter your question here')
            btn = gr.Button(value='Submit')

        # Output side: running question/answer history.
        with gr.Group():
            chatbot = gr.Chatbot(label="Chat History", elem_id="chatbot")

    # The callback receives the current history and returns the updated one.
    btn.click(
        question_anwser,
        inputs=[chatbot, file, question],
        outputs=[chatbot],
        api_name="predict",
    )

# Bind on all interfaces.
demo.launch(server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ openai
2
+ PyMuPDF
3
+ numpy
4
+ scikit-learn
5
+ tensorflow-cpu
6
+ tensorflow-hub
7
+ gradio
util.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import fitz
3
+ import numpy as np
4
+ from sklearn.neighbors import NearestNeighbors
5
+ import tensorflow_hub as hub
6
+
7
def preprocess(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces all become one ``' '``. Leading and
    trailing whitespace is collapsed as well but not stripped.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The whitespace-normalised string.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # ``\s`` already matches '\n', so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text)
11
+
12
+
13
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract whitespace-normalised text from a range of PDF pages.

    Args:
        path: Path of the PDF to open.
        start_page: 1-based first page to read.
        end_page: 1-based last page to read (inclusive). ``None`` or a value
            past the end of the document means "through the last page".

    Returns:
        A list with one preprocessed text string per page read, in page order.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count

        # Clamp so an over-long range reads to the end instead of failing.
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even when a page fails to load.
        doc.close()
29
+
30
+
31
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into page-labelled chunks of ``word_length`` tokens.

    Each page is split on single spaces. A trailing fragment shorter than
    ``word_length`` is not emitted on its own page; it is prepended to the
    next page's tokens instead, unless the page is the last one.

    Args:
        texts: One text string per page, in page order.
        word_length: Maximum number of space-separated tokens per chunk.
        start_page: Page number used to label the first entry of ``texts``.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    pages = [page_text.split(' ') for page_text in texts]
    final_index = len(pages) - 1
    labelled = []

    for page_index, tokens in enumerate(pages):
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset:offset + word_length]

            # A short piece can only be the tail of the page; roll it over
            # into the following page so chunks stay close to full length.
            if len(piece) < word_length and page_index != final_index:
                pages[page_index + 1] = piece + pages[page_index + 1]
                continue

            body = ' '.join(piece).strip()
            labelled.append(f'[Page no. {page_index + start_page}] "{body}"')

    return labelled
47
+
48
class SemanticSearch:
    """Nearest-neighbour text search over sentence-encoder embeddings.

    Texts are embedded with a TensorFlow Hub encoder and indexed with
    scikit-learn's ``NearestNeighbors``. Call :meth:`fit` with the corpus
    first, then call the instance with a query string.
    """

    def __init__(self, model_url='https://tfhub.dev/google/universal-sentence-encoder/4'):
        """Load the encoder.

        Args:
            model_url: Handle passed to ``hub.load``; may be a TF Hub URL or
                a local model directory. Defaults to the Universal Sentence
                Encoder v4 handle.
        """
        self.use = hub.load(model_url)
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the neighbour index.

        Args:
            data: Sequence of text strings to index.
            batch: Number of texts embedded per encoder call.
            n_neighbors: Number of results to return per query; capped at
                the number of indexed texts.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            # Fail here with a clear message instead of deep inside np.vstack.
            raise ValueError("Cannot fit SemanticSearch on an empty corpus.")

        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the nearest indexed entries for the query ``text``.

        Args:
            text: Query string.
            return_data: When true, return the matching texts; otherwise
                return their indices into the fitted data.

        Raises:
            RuntimeError: If called before :meth:`fit`.
        """
        if not self.fitted:
            raise RuntimeError("SemanticSearch must be fitted before it can be queried.")

        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results.

        Args:
            texts: Sequence of text strings.
            batch: Number of texts per encoder call.

        Returns:
            The per-batch encoder outputs stacked row-wise with ``np.vstack``.
        """
        embeddings = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)
82
+