Curranj committed
Commit 1f012c1
1 Parent(s): 3f6479a

Create app.py

Files changed (1)
  1. app.py +122 -0
app.py ADDED
@@ -0,0 +1,122 @@
+ # NOTE: this app uses the pre-1.0 openai Python client (openai.Embedding / openai.ChatCompletion)
+ import openai
+ import sqlite3
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import os
+ import gradio as gr
+ from docx import Document
+ from PyPDF2 import PdfFileReader
+ import re
+
+ # Set OpenAI API key from environment variable
+ openai.api_key = os.environ["Secret"]
+
+ def find_closest_neighbors(vector1, dictionary_of_vectors):
+     # Embed the query text, then rank the stored chunk embeddings by cosine
+     # similarity and return the top four (text, score) pairs
+     vector = openai.Embedding.create(
+         input=vector1,
+         engine="text-embedding-ada-002"
+     )['data'][0]['embedding']
+     vector = np.array(vector)
+
+     cosine_similarities = {}
+     for key, value in dictionary_of_vectors.items():
+         cosine_similarities[key] = cosine_similarity(vector.reshape(1, -1), value.reshape(1, -1))[0][0]
+
+     sorted_cosine_similarities = sorted(cosine_similarities.items(), key=lambda x: x[1], reverse=True)
+     return sorted_cosine_similarities[0:4]
+
+ def extract_words_from_docx(filename):
+     # Read every paragraph of the .docx and split the text into individual words
+     doc = Document(filename)
+     full_text = []
+     for paragraph in doc.paragraphs:
+         full_text.append(paragraph.text)
+     text = '\n'.join(full_text)
+     return re.findall(r'\b\w+\b', text)
+
+ def extract_words_from_pdf(filename):
+     # Read every page of the PDF and split the text into individual words
+     # (PdfFileReader / extractText are the legacy PyPDF2 pre-3.0 API)
+     with open(filename, "rb") as file:
+         pdf = PdfFileReader(file)
+         text = ""
+         for page_num in range(pdf.getNumPages()):
+             text += pdf.getPage(page_num).extractText()
+     return re.findall(r'\b\w+\b', text)
+
+ def process_file(file_obj):
+     if file_obj is not None:
+         # Determine file type
+         if file_obj.name.endswith('.docx'):
+             words = extract_words_from_docx(file_obj.name)
+         elif file_obj.name.endswith('.pdf'):
+             words = extract_words_from_pdf(file_obj.name)
+         else:
+             return "Unsupported file type."
+
+         # Chunk the words into 200-word chunks, embed each chunk, and add it to the database
+         conn = sqlite3.connect('text_chunks_with_embeddings (1).db')
+         cursor = conn.cursor()
+
+         chunks = [" ".join(words[i:i+200]) for i in range(0, len(words), 200)]
+         for chunk in chunks:
+             embedding = openai.Embedding.create(input=chunk, engine="text-embedding-ada-002")['data'][0]['embedding']
+             # Store each embedding as a space-separated string of floats
+             embedding_str = " ".join(map(str, embedding))
+             cursor.execute("INSERT INTO chunks (text, embedding) VALUES (?, ?)", (chunk, embedding_str))
+
+         conn.commit()
+         conn.close()
+         return "File processed and added to database."
+
+     return "No file uploaded."
+
+ def predict(message, history, file_obj=None):
+     # If there's a file, process it first so its chunks are available as context
+     if file_obj:
+         process_file(file_obj)
+
+     # Connect to the database and load every stored chunk with its embedding
+     conn = sqlite3.connect('text_chunks_with_embeddings (1).db')
+     cursor = conn.cursor()
+     cursor.execute("SELECT text, embedding FROM chunks")
+     rows = cursor.fetchall()
+
+     dictionary_of_vectors = {}
+     for row in rows:
+         text = row[0]
+         embedding_str = row[1]
+         embedding = np.fromstring(embedding_str, sep=' ')
+         dictionary_of_vectors[text] = embedding
+     conn.close()
+
+     # Pull the chunks closest to the user's message and use them as context
+     match_list = find_closest_neighbors(message, dictionary_of_vectors)
+     context = ''
+     for match in match_list:
+         context += str(match[0])
+     context = context[:1500]  # Limit context to 1500 characters
+
+     prep = f"This is an OpenAI model designed to answer questions specific to grant-making applications for an aquarium. Here is some question-specific context: {context}. Q: {message} A: "
+
+     # Rebuild the chat history in the OpenAI message format and append the new prompt
+     history_openai_format = []
+     for human, assistant in history:
+         history_openai_format.append({"role": "user", "content": human})
+         history_openai_format.append({"role": "assistant", "content": assistant})
+     history_openai_format.append({"role": "user", "content": prep})
+
+     response = openai.ChatCompletion.create(
+         model='gpt-4',
+         messages=history_openai_format,
+         temperature=1.0,
+         stream=True
+     )
+
+     # Stream the reply back incrementally; the first streamed delta only carries the
+     # role, so append a chunk only when it actually contains content
+     partial_message = ""
+     for chunk in response:
+         if 'content' in chunk['choices'][0]['delta']:
+             partial_message += chunk['choices'][0]['delta']['content']
+             yield partial_message
+
+ # Chat UI with an optional file-upload component; gr.ChatInterface supplies the
+ # (message, history) pair and streams the generator output from predict()
+ gr.ChatInterface(
+     fn=predict,
+     additional_inputs=[gr.File(label="Upload PDF or DOCX file (optional)")]
+ ).launch()
+
+
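Note on the database this app expects: predict() and process_file() read from and insert into a `chunks` table inside `text_chunks_with_embeddings (1).db`, but nothing in this commit creates that table, so the database file is assumed to already ship with the Space. The snippet below is a minimal bootstrap sketch, not part of the commit: it creates the smallest schema that satisfies the INSERT and SELECT statements above. The table and column names come from the code; the column types and the bootstrap step itself are assumptions.

# Minimal bootstrap sketch (assumption): create the `chunks` table that app.py expects,
# in case the bundled SQLite database is missing or empty.
import sqlite3

conn = sqlite3.connect('text_chunks_with_embeddings (1).db')
conn.execute("""
    CREATE TABLE IF NOT EXISTS chunks (
        text TEXT,       -- one 200-word chunk of an uploaded document
        embedding TEXT   -- its embedding, stored as space-separated floats
    )
""")
conn.commit()
conn.close()

With that table in place, process_file() can append newly embedded chunks and predict() can load them back with np.fromstring().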