giovannefeitosa committed
Commit: f7db77c
Parent(s): ecbf46a
Initial commit
Files changed:
- .gitignore +7 -0
- README.md +3 -3
- commons/Configs.py +21 -0
- commons/File.py +22 -0
- commons/Model.py +26 -0
- commons/OpenAIClient.py +89 -0
- commons/SpacyUtils.py +24 -0
- prepareutils/Dataset.py +56 -0
- prepareutils/Embeddings.py +44 -0
- requirements.txt +15 -0
- serve.py +43 -0
.gitignore
ADDED
@@ -0,0 +1,7 @@
+.env
+.venv
+io/data/original-sample.txt
+**/*.pyc
+**/__pycache__
+io/generated/**
+!io/generated/.gitkeep
README.md
CHANGED
@@ -1,3 +1,3 @@
-
-
-
+# Chatbot about Pele
+
+This is a demo project.
commons/Configs.py
ADDED
@@ -0,0 +1,21 @@
+import os
+
+
+class Configs:
+    def __init__(self):
+        # environment variables
+        os.environ["PROJECT_ROOT"] = os.getcwd()
+        # openai
+        self.OPENAI_KEY = ""
+        self.chatCompletionModel = "gpt-3.5-turbo"
+        self.embeddingsModel = "text-embedding-ada-002"
+        # generated files
+        self.generatedDatasetPath = f"{os.environ['PROJECT_ROOT']}/io/generated/dataset.json"
+        self.generatedEmbeddingsPath = f"{os.environ['PROJECT_ROOT']}/io/generated/embeddings.json"
+        # spacy
+        self.spacyModel = 'en_core_web_sm'
+        # model
+        self.generatedModelPath = f"{os.environ['PROJECT_ROOT']}/io/generated/model.sklearn"
+
+
+configs = Configs()
commons/File.py
ADDED
@@ -0,0 +1,22 @@
+import json
+import os
+
+
+class File:
+    def readFile(self, file):
+        with open(file, 'r') as f:
+            return f.read()
+
+    def readJsonFile(self, file):
+        with open(file, 'r') as f:
+            return json.load(f)
+
+    def writeFile(self, outputFilePath, data):
+        with open(outputFilePath, 'w') as f:
+            f.write(json.dumps(data))
+
+    def exists(self, filePath):
+        return os.path.exists(filePath)
+
+
+file = File()
commons/Model.py
ADDED
@@ -0,0 +1,26 @@
+from sklearn.linear_model import LogisticRegression
+import joblib
+from commons.Configs import configs
+from commons.File import file
+
+
+class Model:
+    def __init__(self, debug=False):
+        self.debug = debug
+
+    def train(self, x, y):
+        return LogisticRegression(solver='lbfgs', random_state=42).fit(x, y)
+
+    def save(self, clf):
+        # save model
+        joblib.dump(clf, configs.generatedModelPath)
+        print("Model saved to: ", configs.generatedModelPath)
+
+    def load(self):
+        if not file.exists(configs.generatedModelPath):
+            print("Model not found at: ", configs.generatedModelPath)
+            exit(1)
+        return joblib.load(configs.generatedModelPath)
+
+
+model = Model()
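Side note (not part of the commit): a minimal round-trip check of the Model wrapper, using toy arrays in place of real embeddings, to show what train/save/load do together.

# Sketch: train on fake 2-D "embeddings", persist, reload, predict.
import numpy as np
from commons.Model import model

X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1]])  # stand-in embeddings
y = np.array([0, 1, 1])                             # one label per row
clf = model.train(X, y)
model.save(clf)        # writes io/generated/model.sklearn
clf = model.load()     # calls exit(1) if the file is missing
print(clf.predict([[0.8, 0.2]]).item())  # expected: 1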
commons/OpenAIClient.py
ADDED
@@ -0,0 +1,89 @@
+import os
+from commons.Configs import configs
+from commons.File import file
+import openai
+from openai.embeddings_utils import cosine_similarity
+import json
+
+
+class OpenAIClient:
+    def __init__(self, debug=False):
+        self.debug = debug
+        openai.api_key = configs.OPENAI_KEY
+        self.embeddingsModel = configs.embeddingsModel
+
+    def buildPrompt(self, name, variables):
+        # used by prepareutils.Dataset
+        promptFilePath = os.path.join(configs.promptsDir, f"{name}.prompt.txt")
+        prompt = file.readFile(promptFilePath)
+        for key, value in variables.items():
+            prompt = prompt.replace(f"{{{key}}}", value)
+        return prompt
+
+    def generateSyntheticQuestions(self, prompt, debugSentence=""):
+        # used by prepareutils.Dataset
+        """Use the OpenAI chat completion API to generate synthetic questions for each sentence"""
+        # ----------------------------------------------
+        # generate questions (responseText)
+        # ----------------------------------------------
+        response = openai.ChatCompletion.create(
+            model=configs.chatCompletionModel,
+            messages=[{"role": "user", "content": prompt}]
+        )
+        responseText = response['choices'][0]['message']['content']
+        # ----------------------------------------------
+        # split questions and answers
+        # ----------------------------------------------
+        # put all question/answer pairs on one line
+        # and remove the response header
+        questionAnswers = responseText.replace("\n", "").split('(Q)', 1)[1]
+        # one line per question/answer
+        questionAnswers = questionAnswers.split('(Q)')
+        # split question and answer
+        questionAnswers = [x.split('(A)', 1) for x in questionAnswers]
+        # remove invalid rows and strip whitespace
+        questionAnswers = [[x[0].strip(), x[1].strip()]
+                           for x in questionAnswers if len(x) == 2]
+        jsonData = [{"question": x[0], "answer": x[1]}
+                    for x in questionAnswers]
+        # ----------------------------------------------
+        # debug
+        if self.debug:
+            print("Sentence: ", debugSentence)
+            print("Response text: ", responseText)
+            print("jsonData: ", json.dumps(jsonData, indent=4))
+        # ----------------------------------------------
+        return jsonData
+
+    def generateEmbeddings(self, sentences):
+        # used by prepareutils.Embeddings
+        response = openai.Embedding.create(
+            input=sentences,
+            model=self.embeddingsModel,
+        )
+        embeddings = []
+        for x in response['data']:
+            embeddings.append(x['embedding'])
+        assert len(embeddings) == len(sentences)
+        return embeddings
+
+    def searchBestEmbeddingIndex(self, embeddedQuestion, embeddingsToSearch):
+        # find the most similar sentence
+        # used by ask.py
+        """Search for the best embedding index"""
+        maxSimilarity = 0
+        maxSimilarityIndex = 0
+        for i, embedding in enumerate(embeddingsToSearch):
+            # similarity = cosineSimilarity(
+            #     np.array(questionEmbedding['data'][0]['embedding']), embedding)
+            similarity = cosine_similarity(embeddedQuestion, embedding)
+            if similarity > maxSimilarity:
+                maxSimilarity = similarity
+                maxSimilarityIndex = i
+        # return the most similar sentence index
+        return maxSimilarityIndex
+        # return the most similar embedding
+        # return df.iloc[maxSimilarityIndex].sentences
+
+
+openaiClient = OpenAIClient()
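For clarity (not in the commit): generateSyntheticQuestions assumes the model replies with a header line followed by "(Q) ... (A) ..." pairs. A small sketch of the parsing, with an invented responseText:

responseText = ("Here are the questions:\n"
                "(Q) Who is Pele? (A) A Brazilian footballer.\n"
                "(Q) When was Pele born? (A) In 1940.")
rows = responseText.replace("\n", "").split('(Q)', 1)[1].split('(Q)')
rows = [x.split('(A)', 1) for x in rows]
rows = [[x[0].strip(), x[1].strip()] for x in rows if len(x) == 2]
# rows == [['Who is Pele?', 'A Brazilian footballer.'],
#          ['When was Pele born?', 'In 1940.']]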
commons/SpacyUtils.py
ADDED
@@ -0,0 +1,24 @@
+import spacy
+from commons.Configs import configs
+
+
+class SpacyUtils:
+    def __init__(self, debug=False):
+        self.debug = debug
+
+    # Receives a raw text and returns an array of sentences
+    def splitSentences(self, text):
+        """Split text into sentences"""
+        nlp = self.spacyLoad()
+        doc = nlp(text)
+        return [str(sent.text).replace('"', '') for sent in doc.sents]
+
+    # Returns a spacy.load() model
+    def spacyLoad(self):
+        """Load spacy model"""
+        if not hasattr(self, 'spacyInstance'):
+            self.spacyInstance = spacy.load(configs.spacyModel)
+        return self.spacyInstance
+
+
+spacyUtils = SpacyUtils()
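Note (not stated in the commit): spacy.load('en_core_web_sm') only works after that model has been downloaded, typically with python -m spacy download en_core_web_sm; requirements.txt installs spacy itself but not the model.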
prepareutils/Dataset.py
ADDED
@@ -0,0 +1,56 @@
+from tqdm import tqdm
+from commons.Configs import configs
+from commons.File import file
+from commons.OpenAIClient import openaiClient
+from commons.SpacyUtils import spacyUtils
+
+
+class Dataset:
+    def __init__(self, debug=False):
+        self.debug = debug
+
+    # Receives an <inputFile>,
+    # generates synthetic questions and answers,
+    # and saves them to <outputFile>
+    def generateDatasetFromFile(self, inputFile):
+        outputFile = configs.generatedDatasetPath
+        # allQaRows is an array where each item is a dict with {"question","answer"} keys
+        # ? should I use a list of tuples instead?
+        allQaRows = []
+        print("Reading input file: ", inputFile)
+        text = file.readFile(inputFile)
+        # split text into sentences and augment each sentence with synthetic questions and answers
+        print("Generating questions and answers for each sentence")
+        for sent in tqdm(spacyUtils.splitSentences(text)):
+            prompt = openaiClient.buildPrompt("generateQuestionsPerson", {
+                'NAME': configs.PROMPT_PERSON_NAME,
+                'SOCIALNAME': configs.PROMPT_PERSON_SOCIALNAME,
+                'TITLE': configs.PROMPT_PERSON_TITLE,
+                'HESHEIT': configs.PROMPT_PERSON_HESHEIT,
+                'BIRTHDAY': configs.PROMPT_PERSON_BIRTHDAY,
+                'DEATHDAY': configs.PROMPT_PERSON_DEATHDAY,
+                'BIRTHPLACE': configs.PROMPT_PERSON_BIRTHPLACE,
+                'DEATHPLACE': configs.PROMPT_PERSON_DEATHPLACE,
+                'NUMBER_OF_QUESTIONS': configs.PROMPT_PERSON_NUMBER_OF_QUESTIONS,
+                'SENTENCE': sent
+            })
+            genq = openaiClient.generateSyntheticQuestions(
+                prompt, debugSentence=sent)
+            allQaRows.extend(genq)
+            # debug
+            if self.debug:
+                for x in genq:
+                    print("Sentence: ", sent)
+                    print("Q: ", x['question'])
+                    print("A: ", x['answer'])
+        # save all the generated questions and answers in a generated dataset file
+        # Default: io/generated/dataset.json
+        print("Writing dataset to file: ", outputFile)
+        file.writeFile(outputFile, allQaRows)
+
+    def loadDataset(self):
+        inputFilePath = configs.generatedDatasetPath
+        return file.readJsonFile(inputFilePath)
+
+
+dataset = Dataset()
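A hypothetical driver for this step (the actual prepare script is not part of this commit, and neither are the configs.PROMPT_PERSON_* values nor the prompt template file it relies on):

from prepareutils.Dataset import dataset

# io/data/original-sample.txt is the raw-text path hinted at by .gitignore
dataset.generateDatasetFromFile("io/data/original-sample.txt")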
prepareutils/Embeddings.py
ADDED
@@ -0,0 +1,44 @@
+from tqdm import tqdm
+import numpy as np
+from commons.Configs import configs
+from commons.File import file
+from commons.OpenAIClient import openaiClient
+
+
+class Embeddings:
+    def __init__(self, debug=False):
+        self.debug = debug
+
+    def generateEmbeddings(self):
+        inputFilePath = configs.generatedDatasetPath
+        outputFilePath = configs.generatedEmbeddingsPath
+        dataset = file.readJsonFile(inputFilePath)
+        embeddings = []
+        print("")
+        # for each sentence
+        for i, qa in enumerate(tqdm(dataset)):
+            sentences = [qa['question'], qa['answer']]
+            emb = openaiClient.generateEmbeddings(sentences)
+            embjson = {'question': emb[0], 'answer': emb[1], 'label': i}
+            print("Sentence: ", i, sentences)
+            embeddings.append(embjson)
+        # save all the generated embeddings
+        # Default: io/generated/embeddings.json
+        print("Writing embeddings to file: ", outputFilePath)
+        file.writeFile(outputFilePath, embeddings)
+
+    def loadEmbeddings(self):
+        inputFilePath = configs.generatedEmbeddingsPath
+        embeddings = file.readJsonFile(inputFilePath)
+        questionEmbeddings = [x['question'] for x in embeddings]
+        answerEmbeddings = [x['answer'] for x in embeddings]
+        labels = [x['label'] for x in embeddings]
+        # I would use float16, but I've had issues with GPUs
+        # I know I'm not using a GPU now, but I might in the future
+        return \
+            np.array(questionEmbeddings, dtype=np.float32), \
+            np.array(answerEmbeddings, dtype=np.float32), \
+            np.array(labels, dtype=np.int32)
+
+
+embeddings = Embeddings()
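Putting the pieces together, a sketch (assumed, not in this commit) of the preparation pipeline once dataset.json exists: embed every QA row, then fit the classifier that serve.py later loads. Each label is the dataset row index, which is how clf.predict maps a question back to its answer.

from prepareutils.Embeddings import embeddings
from commons.Model import model

embeddings.generateEmbeddings()   # dataset.json -> embeddings.json (OpenAI API)
questionEmb, answerEmb, labels = embeddings.loadEmbeddings()
clf = model.train(questionEmb, labels)   # label i == dataset row i
model.save(clf)                          # -> io/generated/model.sklearn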
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+# openai
+openai==0.27.2
+scipy
+
+# spacy https://pypi.org/project/spacy/
+setuptools
+wheel
+spacy
+
+# model
+scikit-learn
+numpy
+
+# demo webserver
+gradio
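Setup is the usual pip install -r requirements.txt; as noted under SpacyUtils above, the en_core_web_sm model is a separate download.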
serve.py
ADDED
@@ -0,0 +1,43 @@
+import gradio as gr
+from commons.Model import model
+from commons.Configs import configs
+from commons.OpenAIClient import openaiClient
+from prepareutils.Dataset import dataset
+import numpy as np
+import openai
+
+# load the model once
+clf = model.load()
+# load dataset
+qaDataset = dataset.loadDataset()
+
+
+def predict(question, openaiKey):
+    # set openaiKey
+    configs.OPENAI_KEY = openaiKey
+    openai.api_key = openaiKey
+    # embed question
+    questionEmbedding = openaiClient.generateEmbeddings([question])[0]
+    # predict answer index
+    answerIndex = clf.predict([questionEmbedding]).item()
+    # get answer
+    bestAnswer = qaDataset[answerIndex]
+    return bestAnswer["answer"]
+
+
+def randomExamples(numberOfExamples=15):
+    # create random indexes in the range between 0 and len(qaDataset)
+    randomIndexes = np.random.randint(0, len(qaDataset), numberOfExamples)
+    examples = []
+    for index in randomIndexes:
+        question = qaDataset[index]["question"]
+        examples.append([question])
+    return examples
+
+
+gr.Interface(
+    fn=predict,
+    inputs=["text", "text"],
+    outputs="text",
+    examples=randomExamples(),
+).launch()
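Running python serve.py loads the trained classifier and the generated dataset, then launches a Gradio interface with two text inputs (the question and the caller's OpenAI API key) and example questions sampled at random from the dataset.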