RomyMy committed on
Commit
4304dbd
•
1 Parent(s): 116461b

fix code 6

Browse files
Files changed (3) hide show
  1. app.py +11 -15
  2. constants.py +14 -0
  3. preprocess.py +32 -28
app.py CHANGED
@@ -1,9 +1,6 @@
1
- import os
2
-
3
  import numpy as np
4
  import redis
5
  import streamlit as st
6
- from dotenv import load_dotenv
7
  from langchain import HuggingFaceHub
8
  from langchain.chains import LLMChain
9
  from langchain.chat_models import ChatOpenAI
@@ -17,18 +14,17 @@ from constants import (
17
  FALCON_MAX_TOKENS,
18
  FALCON_REPO_ID,
19
  FALCON_TEMPERATURE,
 
 
 
20
  OPENAI_MODEL_NAME,
21
  OPENAI_TEMPERATURE,
22
  TEMPLATE_1,
23
  TEMPLATE_2,
 
24
  )
25
  from database import create_redis
26
 
27
- load_dotenv()
28
- HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
29
- ITEM_KEYWORD_EMBEDDING = "item_vector"
30
- TOPK = 5
31
-
32
 
33
  # connect to redis database
34
  @st.cache_resource()
@@ -54,15 +50,12 @@ def encode_keywords_chain():
54
 
55
 
56
  # the present products chain
57
- @st.cache_resource()
58
  def present_products_chain():
59
  template = TEMPLATE_2
60
  memory = ConversationBufferMemory(memory_key="chat_history")
61
  prompt = PromptTemplate(input_variables=["chat_history", "user_msg"], template=template)
62
  chain = LLMChain(
63
- llm=ChatOpenAI(
64
- openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=OPENAI_TEMPERATURE, model=OPENAI_MODEL_NAME
65
- ),
66
  prompt=prompt,
67
  verbose=False,
68
  memory=memory,
@@ -81,7 +74,11 @@ def main():
81
  st.caption("🤖 Powered by Falcon Open Source AI model")
82
  redis_conn = connect_to_redis()
83
  keywords_chain = encode_keywords_chain()
84
- chat_chain = present_products_chain()
 
 
 
 
85
  embedding_model = instance_embedding_model()
86
 
87
  if "messages" not in st.session_state:
@@ -102,7 +99,6 @@ def main():
102
  query_vector = embedding_model.encode(keywords)
103
  query_vector_bytes = np.array(query_vector).astype(np.float32).tobytes()
104
  # prepare the query
105
-
106
  q = (
107
  Query(f"*=>[KNN {TOPK} @{ITEM_KEYWORD_EMBEDDING} $vec_param AS vector_score]")
108
  .sort_by("vector_score")
@@ -116,7 +112,7 @@ def main():
116
  result_output = ""
117
  for product in results.docs:
118
  result_output += f"product_name:{product.item_name}, product_description:{product.item_keywords} \n"
119
- result = chat_chain.predict(user_msg=f"{result_output}\n{prompt}")
120
  st.session_state.messages.append({"role": "assistant", "content": result})
121
  st.chat_message("assistant").write(result)
122
 
 
 
 
1
  import numpy as np
2
  import redis
3
  import streamlit as st
 
4
  from langchain import HuggingFaceHub
5
  from langchain.chains import LLMChain
6
  from langchain.chat_models import ChatOpenAI
 
14
  FALCON_MAX_TOKENS,
15
  FALCON_REPO_ID,
16
  FALCON_TEMPERATURE,
17
+ HUGGINGFACEHUB_API_TOKEN,
18
+ ITEM_KEYWORD_EMBEDDING,
19
+ OPENAI_API_KEY,
20
  OPENAI_MODEL_NAME,
21
  OPENAI_TEMPERATURE,
22
  TEMPLATE_1,
23
  TEMPLATE_2,
24
+ TOPK,
25
  )
26
  from database import create_redis
27
 
 
 
 
 
 
28
 
29
  # connect to redis database
30
  @st.cache_resource()
 
50
 
51
 
52
  # the present products chain
 
53
  def present_products_chain():
54
  template = TEMPLATE_2
55
  memory = ConversationBufferMemory(memory_key="chat_history")
56
  prompt = PromptTemplate(input_variables=["chat_history", "user_msg"], template=template)
57
  chain = LLMChain(
58
+ llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=OPENAI_TEMPERATURE, model=OPENAI_MODEL_NAME),
 
 
59
  prompt=prompt,
60
  verbose=False,
61
  memory=memory,
 
74
  st.caption("🤖 Powered by Falcon Open Source AI model")
75
  redis_conn = connect_to_redis()
76
  keywords_chain = encode_keywords_chain()
77
+
78
+ if "window_refreshed" not in st.session_state:
79
+ st.session_state.window_refreshed = True
80
+ st.session_state.chat_chain = present_products_chain()
81
+
82
  embedding_model = instance_embedding_model()
83
 
84
  if "messages" not in st.session_state:
 
99
  query_vector = embedding_model.encode(keywords)
100
  query_vector_bytes = np.array(query_vector).astype(np.float32).tobytes()
101
  # prepare the query
 
102
  q = (
103
  Query(f"*=>[KNN {TOPK} @{ITEM_KEYWORD_EMBEDDING} $vec_param AS vector_score]")
104
  .sort_by("vector_score")
 
112
  result_output = ""
113
  for product in results.docs:
114
  result_output += f"product_name:{product.item_name}, product_description:{product.item_keywords} \n"
115
+ result = st.session_state.chat_chain.predict(user_msg=f"{result_output}\n{prompt}")
116
  st.session_state.messages.append({"role": "assistant", "content": result})
117
  st.chat_message("assistant").write(result)
118
 
constants.py CHANGED
@@ -1,11 +1,25 @@
 
 
 
 
 
 
 
1
  FALCON_REPO_ID = "tiiuae/falcon-7b-instruct"
2
  FALCON_TEMPERATURE = 0.1
3
  FALCON_MAX_TOKENS = 500
4
 
5
  OPENAI_MODEL_NAME = "gpt-3.5-turbo"
6
  OPENAI_TEMPERATURE = 0.8
 
7
 
8
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-distilroberta-v1"
 
 
 
 
 
 
9
 
10
  TEMPLATE_1 = "Create comma separated product keywords to perform a query on amazon dataset for this user input: {product_description}"
11
  TEMPLATE_2 = """You are a salesman.Present the given product results in a nice way as answer to the user_msg. Don't ask questions back,
 
1
+ import os
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
8
  FALCON_REPO_ID = "tiiuae/falcon-7b-instruct"
9
  FALCON_TEMPERATURE = 0.1
10
  FALCON_MAX_TOKENS = 500
11
 
12
  OPENAI_MODEL_NAME = "gpt-3.5-turbo"
13
  OPENAI_TEMPERATURE = 0.8
14
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
15
 
16
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-distilroberta-v1"
17
+ ITEM_KEYWORD_EMBEDDING = "item_vector"
18
+ TOPK = 5
19
+ NUMBER_PRODUCTS = 1000
20
+ MAX_TEXT_LENGTH = 512
21
+ TEXT_EMBEDDING_DIMENSION = 768
22
+ DATA_PATH = "product_data.csv"
23
 
24
  TEMPLATE_1 = "Create comma separated product keywords to perform a query on amazon dataset for this user input: {product_description}"
25
  TEMPLATE_2 = """You are a salesman.Present the given product results in a nice way as answer to the user_msg. Don't ask questions back,
preprocess.py CHANGED
@@ -3,42 +3,46 @@ import pandas as pd
3
  import redis
4
  from sentence_transformers import SentenceTransformer
5
 
 
 
 
 
 
 
6
  from database import create_redis
7
  from utils import create_flat_index, load_vectors
8
 
9
  pool = create_redis()
10
  redis_conn = redis.Redis(connection_pool=pool)
11
- # set maximum length for text fields
12
- MAX_TEXT_LENGTH = 512
13
- TEXT_EMBEDDING_DIMENSION = 768
14
- NUMBER_PRODUCTS = 10000
15
 
16
 
17
  def auto_truncate(text: str):
18
  return text[0:MAX_TEXT_LENGTH]
19
 
20
 
21
- data = pd.read_csv(
22
- "product_data.csv",
23
- converters={"bullet_point": auto_truncate, "item_keywords": auto_truncate, "item_name": auto_truncate},
24
- )
25
- data["primary_key"] = data["item_id"] + "-" + data["domain_name"]
26
- data.drop(columns=["item_id", "domain_name"], inplace=True)
27
- data["item_keywords"].replace("", np.nan, inplace=True)
28
- data.dropna(subset=["item_keywords"], inplace=True)
29
- data.reset_index(drop=True, inplace=True)
30
- data_metadata = data.head(10000).to_dict(orient="index")
31
-
32
- # generating embeddings (vectors) for the item keywords
33
- embedding_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
34
- # embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
35
-
36
- # get the item keywords attribute for each product and encode them into vector embeddings
37
- item_keywords = [data_metadata[i]["item_keywords"] for i in data_metadata.keys()]
38
- item_keywords_vectors = [embedding_model.encode(item) for item in item_keywords]
39
-
40
- # flush all data
41
- redis_conn.flushall()
42
- # create flat index & load vectors
43
- create_flat_index(redis_conn, NUMBER_PRODUCTS, TEXT_EMBEDDING_DIMENSION, "COSINE")
44
- load_vectors(redis_conn, data_metadata, item_keywords_vectors)
 
 
 
3
  import redis
4
  from sentence_transformers import SentenceTransformer
5
 
6
+ from constants import (
7
+ DATA_PATH,
8
+ MAX_TEXT_LENGTH,
9
+ NUMBER_PRODUCTS,
10
+ TEXT_EMBEDDING_DIMENSION,
11
+ )
12
  from database import create_redis
13
  from utils import create_flat_index, load_vectors
14
 
15
  pool = create_redis()
16
  redis_conn = redis.Redis(connection_pool=pool)
 
 
 
 
17
 
18
 
19
  def auto_truncate(text: str):
20
  return text[0:MAX_TEXT_LENGTH]
21
 
22
 
23
+ def data_preprocessing_and_loading():
24
+ data = pd.read_csv(
25
+ DATA_PATH,
26
+ converters={"bullet_point": auto_truncate, "item_keywords": auto_truncate, "item_name": auto_truncate},
27
+ )
28
+ data["primary_key"] = data["item_id"] + "-" + data["domain_name"]
29
+ data.drop(columns=["item_id", "domain_name"], inplace=True)
30
+ data["item_keywords"].replace("", np.nan, inplace=True)
31
+ data.dropna(subset=["item_keywords"], inplace=True)
32
+ data.reset_index(drop=True, inplace=True)
33
+ data_metadata = data.head(NUMBER_PRODUCTS).to_dict(orient="index")
34
+
35
+ # generate embeddings (vectors) for the item keywords
36
+ embedding_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
37
+ # get the item keywords attribute for each product and encode them into vector embeddings
38
+ item_keywords = [data_metadata[i]["item_keywords"] for i in data_metadata.keys()]
39
+ item_keywords_vectors = [embedding_model.encode(item) for item in item_keywords]
40
+ # flush all data
41
+ redis_conn.flushall()
42
+ # create flat index & load vectors
43
+ create_flat_index(redis_conn, NUMBER_PRODUCTS, TEXT_EMBEDDING_DIMENSION, "COSINE")
44
+ load_vectors(redis_conn, data_metadata, item_keywords_vectors)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ data_preprocessing_and_loading()