eagle0504 committed on
Commit
f967233
1 Parent(s): 94f2268

code stack updated

Browse files
Files changed (2) hide show
  1. app.py +42 -199
  2. utils/helper_functions.py +122 -0
app.py CHANGED
@@ -1,189 +1,53 @@
1
  import os
2
- from typing import List, Tuple, Dict, Union, Any
3
- import requests
4
 
 
5
  import numpy as np
6
  import openai
7
  import pandas as pd
 
8
  import streamlit as st
 
9
  from langchain.document_loaders import TextLoader
10
  from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.vectorstores import Chroma
13
  from scipy.spatial.distance import cosine
14
 
15
- openai.api_key = os.environ["OPENAI_API_KEY"]
16
-
17
-
18
- def call_chatgpt(prompt: str) -> str:
19
- """
20
- Uses the OpenAI API to generate an AI response to a prompt.
21
-
22
- Args:
23
- prompt: A string representing the prompt to send to the OpenAI API.
24
-
25
- Returns:
26
- A string representing the AI's generated response.
27
-
28
- """
29
-
30
- # Use the OpenAI API to generate a response based on the input prompt.
31
- response = openai.Completion.create(
32
- model="gpt-3.5-turbo-instruct",
33
- prompt=prompt,
34
- temperature=0.5,
35
- max_tokens=500,
36
- top_p=1,
37
- frequency_penalty=0,
38
- presence_penalty=0,
39
- )
40
-
41
- # Extract the text from the first (and only) choice in the response output.
42
- ans = response.choices[0]["text"]
43
-
44
- # Return the generated AI response.
45
- return ans
46
-
47
-
48
- # def ai_judge(prompt: str) -> float:
49
- # """
50
- # Uses the ChatGPT function to identify whether the content can answer the question
51
-
52
- # Args:
53
- # prompt: A string that represents the prompt
54
-
55
- # Returns:
56
- # float: A score
57
- # """
58
-
59
- # return call_chatgpt(prompt)
60
-
61
-
62
- def ai_judge(sentence1: str, sentence2: str) -> float:
63
-
64
- API_URL = "https://laazu6ral9w37pfb.us-east-1.aws.endpoints.huggingface.cloud"
65
- headers = {
66
- "Accept" : "application/json",
67
- "Content-Type": "application/json"
68
- }
69
-
70
- def helper(payload):
71
- response = requests.post(API_URL, headers=headers, json=payload)
72
- return response.json()
73
-
74
- data = helper({
75
- "source_sentence": sentence1,
76
- "sentences": [sentence2, sentence2],
77
- "parameters": {}
78
- })
79
-
80
- result = data['similarities'][0]
81
-
82
- return result
83
-
84
-
85
- def query(payload: Dict[str, Any]) -> Dict[str, Any]:
86
- """
87
- Sends a JSON payload to a predefined API URL and returns the JSON response.
88
- Args:
89
- payload (Dict[str, Any]): The JSON payload to be sent to the API.
90
- Returns:
91
- Dict[str, Any]: The JSON response received from the API.
92
- """
93
-
94
- # API endpoint URL
95
- API_URL = "https://sks7h7h5qkhoxwxo.us-east-1.aws.endpoints.huggingface.cloud"
96
-
97
- # Headers to indicate both the request and response formats are JSON
98
- headers = {
99
- "Accept": "application/json",
100
- "Content-Type": "application/json"
101
- }
102
-
103
- # Sending a POST request with the JSON payload and headers
104
- response = requests.post(API_URL, headers=headers, json=payload)
105
-
106
- # Returning the JSON response
107
- return response.json()
108
-
109
-
110
- def llama2_7b_ysa(prompt: str) -> str:
111
- """
112
- Queries a model and retrieves the generated text based on the given prompt.
113
- This function sends a prompt to a model (presumably named 'llama2_7b') and extracts
114
- the generated text from the model's response. It's tailored for handling responses
115
- from a specific API or model query structure where the response is expected to be
116
- a list of dictionaries, with at least one dictionary containing a key 'generated_text'.
117
- Parameters:
118
- - prompt (str): The text prompt to send to the model.
119
- Returns:
120
- - str: The generated text response from the model.
121
- Note:
122
- - The function assumes that the 'query' function is previously defined and accessible
123
- within the same scope or module. It should send a request to the model and return
124
- the response in a structured format.
125
- - The 'parameters' dictionary is passed empty but can be customized to include specific
126
- request parameters as needed by the model API.
127
- """
128
-
129
- # Define the query payload with the prompt and any additional parameters
130
- query_payload: Dict[str, Any] = {
131
- "inputs": prompt,
132
- "parameters": {}
133
- }
134
-
135
- # Send the query to the model and store the output response
136
- output = query(query_payload)
137
-
138
- # Extract the 'generated_text' from the first item in the response list
139
- response: str = output[0]['generated_text']
140
-
141
- return response
142
-
143
-
144
- ## rag strategy 1
145
- # file_names = [f"output_files/file_{i}.txt" for i in range(131)]
146
- # # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
147
 
 
148
 
149
- # # Initialize an empty list to hold all documents
150
- # all_documents = [] # this is just a copy, you don't have to use this
151
 
152
- # # Iterate over each file and load its contents
153
- # for file_name in file_names:
154
- # loader = TextLoader(file_name)
155
- # documents = loader.load()
156
- # all_documents.extend(documents)
157
 
158
- # # Split the loaded documents into chunks
159
- # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
160
- # docs = text_splitter.split_documents(all_documents)
161
 
162
- # # Create the open-source embedding function
163
- # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
164
- # # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
165
- # # embedding_function = openai_text_embedding
166
 
167
- # # Load the documents into Chroma
168
- # db = Chroma.from_documents(docs, embedding_function)
 
 
169
 
170
- ## rag strategy 2
171
- from datasets import load_dataset
172
- import chromadb
173
- import string
174
 
175
- dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
176
- client = chromadb.Client()
177
- random_number = np.random.randint(low=1e9, high=1e10)
178
- random_string = ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), size=10))
179
- combined_string = f"{random_number}{random_string}"
180
  collection = client.create_collection(combined_string)
181
 
 
182
  # Embed and store the first N supports for this demo
183
- L = len(dataset["train"]['questions'])
184
  collection.add(
185
  ids=[str(i) for i in range(0, L)], # IDs are just strings
186
- documents=dataset["train"]['questions'], # Enter questions here
187
  metadatas=[{"type": "support"} for _ in range(0, L)],
188
  )
189
 
@@ -205,9 +69,14 @@ st.sidebar.markdown(
205
 
206
  This app guides you through YSA's website, utilizing a RAG-ready Q&A dataset [here](https://huggingface.co/datasets/eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted) for chatbot assistance. 🤖 Enter a question, and it finds similar ones in the database, offering answers with a distance score to gauge relevance—the lower the score, the closer the match. 🎯 For better accuracy and to reduce errors, user feedback helps refine the database. ✨
207
 
208
- """)
209
- st.sidebar.success("Please enter a distance threshold (we advise to set it around 0.2).")
210
- special_threshold = st.sidebar.number_input("Insert a number", value=0.2, placeholder="Type a number...") # 0.3
 
 
 
 
 
211
  clear_button = st.sidebar.button("Clear Conversation", key="clear")
212
 
213
  if clear_button:
@@ -221,44 +90,29 @@ if prompt := st.chat_input("Tell me about YSA"):
221
  st.session_state.messages.append({"role": "user", "content": prompt})
222
 
223
  question = prompt
224
-
225
  with st.spinner("Wait for it..."):
226
- # strategy 1
227
- # docs = db.similarity_search(question)
228
- # docs_2 = db.similarity_search_with_score(question)
229
- # docs_2_table = pd.DataFrame(
230
- # {
231
- # "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
232
- # "content": [docs_2[i][0].page_content for i in range(len(docs))],
233
- # "distances": [docs_2[i][1] for i in range(len(docs))],
234
- # }
235
- # )
236
- # ref_from_db_search = docs_2_table["content"]
237
-
238
- # strategy 2
239
- results = collection.query(
240
- query_texts=question,
241
- n_results=5
242
- )
243
  idx = results["ids"][0]
244
  idx = [int(i) for i in idx]
245
  ref = pd.DataFrame(
246
  {
247
  "idx": idx,
248
- "questions": [dataset["train"]['questions'][i] for i in idx],
249
- "answers": [dataset["train"]['answers'][i] for i in idx],
250
- "distances": results["distances"][0]
251
  }
252
  )
253
  # special_threshold = st.sidebar.slider('How old are you?', 0, 0.6, 0.1) # 0.3
254
  filtered_ref = ref[ref["distances"] < special_threshold]
255
  if filtered_ref.shape[0] > 0:
256
  st.success("There are highly relevant information in our database.")
257
- ref_from_db_search = filtered_ref["answers"].str.cat(sep=' ')
258
  final_ref = filtered_ref
259
  else:
260
- st.warning("The database may not have relevant information to help your question so please be aware of hallucinations.")
261
- ref_from_db_search = ref["answers"].str.cat(sep=' ')
 
 
262
  final_ref = ref
263
 
264
  try:
@@ -275,17 +129,6 @@ if prompt := st.chat_input("Tell me about YSA"):
275
  for i in range(final_ref.shape[0]):
276
  this_quest = question
277
  this_content = final_ref["answers"][i]
278
- # prompt_for_ai_judge = f"""
279
- # The user asked a question: {question}
280
-
281
- # We have found this content: {this_content}
282
-
283
- # From 0 to 10, rate how well the content answer the user's question.
284
-
285
- # Only produce a number from 0 to 10 while 10 being the best at answer user's question.
286
-
287
- # If the content is a list of questions or not related to the user's question or it says inference endpoint is down, then you should say 0, because it does not answer user's question.
288
- # """
289
  this_score = ai_judge(question, this_content)
290
  independent_ai_judge_score.append(this_score)
291
 
 
1
  import os
2
+ import string
3
+ from typing import Any, Dict, List, Tuple, Union
4
 
5
+ import chromadb
6
  import numpy as np
7
  import openai
8
  import pandas as pd
9
+ import requests
10
  import streamlit as st
11
+ from datasets import load_dataset
12
  from langchain.document_loaders import TextLoader
13
  from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
14
  from langchain.text_splitter import CharacterTextSplitter
15
  from langchain.vectorstores import Chroma
16
  from scipy.spatial.distance import cosine
17
 
18
+ from utils.helper_functions import *
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ openai.api_key = os.environ["OPENAI_API_KEY"]
21
 
 
 
22
 
23
+ # Load the dataset from a provided source.
24
+ dataset = load_dataset(
25
+ "eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted"
26
+ )
 
27
 
28
+ # Initialize a new client for ChromaDB.
29
+ client = chromadb.Client()
 
30
 
31
+ # Generate a random number between 1 billion and 10 billion.
32
+ random_number: int = np.random.randint(low=1e9, high=1e10)
 
 
33
 
34
+ # Generate a random string consisting of 10 uppercase letters and digits.
35
+ random_string: str = "".join(
36
+ np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
37
+ )
38
 
39
+ # Combine the random number and random string into one identifier.
40
+ combined_string: str = f"{random_number}{random_string}"
 
 
41
 
42
+ # Create a new collection in ChromaDB with the combined string as its name.
 
 
 
 
43
  collection = client.create_collection(combined_string)
44
 
45
+
46
  # Embed and store the first N supports for this demo
47
+ L = len(dataset["train"]["questions"])
48
  collection.add(
49
  ids=[str(i) for i in range(0, L)], # IDs are just strings
50
+ documents=dataset["train"]["questions"], # Enter questions here
51
  metadatas=[{"type": "support"} for _ in range(0, L)],
52
  )
53
 
 
69
 
70
  This app guides you through YSA's website, utilizing a RAG-ready Q&A dataset [here](https://huggingface.co/datasets/eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted) for chatbot assistance. 🤖 Enter a question, and it finds similar ones in the database, offering answers with a distance score to gauge relevance—the lower the score, the closer the match. 🎯 For better accuracy and to reduce errors, user feedback helps refine the database. ✨
71
 
72
+ """
73
+ )
74
+ st.sidebar.success(
75
+ "Please enter a distance threshold (we advise to set it around 0.2)."
76
+ )
77
+ special_threshold = st.sidebar.number_input(
78
+ "Insert a number", value=0.2, placeholder="Type a number..."
79
+ ) # 0.3
80
  clear_button = st.sidebar.button("Clear Conversation", key="clear")
81
 
82
  if clear_button:
 
90
  st.session_state.messages.append({"role": "user", "content": prompt})
91
 
92
  question = prompt
 
93
  with st.spinner("Wait for it..."):
94
+ results = collection.query(query_texts=question, n_results=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  idx = results["ids"][0]
96
  idx = [int(i) for i in idx]
97
  ref = pd.DataFrame(
98
  {
99
  "idx": idx,
100
+ "questions": [dataset["train"]["questions"][i] for i in idx],
101
+ "answers": [dataset["train"]["answers"][i] for i in idx],
102
+ "distances": results["distances"][0],
103
  }
104
  )
105
  # special_threshold = st.sidebar.slider('How old are you?', 0, 0.6, 0.1) # 0.3
106
  filtered_ref = ref[ref["distances"] < special_threshold]
107
  if filtered_ref.shape[0] > 0:
108
  st.success("There are highly relevant information in our database.")
109
+ ref_from_db_search = filtered_ref["answers"].str.cat(sep=" ")
110
  final_ref = filtered_ref
111
  else:
112
+ st.warning(
113
+ "The database may not have relevant information to help your question so please be aware of hallucinations."
114
+ )
115
+ ref_from_db_search = ref["answers"].str.cat(sep=" ")
116
  final_ref = ref
117
 
118
  try:
 
129
  for i in range(final_ref.shape[0]):
130
  this_quest = question
131
  this_content = final_ref["answers"][i]
 
 
 
 
 
 
 
 
 
 
 
132
  this_score = ai_judge(question, this_content)
133
  independent_ai_judge_score.append(this_score)
134
 
utils/helper_functions.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import string
3
+ from typing import Any, Dict, List, Tuple, Union
4
+
5
+ import chromadb
6
+ import numpy as np
7
+ import openai
8
+ import pandas as pd
9
+ import requests
10
+ import streamlit as st
11
+ from datasets import load_dataset
12
+ from langchain.document_loaders import TextLoader
13
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
14
+ from langchain.text_splitter import CharacterTextSplitter
15
+ from langchain.vectorstores import Chroma
16
+ from scipy.spatial.distance import cosine
17
+
18
+ openai.api_key = os.environ["OPENAI_API_KEY"]
19
+
20
+
21
def call_chatgpt(prompt: str) -> str:
    """
    Ask the OpenAI completions endpoint to continue ``prompt``.

    Args:
        prompt: Text forwarded verbatim as the completion prompt.

    Returns:
        The ``"text"`` field of the first choice in the API response.

    """
    # The sampling settings are fixed for every request made by this helper.
    request_options = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "temperature": 0.5,
        "max_tokens": 500,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**request_options)

    # Only the first choice of the response is read.
    first_choice = completion.choices[0]
    return first_choice["text"]
49
+
50
+
51
def ai_judge(sentence1: str, sentence2: str) -> Any:
    """
    Send two sentences to the hosted sentence-similarity endpoint.

    Args:
        sentence1: Sent as the request's ``source_sentence``.
        sentence2: Sent as both entries of the request's ``sentences`` list.

    Returns:
        The decoded JSON body of the endpoint's response, unmodified. The
        value is NOT reduced to a single float by this function.
        NOTE(review): the response schema is not visible here; it presumably
        carries a ``similarities`` entry -- confirm against the endpoint
        before indexing into the result.
    """
    API_URL = "https://laazu6ral9w37pfb.us-east-1.aws.endpoints.huggingface.cloud"
    headers = {"Accept": "application/json", "Content-Type": "application/json"}

    payload = {
        "source_sentence": sentence1,
        # The candidate sentence is deliberately listed twice, as before.
        "sentences": [sentence2, sentence2],
        "parameters": {},
    }
    response = requests.post(API_URL, headers=headers, json=payload)

    # Hand the parsed JSON back as-is; callers decide how to interpret it.
    return response.json()
70
+
71
+
72
def query(
    payload: Dict[str, Any],
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Sends a JSON payload to a predefined API URL and returns the JSON response.
    Args:
        payload (Dict[str, Any]): The JSON payload to be sent to the API.
    Returns:
        Union[Dict[str, Any], List[Dict[str, Any]]]: The decoded JSON body of
        the response, exactly as the API returned it. ``llama2_7b_ysa`` in
        this module indexes the result as a list of dictionaries, so the
        return value is not necessarily a dictionary.
    """

    # API endpoint URL
    API_URL = "https://sks7h7h5qkhoxwxo.us-east-1.aws.endpoints.huggingface.cloud"

    # Headers to indicate both the request and response formats are JSON
    headers = {"Accept": "application/json", "Content-Type": "application/json"}

    # Sending a POST request with the JSON payload and headers
    response = requests.post(API_URL, headers=headers, json=payload)

    # Returning the decoded JSON response without reshaping it
    return response.json()
92
+
93
+
94
def llama2_7b_ysa(prompt: str) -> str:
    """
    Generate text for ``prompt`` through the endpoint wrapped by ``query``.

    Parameters:
    - prompt (str): The text sent as the request's ``inputs`` field.
    Returns:
    - str: The ``generated_text`` value of the first item in the response.
    Note:
    - The request's ``parameters`` field is always sent as an empty dict.
    - The response from ``query`` is indexed as a sequence whose first item
      is a mapping containing the key ``generated_text``.
    """
    # Build the request inline and send it in one step.
    model_output = query({"inputs": prompt, "parameters": {}})

    # Only the first item of the response is used.
    first_item = model_output[0]
    generated: str = first_item["generated_text"]
    return generated