prasadnu committed on
Commit d0de3b9 · 1 Parent(s): 238ba3e
Files changed (3):
  1. .gitignore +1 -0
  2. requirements.txt +0 -1
  3. semantic_search/llm_eval.py +24 -52
.gitignore CHANGED
@@ -10,3 +10,4 @@ split_pdf_csv/
 uploaded_images/
 images/
 gen_images/
+app.zip
requirements.txt CHANGED
@@ -22,4 +22,3 @@ matplotlib
 scipy
 seaborn
 Pillow
-nltk
semantic_search/llm_eval.py CHANGED
@@ -10,41 +10,38 @@ import logging
 import requests
 import numpy as np
 import pandas as pd
-from PIL import Image
 from typing import List
 from botocore.auth import SigV4Auth
-from langchain.llms.bedrock import Bedrock
+#from langchain.llms.bedrock import Bedrock
 from botocore.awsrequest import AWSRequest
 import streamlit as st
 import re
-import numpy as np
 from sklearn.metrics import ndcg_score,dcg_score
 from sklearn import preprocessing as pre
-import invoke_models
+import invoke_models#invoke_llm_model
 
-bedrock_ = boto3.client(
-    'bedrock-runtime',
-    aws_access_key_id=st.secrets['user_access_key'],
-    aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
-)
+# bedrock_ = boto3.client(
+#     'bedrock-runtime',
+#     aws_access_key_id=st.secrets['user_access_key'],
+#     aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
+# )
 
-inference_modifier = {
-    "max_tokens_to_sample": 4096,
-    "temperature": 0,
-    "top_k": 250,
-    "top_p": 1,
-    "stop_sequences": ["\n\nHuman"],
-}
-textgen_llm = Bedrock(
-    model_id="anthropic.claude-v2:1",
-    client=bedrock_,
-    model_kwargs=inference_modifier,
-)
+# inference_modifier = {
+#     "max_tokens_to_sample": 4096,
+#     "temperature": 0,
+#     "top_k": 250,
+#     "top_p": 1,
+#     "stop_sequences": ["\n\nHuman"],
+# }
+# textgen_llm = Bedrock(
+#     model_id="anthropic.claude-v2:1",
+#     client=bedrock_,
+#     model_kwargs=inference_modifier,
+# )
 
 
 #@st.cache_data
 def eval(question, answers):
-    #if()
     search_results: str = ""
     prompt: str = """Human: You are a grader assessing relevance of a retrieved document to a user question. \n
     The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
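The module-level LangChain `Bedrock` client is retired here in favor of a shared `invoke_models` helper whose implementation is not part of this diff. For orientation, a minimal sketch of what such a helper might wrap, assuming it reuses the `bedrock-runtime` client and the Claude v2:1 inference parameters commented out above; the body below is hypothetical, not the repo's actual `invoke_models`:

```python
# Hypothetical sketch only: mirrors the parameters this commit comments out.
import json
import boto3

# Credentials resolve from the environment here; the repo passes
# st.secrets['user_access_key'] / st.secrets['user_secret_key'] explicitly.
bedrock_ = boto3.client('bedrock-runtime', region_name='us-east-1')

def invoke_llm_model(prompt, is_stream=False):
    # Claude v2 on Bedrock expects a Human/Assistant-formatted prompt and an
    # anthropic-style request body; eval()'s prompt already opens with "Human:".
    body = json.dumps({
        "prompt": prompt + "\n\nAssistant:",
        "max_tokens_to_sample": 4096,
        "temperature": 0,
        "top_k": 250,
        "top_p": 1,
        "stop_sequences": ["\n\nHuman"],
    })
    response = bedrock_.invoke_model(
        body=body,
        modelId="anthropic.claude-v2:1",
        accept="application/json",
        contentType="application/json",
    )
    return json.loads(response["body"].read())["completion"]
```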
@@ -73,21 +70,11 @@ def eval(question, answers):
         search_results += f"Index: {index_}, Description: {desc}\n\n"
         index_ = index_+1
     prompt = prompt.format(query, search_results)
-    # print(answers[0]['answer'])
-    # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
-    # print(prompt)
-
-    response = textgen_llm(prompt)
-    #invoke_models.invoke_llm_model(prompt,False)
-    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
-    print(response)
+    response = invoke_llm_model.invoke_llm_model(prompt,False)
+    #response = textgen_llm(prompt)
+    print("Response from LLM: ", response)
     inter_trim =response.split("[")[1]
     final_out = json.loads('{"results":['+inter_trim.split("]")[0]+']}')
-    #final_out_sorted_desc = sorted(final_out['results'], key=lambda d: d['Score'],reverse=True)
-    # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
-    # print(final_out_sorted_desc)
-
-    #true_relevance = np.asarray([[10, 0, 0, 1, 5]])
     llm_scores = []
     current_scores = []
     for idx,i in enumerate(answers[0]['answer']):
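Two details in this hunk are worth flagging. First, the new call site reads `invoke_llm_model.invoke_llm_model(prompt,False)` although the module is imported as `invoke_models`, so as committed the line raises a `NameError`; the import comment `import invoke_models#invoke_llm_model` suggests `invoke_models.invoke_llm_model(prompt, False)` was intended. Second, the `split("[")` / `split("]")` parsing raises an `IndexError` whenever the completion contains no bracketed array. A slightly more defensive extraction, as a hedged sketch (helper name is illustrative):

```python
import json
import re

def extract_results(response: str) -> list:
    # Same intent as response.split("[")[1] / .split("]")[0] above, but
    # fails with a clear message when the model returns no JSON array.
    match = re.search(r"\[.*?\]", response, re.DOTALL)
    if match is None:
        raise ValueError("no JSON array found in LLM response")
    return json.loads('{"results":' + match.group(0) + '}')["results"]
```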
@@ -105,7 +92,6 @@ def eval(question, answers):
 
 
 
-    # llm_scores.sort(reverse = True)
     x = np.array(llm_scores)
     x = x.reshape(-1, 1)
     x_norm = (pre.MinMaxScaler().fit_transform(x)).flatten().tolist()
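For context on this step: `MinMaxScaler` rescales the column of LLM grades into [0, 1] relative to the minimum and maximum of the current result set. A quick illustration with made-up grades:

```python
import numpy as np
from sklearn import preprocessing as pre

llm_scores = [3, 1, 2, 3, 0]             # illustrative grades, not repo data
x = np.array(llm_scores).reshape(-1, 1)  # the scaler expects a 2-D column
x_norm = pre.MinMaxScaler().fit_transform(x).flatten().tolist()
print(x_norm)  # [1.0, 0.333..., 0.666..., 1.0, 0.0]
```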
@@ -116,24 +102,13 @@ def eval(question, answers):
 
 
     st.session_state.answers = answers
-
-    # print(x_norm)
-    # print(y_norm)
-
     dcg = dcg_score(np.asarray([llm_scores]),np.asarray([current_scores]))
-    # print("DCG score : ", dcg)
-
+
     # IDCG score
     idcg = dcg_score(np.asarray([llm_scores]),np.asarray([llm_scores]))
-    # print("IDCG score : ", idcg)
 
     # Normalized DCG score
     ndcg = dcg
-
-    # print(st.session_state.input_ndcg)
-    # if(st.session_state.input_previous_query!=""):
-    #     if(st.session_state.input_previous_query == st.session_state.input_text):
-    #         st.session_state.input_ndcg=0.0
     if(ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg!=0.0):
         st.session_state.ndcg_increase = "↑~"+str('%.3f'%(ndcg-st.session_state.input_ndcg ))
     elif(ndcg < st.session_state.input_ndcg):
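Note that the hunk computes `idcg` and then assigns `ndcg = dcg`, so the value compared against `st.session_state.input_ndcg` is the raw DCG and `idcg` is unused. Conventionally NDCG = DCG / IDCG, which `sklearn.metrics.ndcg_score` also computes in one call; a sketch with illustrative scores:

```python
import numpy as np
from sklearn.metrics import dcg_score, ndcg_score

llm_scores = [[3, 2, 3, 0, 1]]      # LLM judge grades per returned document
current_scores = [[5, 4, 3, 2, 1]]  # engine ranking scores, in result order

dcg = dcg_score(np.asarray(llm_scores), np.asarray(current_scores))
idcg = dcg_score(np.asarray(llm_scores), np.asarray(llm_scores))
ndcg = dcg / idcg  # 1.0 would mean the engine's order matches the judge's

# ndcg_score computes the same ratio directly:
assert abs(ndcg - ndcg_score(np.asarray(llm_scores), np.asarray(current_scores))) < 1e-9
```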
@@ -143,7 +118,4 @@ def eval(question, answers):
 
 
 
-    st.session_state.input_ndcg = ndcg#round(ndcg_score(np.asarray([x_norm]), np.asarray([y_norm]), k=st.session_state.input_K),2)
-    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
-    print(st.session_state.input_ndcg)
-
+    st.session_state.input_ndcg = ndcg
 