Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- __pycache__/LLM_usage.cpython-310.pyc +0 -0
- __pycache__/prompt.cpython-310.pyc +0 -0
- __pycache__/retriever.cpython-310.pyc +0 -0
- __pycache__/tokenizing.cpython-310.pyc +0 -0
- app.py +11 -7
- requirements.txt +3 -3
- retriever.py +3 -4
- test_docs.py +9 -7
__pycache__/LLM_usage.cpython-310.pyc
ADDED
Binary file (1.78 kB). View file
|
|
__pycache__/prompt.cpython-310.pyc
ADDED
Binary file (653 Bytes). View file
|
|
__pycache__/retriever.cpython-310.pyc
ADDED
Binary file (4.88 kB). View file
|
|
__pycache__/tokenizing.cpython-310.pyc
ADDED
Binary file (1.28 kB). View file
|
|
app.py
CHANGED
@@ -19,7 +19,9 @@ def initialize_bot(api_key):
|
|
19 |
# Set the API key
|
20 |
os.environ['GROQ_API_KEY'] = api_key
|
21 |
|
22 |
-
pathes = ["./Data/hate_speech_processed.json", "./Data/reddit_jokes2_processed.json",
|
|
|
|
|
23 |
|
24 |
# Load documents (done once)
|
25 |
if not docs: # Only load if docs are not already loaded
|
@@ -70,7 +72,11 @@ setup_demo = gr.Interface(
|
|
70 |
inputs=[gr.Textbox(label="Enter your GROQ API Key")],
|
71 |
outputs=[gr.Textbox(label="Setup Status")],
|
72 |
title="Setup Joke Generator",
|
73 |
-
description="Initialize the Joke Generator Bot by providing the GROQ API key.
|
|
|
|
|
|
|
|
|
74 |
)
|
75 |
|
76 |
regime_options = ["BM25 Only", "Semantic Only", "Scores Combination"]
|
@@ -88,9 +94,8 @@ joke_demo = gr.Interface(
|
|
88 |
title="Joke Generator",
|
89 |
description="Generate jokes based on your input message(Only in English :( )). Select a retrieval regime and view the context used.\
|
90 |
Be careful, the jokes can be offensive! Try to write a message that is related to the joke you want to hear.\
|
91 |
-
(tell me a joke and its title about... or tell me a
|
92 |
-
In this case, try to rewrite a message and send again
|
93 |
-
the link again, after reinitialize joke generator with API KEY.\
|
94 |
Or try to change the regime or BM25 Coefficient.\
|
95 |
BM25 Coefficient is used to balance the BM25 and semantic scores(It is active only in Scores Combination mode). Semantic scores are multiplied by (1 - BM25 Coefficient).\
|
96 |
If you want to use only BM25 or semantic scores, select the corresponding regime or set it to 1.0 or 0.0 respectively.",
|
@@ -105,6 +110,5 @@ demo = gr.TabbedInterface(
|
|
105 |
)
|
106 |
|
107 |
# Launch the interface
|
108 |
-
# demo.launch()
|
109 |
-
# demo.launch(share=True)
|
110 |
demo.launch()
|
|
|
|
19 |
# Set the API key
|
20 |
os.environ['GROQ_API_KEY'] = api_key
|
21 |
|
22 |
+
pathes = ["./Data/hate_speech_processed.json", "./Data/reddit_jokes2_processed.json",
|
23 |
+
"./Data/stupidstuff_processed.json", "./Data/wocka_processed.json",
|
24 |
+
"./Data/reddit_jokes1_processed.json"]
|
25 |
|
26 |
# Load documents (done once)
|
27 |
if not docs: # Only load if docs are not already loaded
|
|
|
72 |
inputs=[gr.Textbox(label="Enter your GROQ API Key")],
|
73 |
outputs=[gr.Textbox(label="Setup Status")],
|
74 |
title="Setup Joke Generator",
|
75 |
+
description="Initialize the Joke Generator Bot by providing the GROQ API key. \
|
76 |
+
(If there is a connection error(on this or next tab) reload the page, wait 5-10 minutes, \
|
77 |
+
reload the page again and reinitialize the joke generator with the API KEY)\
|
78 |
+
If you see some runtime error like memory limit exceeded, tell me on mail: vasyarusynb@gmail.com(I can see your email not so fast)\
|
79 |
+
or tg: @Beav3rrr and I will redeploy or turn on new instance",
|
80 |
)
|
81 |
|
82 |
regime_options = ["BM25 Only", "Semantic Only", "Scores Combination"]
|
|
|
94 |
title="Joke Generator",
|
95 |
description="Generate jokes based on your input message(Only in English :( )). Select a retrieval regime and view the context used.\
|
96 |
Be careful, the jokes can be offensive! Try to write a message that is related to the joke you want to hear.\
|
97 |
+
(tell me a joke and its title about... or tell me a joke and its title about... it should be a oneliner, dark, pervy, etc.). Sometimes bot works bad :(\
|
98 |
+
In this case, try to rewrite a message and send again.\
|
|
|
99 |
Or try to change the regime or BM25 Coefficient.\
|
100 |
BM25 Coefficient is used to balance the BM25 and semantic scores(It is active only in Scores Combination mode). Semantic scores are multiplied by (1 - BM25 Coefficient).\
|
101 |
If you want to use only BM25 or semantic scores, select the corresponding regime or set it to 1.0 or 0.0 respectively.",
|
|
|
110 |
)
|
111 |
|
112 |
# Launch the interface
|
|
|
|
|
113 |
demo.launch()
|
114 |
+
# demo.launch(share=True)
|
requirements.txt
CHANGED
@@ -18,7 +18,7 @@ distro==1.9.0
|
|
18 |
docopt==0.6.2
|
19 |
exceptiongroup==1.2.2
|
20 |
executing==2.1.0
|
21 |
-
fastapi==0.115.5
|
22 |
ffmpy==0.4.0
|
23 |
filelock==3.16.1
|
24 |
frozenlist==1.5.0
|
@@ -108,7 +108,7 @@ sentence-transformers==3.3.1
|
|
108 |
shellingham==1.5.4
|
109 |
six==1.16.0
|
110 |
sniffio==1.3.1
|
111 |
-
stack-data==0.6.
|
112 |
starlette==0.41.3
|
113 |
sympy==1.13.3
|
114 |
threadpoolctl==3.5.0
|
@@ -130,5 +130,5 @@ uvicorn==0.32.0
|
|
130 |
watchdog==5.0.3
|
131 |
wcwidth==0.2.13
|
132 |
websockets==12.0
|
133 |
-
yarl==1.17.2
|
134 |
zipp==3.21.0
|
|
|
18 |
docopt==0.6.2
|
19 |
exceptiongroup==1.2.2
|
20 |
executing==2.1.0
|
21 |
+
fastapi==0.115.5
|
22 |
ffmpy==0.4.0
|
23 |
filelock==3.16.1
|
24 |
frozenlist==1.5.0
|
|
|
108 |
shellingham==1.5.4
|
109 |
six==1.16.0
|
110 |
sniffio==1.3.1
|
111 |
+
stack-data==0.6.3
|
112 |
starlette==0.41.3
|
113 |
sympy==1.13.3
|
114 |
threadpoolctl==3.5.0
|
|
|
130 |
watchdog==5.0.3
|
131 |
wcwidth==0.2.13
|
132 |
websockets==12.0
|
133 |
+
yarl==1.17.2
|
134 |
zipp==3.21.0
|
retriever.py
CHANGED
@@ -92,6 +92,9 @@ class Retriever:
|
|
92 |
# In case of BM25 only, return the top n documents based on BM25 scores, if somebody sets a couple
|
93 |
# of flags to True, the func will return the top n documents based on the first flag set to True
|
94 |
|
|
|
|
|
|
|
95 |
if bm25_only:
|
96 |
semantic_only = False
|
97 |
scores_combination = False
|
@@ -112,10 +115,6 @@ class Retriever:
|
|
112 |
# Sort the documents by their BM25 scores in descending order
|
113 |
sorted_doc_indices = np.argsort(scores)
|
114 |
|
115 |
-
print("Score:", scores[sorted_doc_indices[-1]] )
|
116 |
-
print(self.docs[sorted_doc_indices[-1]])
|
117 |
-
print("Doc number:", sorted_doc_indices[-1])
|
118 |
-
|
119 |
result_docs = [self.docs[i] for i in sorted_doc_indices[-n:] if scores[i] > 0]
|
120 |
|
121 |
return result_docs[::-1] # Return the top n documents in descending order which means the most relevant documents are first
|
|
|
92 |
# In case of BM25 only, return the top n documents based on BM25 scores, if somebody sets a couple
|
93 |
# of flags to True, the func will return the top n documents based on the first flag set to True
|
94 |
|
95 |
+
# remove "tell me a joke about" or "tell me a joke and its title about" from the user message
|
96 |
+
user_message = user_message.replace("tell me a joke about", "").replace("tell me a joke and its title about", "")
|
97 |
+
|
98 |
if bm25_only:
|
99 |
semantic_only = False
|
100 |
scores_combination = False
|
|
|
115 |
# Sort the documents by their BM25 scores in descending order
|
116 |
sorted_doc_indices = np.argsort(scores)
|
117 |
|
|
|
|
|
|
|
|
|
118 |
result_docs = [self.docs[i] for i in sorted_doc_indices[-n:] if scores[i] > 0]
|
119 |
|
120 |
return result_docs[::-1] # Return the top n documents in descending order which means the most relevant documents are first
|
test_docs.py
CHANGED
@@ -15,17 +15,18 @@ tokenized_docs_path = os.path.join(base_path, "tokenized_docs.pkl")
|
|
15 |
|
16 |
# Take all json files with names that end '_processed'
|
17 |
for path in glob.glob(f"{base_path}/*_processed.json"):
|
|
|
18 |
with open(path, 'r') as f:
|
19 |
docs.extend(json.load(f))
|
20 |
|
21 |
index = 0
|
22 |
|
23 |
-
for i, doc in enumerate(docs):
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
|
30 |
with open(bm25_path, 'rb') as f:
|
31 |
bm25 = pickle.load(f)
|
@@ -39,7 +40,7 @@ with open(bm25_path, 'rb') as f:
|
|
39 |
# with open(bm25_path, 'wb') as f:
|
40 |
# pickle.dump(bm25, f)
|
41 |
|
42 |
-
message = "tell me a joke about
|
43 |
tokenized_message = tokenize_text(message)
|
44 |
print(tokenized_message)
|
45 |
scores = torch.tensor(bm25.get_scores(tokenized_message))
|
@@ -48,6 +49,7 @@ sorted_doc_indices = np.argsort(scores)
|
|
48 |
for i in range(1, 2):
|
49 |
print("Score:", scores[sorted_doc_indices[-i]] )
|
50 |
print(docs[sorted_doc_indices[-i]])
|
|
|
51 |
|
52 |
# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]
|
53 |
|
|
|
15 |
|
16 |
# Take all json files with names that end '_processed'
|
17 |
for path in glob.glob(f"{base_path}/*_processed.json"):
|
18 |
+
print(path)
|
19 |
with open(path, 'r') as f:
|
20 |
docs.extend(json.load(f))
|
21 |
|
22 |
index = 0
|
23 |
|
24 |
+
# for i, doc in enumerate(docs):
|
25 |
+
# if 'body' in doc:
|
26 |
+
# if doc['body'] == "I don't fuck the sandwich before eating it":
|
27 |
+
# tokenized_doc = tokenize_doc(doc)
|
28 |
+
# print(tokenized_doc)
|
29 |
+
# index = i
|
30 |
|
31 |
with open(bm25_path, 'rb') as f:
|
32 |
bm25 = pickle.load(f)
|
|
|
40 |
# with open(bm25_path, 'wb') as f:
|
41 |
# pickle.dump(bm25, f)
|
42 |
|
43 |
+
message = "tell me a joke about sandwich before eating it"
|
44 |
tokenized_message = tokenize_text(message)
|
45 |
print(tokenized_message)
|
46 |
scores = torch.tensor(bm25.get_scores(tokenized_message))
|
|
|
49 |
for i in range(1, 2):
|
50 |
print("Score:", scores[sorted_doc_indices[-i]] )
|
51 |
print(docs[sorted_doc_indices[-i]])
|
52 |
+
print("Doc number:", sorted_doc_indices[-i])
|
53 |
|
54 |
# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]
|
55 |
|