sam2ai committed
Commit: 11fa0f1
Parent(s): 52e6714

Synced repo using 'sync_with_huggingface' Github Action

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ olive_farm.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.8-slim-buster
+
+ WORKDIR /app
+
+ COPY ./requirements.txt /app/requirements.txt
+ # COPY ./packages.txt /app/packages.txt
+
+ # RUN apt-get update && xargs -r -a /app/packages.txt apt-get install -y && rm -rf /var/lib/apt/lists/*
+ RUN pip3 install --no-cache-dir -r /app/requirements.txt
+
+ # Run as a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME /home/user
+ ENV PATH $HOME/.local/bin:$PATH
+
+ WORKDIR $HOME
+ RUN mkdir app
+ WORKDIR $HOME/app
+ COPY . $HOME/app
+
+ EXPOSE 8501
+ CMD streamlit run app.py \
+     --server.headless true \
+     --server.enableCORS false \
+     --server.enableXsrfProtection false \
+     --server.fileWatcherType none
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
__pycache__/custom_prompt_template.cpython-311.pyc ADDED
Binary file (2.41 kB).
 
__pycache__/custom_prompt_template.cpython-39.pyc ADDED
Binary file (1.33 kB).
 
app.py ADDED
@@ -0,0 +1,451 @@
+ import streamlit as st
+ import requests
+ import justext
+ import pdfplumber
+ import docx2txt
+ import json
+ import ast
+ import os
+ import re
+ import openai
+
+ from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate
+
+
+ st.set_page_config(page_title="LLM instruction Generator")
+
+ st.sidebar.success("Select a page above")
+
+
+ # Odia stoplist for justext, which does not ship one for Odia
+ def odia_stoplist():
+     odia_stopwords = [
+         "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
+         "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
+         "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
+         "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
+         "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
+         "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
+         "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
+         "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
+         "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
+         "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
+         "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
+         "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
+         "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
+         "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
+     ]
+     return frozenset(odia_stopwords)
+
+
+ # Extract the main text content from a URL using justext
+ def extract_data_from_url(url, language):
+     try:
+         response = requests.get(url)
+
+         if response.status_code == 200:
+             response.raise_for_status()
+             page = response.content
+             para = ""
+             if language == "English":
+                 paragraphs = justext.justext(page, justext.get_stoplist("English"))
+             elif language == "Hindi":
+                 paragraphs = justext.justext(page, justext.get_stoplist("Hindi"), 70, 140, 0.0, 0.02, 0.5, 150, False)
+             elif language == "Odia":
+                 paragraphs = justext.justext(
+                     page, odia_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False
+                 )
+
+             for paragraph in paragraphs:
+                 if not paragraph.is_boilerplate:
+                     para = para + "\n" + paragraph.text
+             # return the extracted data (para) as a string
+             if para == "":
+                 st.error("Unable to extract data from the URL")
+                 return None
+             else:
+                 return para
+         else:
+             st.error("Request failed")
+             return None
+     except Exception as err:
+         st.error(err)
+         return None
+
+
+ # Extract text content from the uploaded documents
+ def extract_data_from_documents(documents):
+     data = ""
+     if documents is not None:
+         for document in documents:
+             document_details = {
+                 "filename": document.name,
+                 "filetype": document.type,
+                 "filesize": document.size,
+             }
+             st.write(document_details)
+
+             # Extract content from a txt file
+             if document.type == "text/plain":
+                 # Read as bytes and decode
+                 data += str(document.read(), "utf-8")
+
+             # Extract content from a pdf file using pdfplumber
+             elif document.type == "application/pdf":
+                 try:
+                     with pdfplumber.open(document) as pdf:
+                         all_text = ""
+                         for page in pdf.pages:
+                             text = page.extract_text()
+                             if text:  # extract_text() returns None for pages without text
+                                 all_text += text + "\n"
+                         data += all_text
+                 except Exception:
+                     st.write("None")
+
+             # Extract content from a docx file
+             elif (
+                 document.type
+                 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+             ):
+                 data += docx2txt.process(document)
+
+         # return the extracted data
+         return data
+     else:
+         st.error("Error: An error occurred while fetching content.")
+         return None
+
+
+ # Check the inputs for language, promptType, question count, and format
+ def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
+     langFlag = False
+     promptFlag = False
+     noOfQuestionFlag = False
+     instructionFormatFlag = False
+
+     if language:
+         langFlag = True
+     if promptType:
+         promptFlag = True
+     if noOfQuestions:
+         noOfQuestionFlag = True
+     if instructionFormat:
+         instructionFormatFlag = True
+     # all of the compulsory inputs must be set
+     return langFlag and promptFlag and noOfQuestionFlag and instructionFormatFlag
+
+
+ def main():
+     # set up the initial session states
+     if "extract_button" not in st.session_state:
+         st.session_state.extract_button = False
+     if "submit" not in st.session_state:
+         st.session_state.submit = False
+     if "generated" not in st.session_state:
+         st.session_state.generated = False
+     if "selected" not in st.session_state:
+         st.session_state.selected = False
+     if "answered" not in st.session_state:
+         st.session_state.answered = False
+
+     st.subheader("LLM Instructions")
+
+     # form to get the inputs
+     with st.form(key="form1"):
+         st.write("#")
+
+         # dropdown for language
+         language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))
+
+         # dropdown for prompt type
+         promptType = st.selectbox(
+             "Select the Prompt type", ("", "Input text", "Url", "Document")
+         )
+         # input for the number of questions
+         noOfQuestions = st.number_input(
+             "Number of questions to generate:", min_value=1, max_value=20, value=10
+         )
+
+         # dropdown for instruction format
+         instructionFormat = st.selectbox(
+             "Format of instruction:", ("Imperative sentence", "Question")
+         )
+
+         # text input for the OpenAI key
+         openAiKey = st.text_input(label="Input the openai key")
+         st.session_state["openAiKey"] = openAiKey
+
+         st.write("##")
+
+         # form submit button, setting up the session state
+         if st.form_submit_button():
+             st.session_state.submit = True
+
+     if st.session_state.submit:
+         # expander with the prompt form to extract the data
+         with st.expander(label="prompt"):
+             with st.form(key="form2"):
+                 # show the input widgets only if the dropdown inputs are valid
+                 if valid_drop_down(
+                     language, promptType, noOfQuestions, instructionFormat
+                 ):
+                     if promptType == "Input text":
+                         inputText = st.text_area(
+                             label="For Instructions",
+                             placeholder="Please enter your text here",
+                         )
+
+                     elif promptType == "Url":
+                         url = st.text_input(
+                             label="For URL", placeholder="Please enter your text here"
+                         )
+                     elif promptType == "Document":
+                         documents = st.file_uploader(
+                             label="For Documents ( pdf / txt / docx )",
+                             type=["pdf", "txt", "docx"],
+                             accept_multiple_files=True,
+                         )
+
+                 if st.form_submit_button():
+                     st.session_state.extract_button = True
+
+             # extracting data
+             if st.session_state.extract_button:
+                 if promptType == "Input text":
+                     extractedData = inputText
+
+                 elif promptType == "Url":
+                     extractedURLData = extract_data_from_url(url, language)
+                     if extractedURLData is not None:
+                         extractedData = extractedURLData
+                         st.text_area("Extracted Text:", value=extractedData, height=200)
+                     else:
+                         extractedData = False
+                 elif promptType == "Document":
+                     if not documents:
+                         documents = None
+                     else:
+                         for doc in documents:
+                             if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
+                                 # the uploaded document is not a supported type
+                                 st.error("Unsupported file: " + doc.name)
+
+                     extractedDocumentData = extract_data_from_documents(documents)
+                     extractedData = extractedDocumentData
+
+                 # once the data is extracted, run the custom prompt through an instance
+                 if extractedData:
+
+                     # ----------------------------- RUNNING THE PROMPT -----------------------------
+                     st.session_state["extractedData"] = extractedData
+
+                     if "Initial" not in st.session_state:
+                         st.session_state.Initial = True
+
+                     if st.session_state.Initial:
+                         # run the instruction-generation prompt
+                         openai.api_key = st.session_state["openAiKey"]
+                         my_prompt_template = InstructionGenerationTemplate()
+
+                         # rules for the instructions to be generated
+                         additional_rules = """
+                         - You do not need to provide a response to the generated examples.
+                         - You must return the response in the specified language.
+                         - Each generated instruction can be either an imperative sentence or a question.
+                         """
+
+                         if st.button("Generate Instructions"):
+                             prompt = my_prompt_template.format(
+                                 num_questions=noOfQuestions,
+                                 context=extractedData,
+                                 instruction_format=instructionFormat,
+                                 lang=language,
+                                 additional_rules=additional_rules
+                             )
+                             response = openai.ChatCompletion.create(
+                                 model="gpt-3.5-turbo",
+                                 messages=[
+                                     {"role": "system", "content": prompt},
+                                 ])
+
+                             if "result" not in st.session_state:
+                                 content = response.choices[0].message.content
+                                 # split the numbered list into individual instructions
+                                 responses_list = content.split('\n')
+                                 responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
+                                 st.session_state["result"] = responses_list
+                                 st.session_state.generated = True
+                                 st.session_state.Initial = False
+
+                     if st.session_state.generated:
+                         # display the generated instructions
+                         st.write("Generated Instructions")
+                         result = st.session_state["result"]
+                         result_dict = {i + 1: value for i, value in enumerate(result)}
+                         selected_items = [f" {value} " for key, value in result_dict.items() if st.checkbox(f"Q{key} : {value}")]
+                         # display the selected items as a list
+                         if selected_items:
+                             st.write("Selected Items:")
+                             st.write(selected_items)
+                             st.session_state["selected_items"] = selected_items
+                             st.session_state.selected = True
+                         else:
+                             st.write("No items selected.")
+
+                     # ------------------ RUNNING THE PROMPT FOR ANSWER GENERATION ------------------
+
+                     if st.session_state.selected:
+
+                         if "Initial2" not in st.session_state:
+                             st.session_state.Initial2 = True
+
+                         if st.session_state.Initial2:
+                             # run the answer-generation prompt
+                             openai.api_key = st.session_state["openAiKey"]
+                             my_prompt_template2 = AnswerGenerationTemplate()
+
+                             # rules for the answers to be generated
+                             additional_rules = """
+                             Enumerate the answers and don't provide any additional tags.
+                             """
+
+                             question = st.session_state["selected_items"]
+                             if st.button("Generate Answers"):
+                                 prompt = my_prompt_template2.format(
+                                     questions=question,
+                                     additional_rules=additional_rules
+                                 )
+                                 response = openai.ChatCompletion.create(
+                                     model="gpt-3.5-turbo",
+                                     messages=[
+                                         {"role": "system", "content": prompt},
+                                     ])
+
+                                 if "answers" not in st.session_state:
+                                     content = response.choices[0].message.content
+                                     # strip the enumeration from each answer line
+                                     responses_list = content.split('\n')
+                                     responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
+                                     st.session_state["answers"] = responses_list
+                                     st.session_state.answered = True
+                                     st.session_state.Initial2 = False
+
+                         if st.session_state.answered:
+                             # display the generated answers
+                             questions = st.session_state["selected_items"]
+                             answers = st.session_state["answers"]
+                             answers_dict = {i + 1: value for i, value in enumerate(answers)}
+                             st.write(answers_dict)
+
+                             st.write("Generated Questions and Answers")
+                             # pair each selected question with its generated answer
+                             jsonl_data = [{"Question": question, "Answer": answers_dict.get(i, 'No answer found')} for i, question in enumerate(questions, start=1)]
+
+                             st.write(jsonl_data)
+                             jsonl_string = '\n'.join(json.dumps(item, ensure_ascii=False) for item in jsonl_data)
+
+                             if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
+                                 st.success("Successfully saved")
+
+     if st.button("Clear"):
+         st.session_state.extract_button = False
+         st.session_state.submit = False
+         st.session_state.generated = False
+         st.session_state.selected = False
+         st.session_state.answered = False
+
+         if "Initial" in st.session_state:
+             st.session_state.Initial = True
+         if "Initial2" in st.session_state:
+             st.session_state.Initial2 = True
+
+         if "openAiKey" in st.session_state:
+             del st.session_state["openAiKey"]
+         if "extractedData" in st.session_state:
+             del st.session_state["extractedData"]
+         if "result" in st.session_state:
+             del st.session_state["result"]
+         if "selected_items" in st.session_state:
+             del st.session_state["selected_items"]
+         if "answers" in st.session_state:
+             del st.session_state["answers"]
+         st.experimental_rerun()
+
+
+ if __name__ == "__main__":
+     main()
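
Note: app.py assumes the model returns instructions as a newline-separated numbered list and strips the enumeration with a regex. A minimal standalone sketch of that parsing step (the sample `content` string is illustrative, not a real model response):

    import re

    # Illustrative model output in the numbered-list shape app.py expects
    content = "1. What crops suit a Mediterranean climate?\n2. Name one boilerplate-removal library.\n"

    # Same cleanup as app.py: split on newlines, drop empties, strip the "N. " prefixes
    responses_list = content.split('\n')
    responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]

    print(responses_list)
    # ['What crops suit a Mediterranean climate?', 'Name one boilerplate-removal library.']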
custom_prompt_template.py ADDED
@@ -0,0 +1,43 @@
+ from typing import List
+ import langchain
+
+
+ class InstructionGenerationTemplate(langchain.prompts.PromptTemplate):
+     """A custom prompt template for generating instructions."""
+
+     input_variables: List[str] = ["num_questions", "context", "instruction_format", "lang", "additional_rules"]
+
+     template = """
+ You are a highly intelligent language model trained to assist with a variety of language tasks. Your task here is to generate {num_questions} diverse questions or instructions based on the context provided below:
+
+ Context:
+ {context}
+
+ Please follow these rules:
+ {additional_rules}
+
+ Please generate the instructions in the {instruction_format} format and in {lang} language. Remember to adhere to the rules mentioned above.
+ """
+
+     template_format = "f-string"
+
+     def format(self, **kwargs):
+         """Format the prompt."""
+         return self.template.format(**kwargs)
+
+
+ class AnswerGenerationTemplate(langchain.prompts.PromptTemplate):
+     """A custom prompt template for generating answers to questions."""
+
+     input_variables: List[str] = ["questions", "additional_rules"]
+
+     template = """
+ You are a highly intelligent language model tasked with providing answers to the following questions:
+
+ Questions:
+ {questions}
+
+ Please follow these rules:
+ {additional_rules}
+ """
+
+     template_format = "f-string"
+
+     def format(self, **kwargs):
+         """Format the prompt."""
+         return self.template.format(**kwargs)
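
Note: a minimal usage sketch of these templates outside Streamlit, mirroring how app.py instantiates them (the context and rules strings are made up for illustration, and this assumes a langchain version that accepts the class-level defaults above):

    from custom_prompt_template import InstructionGenerationTemplate

    template = InstructionGenerationTemplate()
    prompt = template.format(
        num_questions=3,                                              # illustrative values
        context="Olive farming is common in Mediterranean climates.",
        instruction_format="Question",
        lang="English",
        additional_rules="- Keep each question under 20 words.",
    )
    print(prompt)  # the filled-in prompt string that gets sent to the chat model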
data-downloader/download_eval_data.sh ADDED
@@ -0,0 +1,68 @@
+ mkdir -p data/downloads
+ mkdir -p data/eval
+
+ # MMLU dataset
+ wget -O data/downloads/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar
+ mkdir -p data/downloads/mmlu_data
+ tar -xvf data/downloads/mmlu_data.tar -C data/downloads/mmlu_data
+ mv data/downloads/mmlu_data/data data/eval/mmlu && rm -r data/downloads/mmlu_data data/downloads/mmlu_data.tar
+
+ # Big-Bench-Hard dataset
+ wget -O data/downloads/bbh_data.zip https://github.com/suzgunmirac/BIG-Bench-Hard/archive/refs/heads/main.zip
+ mkdir -p data/downloads/bbh
+ unzip data/downloads/bbh_data.zip -d data/downloads/bbh
+ mv data/downloads/bbh/BIG-Bench-Hard-main/ data/eval/bbh && rm -r data/downloads/bbh data/downloads/bbh_data.zip
+
+ # Super-NaturalInstructions dataset
+ wget -O data/downloads/superni_data.zip https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
+ mkdir -p data/downloads/superni
+ unzip data/downloads/superni_data.zip -d data/downloads/superni
+ mv data/downloads/superni/natural-instructions-master/ data/eval/superni && rm -r data/downloads/superni data/downloads/superni_data.zip
+
+ # TyDiQA-GoldP dataset
+ mkdir -p data/eval/tydiqa
+ wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.json
+ wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json
+
+ # XOR-QA dataset
+ wget -P data/eval/xorqa/ https://raw.githubusercontent.com/mia-workshop/MIA-Shared-Task-2022/main/data/eval/mia_2022_dev_xorqa.jsonl
+ wget -P data/eval/xorqa/ https://github.com/mia-workshop/MIA-Shared-Task-2022/raw/main/data/train/mia_2022_train_data.jsonl.zip
+ unzip data/eval/xorqa/mia_2022_train_data.jsonl.zip -d data/eval/xorqa/ && rm data/eval/xorqa/mia_2022_train_data.jsonl.zip
+
+ # GSM dataset
+ wget -P data/eval/gsm/ https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl
+
+ # Multilingual GSM dataset
+ wget -O data/downloads/url-nlp.zip https://github.com/google-research/url-nlp/archive/refs/heads/main.zip
+ mkdir -p data/downloads/url-nlp
+ unzip data/downloads/url-nlp.zip -d data/downloads/url-nlp
+ mv data/downloads/url-nlp/url-nlp-main/mgsm data/eval/mgsm && rm -r data/downloads/url-nlp data/downloads/url-nlp.zip
+
+ # Codex HumanEval
+ wget -P data/eval/codex_humaneval https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz
+
+ # TruthfulQA
+ wget -P data/eval/truthfulqa https://github.com/sylinrl/TruthfulQA/raw/main/TruthfulQA.csv
+
+ # Self-instruct eval, Vicuna eval, and Koala eval for creative instructions/tasks
+ mkdir -p data/eval/creative_tasks
+ wget -O data/eval/creative_tasks/self_instruct_test.jsonl https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl
+ wget -O data/eval/creative_tasks/vicuna_test.jsonl https://github.com/lm-sys/FastChat/raw/main/fastchat/eval/table/question.jsonl
+ wget -O data/eval/creative_tasks/koala_test.jsonl https://github.com/arnav-gudibande/koala-test-set/raw/main/koala_test_set.jsonl
+
+ # Toxigen data
+ mkdir -p data/eval/toxigen
+ for minority_group in asian black chinese jewish latino lgbtq mental_disability mexican middle_east muslim native_american physical_disability trans women
+ do
+     wget -O data/eval/toxigen/hate_${minority_group}.txt https://raw.githubusercontent.com/microsoft/TOXIGEN/main/prompts/hate_${minority_group}_1k.txt
+ done
data-downloader/download_instructions_data.sh ADDED
@@ -0,0 +1,120 @@
+ # check if there is $HF_TOKEN in the environment variables
+ if [ -z "$HF_TOKEN" ]
+ then
+     echo "Warning: HuggingFace dataset LIMA requires permissive access."
+     echo "Warning: Please request the access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script."
+     exit 1
+ fi
+
+ echo "Downloading Super-NaturalInstructions dataset..."
+ wget -P data/raw_train/super_ni/ https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
+ unzip data/raw_train/super_ni/master.zip -d data/raw_train/super_ni/ && rm data/raw_train/super_ni/master.zip
+ mv data/raw_train/super_ni/natural-instructions-master/* data/raw_train/super_ni/ && rm -r data/raw_train/super_ni/natural-instructions-master
+
+ echo "Downloading the flan_v2 chain-of-thought submix..."
+ wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ52K2Q932H6KZY499A7FE8/files/cot_zsopt.jsonl
+ wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ51ZV283RAZW7J3ECM4S58/files/cot_fsopt.jsonl
+
+ echo "Downloading the flan_v2 collection, here we subsampled only 100K instances..."
+ wget -P data/raw_train/flan_v2/ https://beaker.org/api/v3/datasets/01GZTTS2EJFPA83PXS4FQCS1SA/files/flan_v2_resampled_100k.jsonl
+
+ echo "Downloading self-instruct data..."
+ wget -P data/raw_train/self_instruct/ https://raw.githubusercontent.com/yizhongw/self-instruct/main/data/gpt3_generations/batch_221203/all_instances_82K.jsonl
+
+ echo "Downloading unnatural-instructions data..."
+ wget -P data/raw_train/unnatural_instructions/ https://github.com/orhonovich/unnatural-instructions/raw/main/data/core_data.zip
+ unzip data/raw_train/unnatural_instructions/core_data.zip -d data/raw_train/unnatural_instructions/
+
+ echo "Downloading Stanford alpaca data..."
+ wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json
+
+ echo "Downloading the dolly dataset..."
+ wget -P data/raw_train/dolly/ https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
+
+ echo "Downloading the OpenAssistant data (oasst1)..."
+ wget -P data/raw_train/oasst1/ https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz
+ gzip -d data/raw_train/oasst1/2023-04-12_oasst_ready.trees.jsonl.gz
+
+ echo "Downloading the code alpaca dataset..."
+ wget -P data/raw_train/code_alpaca/ https://github.com/sahil280114/codealpaca/raw/master/data/code_alpaca_20k.json
+
+ echo "Downloading the gpt4-llm dataset..."
+ wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json
+ wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data_zh.json
+
+ echo "Downloading the baize dataset..."
+ wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/alpaca_chat_data.json
+ wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
+ wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/quora_chat_data.json
+ wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/stackoverflow_chat_data.json
+
+ echo "Downloading ShareGPT dataset..."
+ wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
+ wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
+ echo "Splitting the ShareGPT dataset..."
+ python scripts/split_sharegpt_conversations.py \
+     --in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
+     --out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json \
+     --model-name-or-path ../hf_llama_models/7B/
+
+ echo "Downloading LIMA dataset..."
+ wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl
+
+ echo "Downloading WizardLM dataset..."
+ wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json
+
+ echo "Downloading the OpenOrca dataset..."
+ wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/1M-GPT4-Augmented.parquet
+ wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/3_5M-GPT3_5-Augmented.parquet
+
+ echo "Reformatting the datasets..."
+ python open_instruct/reformat_datasets.py --raw_data_dir data/raw_train/ --output_dir data/processed/
+
+ echo "Creating Tulu data mixtures..."
+ mkdir -p data/processed/tulu/
+ cat data/processed/flan_v2/flan_v2_data.jsonl \
+     data/processed/cot/cot_data.jsonl \
+     data/processed/dolly/dolly_data.jsonl \
+     data/processed/oasst1/oasst1_data.jsonl \
+     data/processed/gpt4_alpaca/gpt4_alpaca_data.jsonl \
+     data/processed/code_alpaca/code_alpaca_data.jsonl \
+     data/processed/sharegpt/sharegpt_data.jsonl \
+     > data/processed/tulu/tulu_v1_mix.jsonl
+
+ cat data/processed/flan_v2/flan_v2_data.jsonl \
+     data/processed/cot/cot_data.jsonl \
+     data/processed/dolly/dolly_data.jsonl \
+     data/processed/oasst1/oasst1_data.jsonl \
+     > data/processed/tulu/tulu_v1_human_mix.jsonl
+
+ cat data/processed/flan_v2/flan_v2_data.jsonl \
+     data/processed/cot/cot_data.jsonl \
+     data/processed/oasst1/oasst1_data.jsonl \
+     data/processed/lima/lima_data.jsonl \
+     data/processed/code_alpaca/code_alpaca_data.jsonl \
+     data/processed/sharegpt/sharegpt_data.jsonl \
+     data/processed/wizardlm/wizardlm_data.jsonl \
+     data/processed/open_orca/open_orca_data.jsonl \
+     > data/processed/tulu/tulu_v2_mix.jsonl
+
+ cat data/processed/flan_v2/flan_v2_data.jsonl \
+     data/processed/cot/cot_data.jsonl \
+     data/processed/oasst1/oasst1_data.jsonl \
+     data/processed/lima/lima_data.jsonl \
+     > data/processed/tulu/tulu_v2_human_mix.jsonl
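
Note: the Tulu mixtures above are plain concatenations of per-dataset jsonl files, so the `cat` pipelines can equivalently be done in Python if the shell approach needs adapting. A sketch restating the tulu_v1 mix (file names taken from the script above):

    import os

    # Components of tulu_v1_mix.jsonl, as listed in the cat command above
    components = [
        "data/processed/flan_v2/flan_v2_data.jsonl",
        "data/processed/cot/cot_data.jsonl",
        "data/processed/dolly/dolly_data.jsonl",
        "data/processed/oasst1/oasst1_data.jsonl",
        "data/processed/gpt4_alpaca/gpt4_alpaca_data.jsonl",
        "data/processed/code_alpaca/code_alpaca_data.jsonl",
        "data/processed/sharegpt/sharegpt_data.jsonl",
    ]

    os.makedirs("data/processed/tulu", exist_ok=True)
    with open("data/processed/tulu/tulu_v1_mix.jsonl", "w") as fout:
        for path in components:
            with open(path) as fin:
                for line in fin:  # one json instance per line; copy as-is
                    fout.write(line)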
olive_farm.png ADDED
Git LFS Details

  • SHA256: 196ae590f3c139e3987860735fa7db0050f5d0080eeed2e0f88fa95b0d7bcdc8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.44 MB
open_instruct/get_data_stats.py ADDED
@@ -0,0 +1,121 @@
+ import json
+ import os
+ import sys
+ import tqdm
+ import pandas as pd
+ import numpy as np
+ import argparse
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+
+
+ def get_statistics_for_messages_data(data_path):
+     # load dataset
+     dataset = load_dataset("json", data_files={"train": data_path})
+     # tokenize dataset
+     tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B", use_fast=False)
+     # get statistics
+     num_instances = len(dataset["train"])
+     num_of_turns = [len(instance["messages"]) for instance in dataset["train"]]
+     user_prompt_lengths = []
+     assistant_response_lengths = []
+     instance_lengths = []
+     for instance in tqdm.tqdm(dataset["train"], desc="Processing instances"):
+         instance_length = 0
+         for message in instance["messages"]:
+             if message["role"] == "user":
+                 user_prompt_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
+                 instance_length += user_prompt_lengths[-1]
+             elif message["role"] == "assistant":
+                 assistant_response_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
+                 instance_length += assistant_response_lengths[-1]
+         instance_lengths.append(instance_length)
+
+     top_100_longest_instances = np.argsort(instance_lengths)[-100:][::-1].tolist()
+     top_100_longest_instances = [dataset["train"][i]["id"] for i in top_100_longest_instances]
+
+     result = {
+         "num_instances": num_instances,
+         "turns_summary": pd.Series(num_of_turns).describe(),
+         "user_prompt_lengths_summary": pd.Series(user_prompt_lengths).describe(),
+         "assistant_response_lengths_summary": pd.Series(assistant_response_lengths).describe(),
+         "total_lengths_summary": pd.Series(instance_lengths).describe(),
+         "num_instances_with_total_length_gt_512": np.sum(np.array(instance_lengths) > 512),
+         "num_instances_with_total_length_gt_768": np.sum(np.array(instance_lengths) > 768),
+         "num_instances_with_total_length_gt_1024": np.sum(np.array(instance_lengths) > 1024),
+         "num_instances_with_total_length_gt_1536": np.sum(np.array(instance_lengths) > 1536),
+         "num_instances_with_total_length_gt_2048": np.sum(np.array(instance_lengths) > 2048),
+         "num_instances_with_total_length_gt_4096": np.sum(np.array(instance_lengths) > 4096),
+         "top_100_longest_instances": top_100_longest_instances,
+     }
+
+     # convert everything to dict or scalar
+     for key, value in result.items():
+         if isinstance(value, pd.Series):
+             result[key] = value.to_dict()
+         elif isinstance(value, np.ndarray):
+             result[key] = value.tolist()
+         elif isinstance(value, np.int64):
+             result[key] = int(value)
+
+     return result
+
+
+ def get_statistics_for_prompt_completion_data(data_path):
+     # load dataset
+     dataset = load_dataset("json", data_files={"train": data_path})
+     prompts = [instance["prompt"] for instance in dataset["train"]]
+     completions = [instance["completion"] for instance in dataset["train"]]
+     # tokenize dataset
+     tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B")
+     tokenized_prompts = tokenizer(prompts, truncation=False, add_special_tokens=False)
+     tokenized_completions = tokenizer(completions, truncation=False, add_special_tokens=False)
+     # get statistics
+     num_instances = len(dataset["train"])
+     prompt_lengths = [len(tokenized_prompts["input_ids"][i]) for i in range(num_instances)]
+     completion_lengths = [len(tokenized_completions["input_ids"][i]) for i in range(num_instances)]
+     prompt_completion_lengths = [prompt_lengths[i] + completion_lengths[i] for i in range(num_instances)]
+
+     result = {
+         "num_instances": num_instances,
+         "prompt_lengths_summary": pd.Series(prompt_lengths).describe(),
+         "completion_lengths_summary": pd.Series(completion_lengths).describe(),
+         "prompt_completion_lengths_summary": pd.Series(prompt_completion_lengths).describe(),
+         "num_instances_with_prompt_length_gt_512": np.sum(np.array(prompt_lengths) > 512),
+         "num_instances_with_completion_length_gt_512": np.sum(np.array(completion_lengths) > 512),
+         "num_instances_with_prompt_completion_length_gt_512": np.sum(np.array(prompt_completion_lengths) > 512),
+         "num_instances_with_completion_length_gt_768": np.sum(np.array(completion_lengths) > 768),
+         "num_instances_with_prompt_completion_length_gt_1024": np.sum(np.array(prompt_completion_lengths) > 1024),
+     }
+
+     # convert everything to dict or scalar
+     for key, value in result.items():
+         if isinstance(value, pd.Series):
+             result[key] = value.to_dict()
+         elif isinstance(value, np.ndarray):
+             result[key] = value.tolist()
+         elif isinstance(value, np.int64):
+             result[key] = int(value)
+
+     return result
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--data_path", type=str, required=True)
+     parser.add_argument("--save_path", type=str, help="Path to save the statistics.")
+     args = parser.parse_args()
+
+     with open(args.data_path, "r") as f:
+         sample = json.loads(f.readline())
+         if "prompt" in sample:
+             statistics = get_statistics_for_prompt_completion_data(args.data_path)
+         elif "messages" in sample:
+             statistics = get_statistics_for_messages_data(args.data_path)
+         else:
+             raise ValueError("Invalid data format - the data should be either prompt completion data or messages data.")
+
+     print(json.dumps(statistics, indent=4))
+
+     if args.save_path is not None:
+         with open(args.save_path, "w") as f:
+             json.dump(statistics, f, indent=4)
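
Note: the script dispatches on the first record of the input file: a "prompt" key selects the prompt/completion statistics, a "messages" key the chat-style statistics. A sketch of preparing a tiny messages-format file and invoking the script on it (demo_data.jsonl is an arbitrary name, and the tokenizer path hard-coded above must exist or be edited first):

    import json

    # One instance in the "messages" format the script expects
    sample = {
        "dataset": "demo",
        "id": "demo_0",
        "messages": [
            {"role": "user", "content": "What is 2 + 2?"},
            {"role": "assistant", "content": "4"},
        ],
    }
    with open("demo_data.jsonl", "w") as f:
        f.write(json.dumps(sample) + "\n")

    # Then, from the shell:
    #   python open_instruct/get_data_stats.py --data_path demo_data.jsonl --save_path stats.json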
open_instruct/reformat_data.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ '''
4
+ This script is used to reformat the downloaded datasets into the format that can be used by the model.
5
+ Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows:
6
+ {
7
+ "dataset": "dataset_name",
8
+ "id": "unique_id",
9
+ "messages": [
10
+ {"role": "system", "content": "message_text"}, # optional
11
+ {"role": "user", "content": "message_text"},
12
+ {"role": "assistant", "content": "message_text"},
13
+ {"role": "user", "content": "message_text"},
14
+ {"role": "assistant", "content": "message_text"},
15
+ ...
16
+ ],
17
+ }
18
+ '''
19
+
20
+ import json
21
+ import random
22
+ import re
23
+ import os
24
+ import pandas as pd
25
+ import argparse
26
+ from instruction_encode_templates import encode_instruction_example, encode_few_shot_example
27
+
28
+
29
+ def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
30
+ os.makedirs(output_dir, exist_ok=True)
31
+ train_tasks = []
32
+ with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r") as fin:
33
+ for line in fin:
34
+ if not "_mmmlu_" in line: # skip mmlu to avoid test leakage
35
+ train_tasks.append(line.strip())
36
+ with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout:
37
+ for task in train_tasks:
38
+ with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r") as fin:
39
+ task_data = json.load(fin)
40
+ instruction = task_data["Definition"][0]
41
+ if zero_shot_examples_per_task + few_shot_examples_per_task < len(task_data["Instances"]):
42
+ instances = random.sample(task_data["Instances"], k=zero_shot_examples_per_task+few_shot_examples_per_task)
43
+ else:
44
+ instances = task_data["Instances"]
45
+ for instance in instances[:zero_shot_examples_per_task]:
46
+ encoded_example = encode_instruction_example(
47
+ instruction=instruction,
48
+ input=instance["input"],
49
+ output=instance["output"][0],
50
+ random_template=True,
51
+ eos_token=None
52
+ )
53
+ fout.write(json.dumps({
54
+ "dataset": "super_ni",
55
+ "id": f"super_ni_{instance['id']}",
56
+ "messages": [
57
+ {"role": "user", "content": encoded_example["prompt"]},
58
+ {"role": "assistant", "content": encoded_example["completion"]},
59
+ ]
60
+ }) + "\n")
61
+ for instance in instances[zero_shot_examples_per_task:]:
62
+ if n_few_shot < len(task_data["Positive Examples"]):
63
+ examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
64
+ else:
65
+ examplars = task_data["Positive Examples"]
66
+ encoded_example = encode_few_shot_example(
67
+ instruction=instruction,
68
+ examplars=examplars,
69
+ input=instance["input"],
70
+ output=instance["output"][0],
71
+ eos_token=None
72
+ )
73
+ fout.write(json.dumps({
74
+ "dataset": "super_ni",
75
+ "id": f"super_ni_{instance['id']}",
76
+ "messages": [
77
+ {"role": "user", "content": encoded_example["prompt"]},
78
+ {"role": "assistant", "content": encoded_example["completion"]},
79
+ ]
80
+ }) + "\n")
81
+
82
+
83
+ def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
84
+ os.makedirs(output_dir, exist_ok=True)
85
+ examples = []
86
+ if num_few_shot_examples > 0:
87
+ with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
88
+ zero_shot_examples = [json.loads(line) for line in fin]
89
+ if num_zero_shot_examples < len(zero_shot_examples):
90
+ zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples)
91
+ examples.extend(zero_shot_examples)
92
+ if num_few_shot_examples > 0:
93
+ with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin:
94
+ few_shot_examples = [json.loads(line) for line in fin]
95
+ if num_few_shot_examples < len(few_shot_examples):
96
+ few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples)
97
+ examples.extend(few_shot_examples)
98
+ output_path = os.path.join(output_dir, "cot_data.jsonl")
99
+ with open(output_path, "w") as fout:
100
+ for idx, example in enumerate(examples):
101
+ prompt = example["inputs"]
102
+ if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
103
+ prompt += "\n"
104
+ completion = example["targets"]
105
+ fout.write(json.dumps({
106
+ "dataset": "cot",
107
+ "id": f"cot_{idx}",
108
+ "messages": [
109
+ {"role": "user", "content": prompt},
110
+ {"role": "assistant", "content": completion},
111
+ ]
112
+ }) + "\n")
113
+
114
+
115
+ def convert_flan_v2_data(data_dir, output_dir):
116
+ os.makedirs(output_dir, exist_ok=True)
117
+ examples = []
118
+ with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r") as fin:
119
+ for line in fin:
120
+ examples.append(json.loads(line))
121
+ output_path = os.path.join(output_dir, "flan_v2_data.jsonl")
122
+ with open(output_path, "w") as fout:
123
+ for idx, example in enumerate(examples):
124
+ prompt = example["inputs"]
125
+ if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
126
+ prompt += "\n"
127
+ completion = example["targets"]
128
+ fout.write(json.dumps({
129
+ "dataset": "flan_v2",
130
+ "id": f"flan_v2_{idx}",
131
+ "messages": [
132
+ {"role": "user", "content": prompt},
133
+ {"role": "assistant", "content": completion},
134
+ ]
135
+ }) + "\n")
136
+
137
+
138
+ def convert_dolly_data(data_dir, output_dir):
139
+ os.makedirs(output_dir, exist_ok=True)
140
+ examples = []
141
+ with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r") as fin:
142
+ for line in fin:
143
+ examples.append(json.loads(line))
144
+ output_path = os.path.join(output_dir, "dolly_data.jsonl")
145
+ with open(output_path, "w") as fout:
146
+ for idx, example in enumerate(examples):
147
+ encoded_example = encode_instruction_example(
148
+ instruction=example["instruction"],
149
+ input=example["context"],
150
+ output=example["response"],
151
+ random_template=True,
152
+ eos_token=None
153
+ )
154
+ fout.write(json.dumps({
155
+ "dataset": "dolly",
156
+ "id": f"dolly_{idx}",
157
+ "messages": [
158
+ {"role": "user", "content": encoded_example["prompt"]},
159
+ {"role": "assistant", "content": encoded_example["completion"]},
160
+ ]
161
+ }) + "\n")
162
+
163
+
164
+ def convert_self_instruct_data(data_dir, output_dir):
165
+ os.makedirs(output_dir, exist_ok=True)
166
+ examples = []
167
+ with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r") as fin:
168
+ for line in fin:
169
+ examples.append(json.loads(line))
170
+ output_path = os.path.join(output_dir, "self_instruct_data.jsonl")
171
+ with open(output_path, "w") as fout:
172
+ for idx, example in enumerate(examples):
173
+ encoded_example = encode_instruction_example(
174
+ instruction=example["instruction"],
175
+ input=example["input"],
176
+ output=example["output"],
177
+ random_template=True,
178
+ eos_token=None
179
+ )
180
+ fout.write(json.dumps({
181
+ "dataset": "self_instruct",
182
+ "id": f"self_instruct_{idx}",
183
+ "messages": [
184
+ {"role": "user", "content": encoded_example["prompt"]},
185
+ {"role": "assistant", "content": encoded_example["completion"]},
186
+ ]
187
+ }) + "\n")
188
+
189
+
+ def convert_unnatural_instructions_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     instance_cnt = 0
+     with open(os.path.join(data_dir, "core_data.jsonl"), "r") as fin, \
+             open(os.path.join(output_dir, "unnatural_instructions_data.jsonl"), "w") as fout:
+         for line in fin:
+             task_data = json.loads(line)
+             instruction = task_data["instruction"]
+             for instance in task_data["instances"]:
+                 # append per-instance constraints to the task instruction when present
+                 if instance["constraints"] and instance["constraints"].lower() not in ["none", "none."]:
+                     instance_instruction = instruction + "\n" + instance["constraints"]
+                 else:
+                     instance_instruction = instruction
+                 encoded_example = encode_instruction_example(
+                     instruction=instance_instruction,
+                     input=instance["input"],
+                     output=instance["output"],
+                     random_template=True,
+                     eos_token=None
+                 )
+                 fout.write(json.dumps({
+                     "dataset": "unnatural_instructions",
+                     "id": f"unnatural_instructions_{instance_cnt}",
+                     "messages": [
+                         {"role": "user", "content": encoded_example["prompt"]},
+                         {"role": "assistant", "content": encoded_example["completion"]},
+                     ]
+                 }) + "\n")
+                 instance_cnt += 1
+
+
+ def convert_stanford_alpaca_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin:
+         examples.extend(json.load(fin))
+     output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             encoded_example = encode_instruction_example(
+                 instruction=example["instruction"],
+                 input=example["input"],
+                 output=example["output"],
+                 random_template=True,
+                 eos_token=None
+             )
+             fout.write(json.dumps({
+                 "dataset": "stanford_alpaca",
+                 "id": f"stanford_alpaca_{idx}",
+                 "messages": [
+                     {"role": "user", "content": encoded_example["prompt"]},
+                     {"role": "assistant", "content": encoded_example["completion"]},
+                 ]
+             }) + "\n")
+
+
+ def convert_code_alpaca_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin:
+         examples.extend(json.load(fin))
+     output_path = os.path.join(output_dir, "code_alpaca_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             encoded_example = encode_instruction_example(
+                 instruction=example["instruction"],
+                 input=example["input"],
+                 output=example["output"],
+                 random_template=True,
+                 eos_token=None
+             )
+             fout.write(json.dumps({
+                 "dataset": "code_alpaca",
+                 "id": f"code_alpaca_{idx}",
+                 "messages": [
+                     {"role": "user", "content": encoded_example["prompt"]},
+                     {"role": "assistant", "content": encoded_example["completion"]},
+                 ]
+             }) + "\n")
+
+
+ def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     if load_en:
+         with open(os.path.join(data_dir, "alpaca_gpt4_data.json"), "r") as fin:
+             examples.extend(json.load(fin))
+     if load_zh:
+         with open(os.path.join(data_dir, "alpaca_gpt4_data_zh.json"), "r") as fin:
+             examples.extend(json.load(fin))
+     output_path = os.path.join(output_dir, "gpt4_alpaca_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             encoded_example = encode_instruction_example(
+                 instruction=example["instruction"],
+                 input=example["input"],
+                 output=example["output"],
+                 random_template=True,
+                 eos_token=None
+             )
+             fout.write(json.dumps({
+                 "dataset": "gpt4_alpaca",
+                 "id": f"gpt4_alpaca_{idx}",
+                 "messages": [
+                     {"role": "user", "content": encoded_example["prompt"]},
+                     {"role": "assistant", "content": encoded_example["completion"]},
+                 ]
+             }) + "\n")
+
+
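+ # ShareGPT "from" tags map onto chat roles (human/user -> user, gpt/chatgpt -> assistant);
+ # conversations containing system or bing turns are counted as invalid and skipped.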
+ def convert_sharegpt_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin:
+         examples.extend(json.load(fin))
+
+     output_path = os.path.join(output_dir, "sharegpt_data.jsonl")
+     with open(output_path, "w") as fout:
+         invalid_cnt = 0
+         for idx, example in enumerate(examples):
+             messages = []
+             valid = True
+             for message in example["conversations"]:
+                 if message["from"] == "human" or message["from"] == "user":
+                     messages.append({
+                         "role": "user",
+                         "content": message["value"]
+                     })
+                 elif message["from"] == "gpt" or message["from"] == "chatgpt":
+                     messages.append({
+                         "role": "assistant",
+                         "content": message["value"]
+                     })
+                 elif message["from"] == "system":
+                     valid = False
+                     invalid_cnt += 1
+                     break
+                 elif message["from"] == "bing":
+                     valid = False
+                     invalid_cnt += 1
+                     break
+                 else:
+                     raise ValueError(f"Unknown message sender: {message['from']}")
+             if messages and valid:
+                 fout.write(json.dumps({
+                     "dataset": "sharegpt",
+                     "id": f"sharegpt_{example['id']}",
+                     "messages": messages
+                 }) + "\n")
+         print(f"# of invalid examples in sharegpt data: {invalid_cnt}")
+
+
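+ # Baize stores a whole dialogue as one string with [|Human|] and [|AI|] turn markers,
+ # e.g. "...[|Human|] hi [|AI|] hello [|Human|] ...", which is split back into messages below.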
+ def convert_baize_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     for source in ["alpaca", "medical", "quora", "stackoverflow"]:
+         with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r") as fin:
+             examples.extend(json.load(fin))
+
+     output_path = os.path.join(output_dir, "baize_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             # split example["input"] by [|Human|] and [|AI|]
+             messages = []
+             rounds = example["input"].split("[|Human|]")[1:]
+             for round in rounds:
+                 if not round.strip() or "[|AI|]" not in round:
+                     continue
+                 human, assistant = round.split("[|AI|]")
+                 messages.append({
+                     "role": "user",
+                     "content": human.strip()
+                 })
+                 messages.append({
+                     "role": "assistant",
+                     "content": assistant.strip()
+                 })
+             fout.write(json.dumps({
+                 "dataset": "baize",
+                 "id": f"baize_{idx}",
+                 "messages": messages
+             }) + "\n")
+
+
+ def convert_oasst1_data(data_dir, output_dir):
+     '''
+     For OASST1, because it's in a tree structure where every user input might get multiple replies,
+     we have to save every path from the root node down to each leaf assistant reply.
+     This results in some of the messages being duplicated among different paths (instances).
+     Be careful when using this dataset for training: ideally, you should only minimize the loss on the last message in each path.
+     '''
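+     # e.g. if one user prompt has two alternative assistant replies, the tree yields
+     # two instances that share the same opening user message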
+     os.makedirs(output_dir, exist_ok=True)
+     conversations = []
+     with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin:
+         for line in fin:
+             conversations.append(json.loads(line))
+
+     output_path = os.path.join(output_dir, "oasst1_data.jsonl")
+
+     # we filter out the sequences that mention the creator information
+     filter_strings = [
+         "LAION",
+         "Open Assistant",
+         "OpenAssistant",
+     ]
+
+     # traverse the conversation tree and collect all valid sequences
+     def dfs(reply, messages, valid_sequences):
+         if any([filter_string in reply["text"] for filter_string in filter_strings]):
+             return
+         if reply["role"] == "assistant":
+             messages.append(
+                 {"role": "assistant", "content": reply["text"]}
+             )
+             if not reply["replies"]:  # leaf node
+                 valid_sequences.append(messages[:])
+             else:
+                 for child in reply["replies"]:
+                     dfs(child, messages, valid_sequences)
+             messages.pop()  # backtrack before exploring sibling branches
+         elif reply["role"] == "prompter":
+             messages.append(
+                 {"role": "user", "content": reply["text"]}
+             )
+             for child in reply["replies"]:
+                 dfs(child, messages, valid_sequences)
+             messages.pop()  # backtrack before exploring sibling branches
+         else:
+             raise ValueError(f"Unknown role: {reply['role']}")
+
+     with open(output_path, "w") as fout:
+         example_cnt = 0
+         for conversation in conversations:
+             valid_sequences = []
+             dfs(conversation["prompt"], [], valid_sequences)
+             for sequence in valid_sequences:
+                 fout.write(json.dumps({
+                     "dataset": "oasst1",
+                     "id": f"oasst1_{example_cnt}",
+                     "messages": sequence
+                 }) + "\n")
+                 example_cnt += 1
+
+
+ def convert_lima_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     with open(os.path.join(data_dir, "train.jsonl"), "r") as fin:
+         for line in fin:
+             examples.append(json.loads(line))
+     output_path = os.path.join(output_dir, "lima_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             messages = []
+             if len(example["conversations"]) % 2 != 0:
+                 print(f"Warning: example {idx} in LIMA has an odd number of messages. Cutting off the last message.")
+                 example["conversations"] = example["conversations"][:-1]
+
+             for i in range(0, len(example["conversations"]), 2):
+                 messages.append({
+                     "role": "user",
+                     "content": example["conversations"][i]
+                 })
+                 messages.append({
+                     "role": "assistant",
+                     "content": example["conversations"][i+1]
+                 })
+             fout.write(json.dumps({
+                 "dataset": "lima",
+                 "id": f"lima_{idx}",
+                 "messages": messages,
+             }) + "\n")
+
+
+ def convert_wizardlm_data(data_dir, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+     with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
+         examples = json.load(fin)
+
+     output_path = os.path.join(output_dir, "wizardlm_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             messages = []
+             assert len(example["conversations"]) % 2 == 0
+             for i in range(0, len(example["conversations"]), 2):
+                 assert example["conversations"][i]["from"] == "human"
+                 assert example["conversations"][i+1]["from"] == "gpt"
+                 messages.append({
+                     "role": "user",
+                     "content": example["conversations"][i]["value"]
+                 })
+                 messages.append({
+                     "role": "assistant",
+                     "content": example["conversations"][i+1]["value"]
+                 })
+             fout.write(json.dumps({
+                 "dataset": "wizardlm",
+                 "id": f"wizardlm_{example['idx']}",
+                 "messages": messages,
+             }) + "\n")
+
+
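+ # OpenOrca ships as parquet files; by default we sample 100k GPT-4-augmented examples and
+ # none of the GPT-3.5-augmented ones. Unlike the converters above, the original system
+ # prompt is kept as a system message.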
+ def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
+     os.makedirs(output_dir, exist_ok=True)
+     examples = []
+
+     df = pd.read_parquet(os.path.join(data_dir, "1M-GPT4-Augmented.parquet"))
+     gpt4_examples = [row.to_dict() for _, row in df.iterrows()]
+     random.shuffle(gpt4_examples)
+     examples.extend(gpt4_examples[:num_gpt4_examples])
+
+     df = pd.read_parquet(os.path.join(data_dir, "3_5M-GPT3_5-Augmented.parquet"))
+     gpt35_examples = [row.to_dict() for _, row in df.iterrows()]
+     random.shuffle(gpt35_examples)
+     examples.extend(gpt35_examples[:num_gpt35_examples])
+
+     output_path = os.path.join(output_dir, "open_orca_data.jsonl")
+     with open(output_path, "w") as fout:
+         for idx, example in enumerate(examples):
+             messages = [
+                 {"role": "system", "content": example["system_prompt"]},
+                 {"role": "user", "content": example["question"]},
+                 {"role": "assistant", "content": example["response"]}
+             ]
+             fout.write(json.dumps({
+                 "dataset": "open_orca",
+                 "id": f"open_orca_{example['id']}",
+                 "messages": messages,
+             }) + "\n")
+
+
+ if __name__ == "__main__":
+     arg_parser = argparse.ArgumentParser()
+     arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
+     arg_parser.add_argument("--output_dir", type=str, default="data/processed")
+     arg_parser.add_argument("--seed", type=int, default=42)
+     args = arg_parser.parse_args()
+     random.seed(args.seed)
+
+     # get the subfolder names in raw_data_dir
+     subfolders = [f for f in os.listdir(args.raw_data_dir) if os.path.isdir(os.path.join(args.raw_data_dir, f))]
+
+     # all supported datasets
+     supported_datasets = []
+     all_funcs = [func_name for func_name in globals() if callable(globals()[func_name])]
+     for func_name in all_funcs:
+         if re.match(r"convert_.+_data", func_name):
+             supported_datasets.append(func_name[8:-5])
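+             # the [8:-5] slice above strips the "convert_" prefix and the "_data" suffix,
+             # e.g. "convert_dolly_data" -> "dolly"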
+
+     # check if the subfolder names are supported datasets
+     valid_subfolders = []
+     for subfolder in subfolders:
+         if subfolder not in supported_datasets:
+             print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
+         else:
+             valid_subfolders.append(subfolder)
+
+     # prepare data for each dataset
+     statistics = {}
+     for subfolder in valid_subfolders:
+         print(f"Processing {subfolder} data...")
+         globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ pdfplumber
+ docx2txt
+ justext
+ openai
+ langchain
+ # needed by web-app.py: the FAISS vector store and WebBaseLoader's HTML parsing
+ faiss-cpu
+ beautifulsoup4
web-app.py ADDED
@@ -0,0 +1,55 @@
+ import os
+
+ import streamlit as st
+ from langchain.chains import RetrievalQA
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import WebBaseLoader
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.vectorstores import FAISS
+
+ # the OpenAI-backed components read this key from the environment if it is not passed explicitly
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
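+ # Retrieval-augmented QA over a single web page: load the page, split it into chunks,
+ # embed the chunks into a FAISS index, then answer the user's question from the
+ # retrieved chunks with gpt-3.5-turbo.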
+ def assistant(url):
+     question = st.text_input("Ask your Question")
+
+     if st.button("Submit", type="primary"):
+         # ABS_PATH/DB_DIR are only used by the commented-out persistent Chroma store below
+         ABS_PATH: str = os.path.dirname(os.path.abspath(__file__))
+         DB_DIR: str = os.path.join(ABS_PATH, "db")
+
+         # load the page content from the given URL
+         loader = WebBaseLoader(url)
+         data = loader.load()
+
+         # split the page into chunks small enough to embed
+         text_splitter = CharacterTextSplitter(separator='\n',
+                                               chunk_size=1000, chunk_overlap=0)
+         docs = text_splitter.split_documents(data)
+
+         # embed the chunks and index them in an in-memory FAISS store
+         openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
+         # client = chromadb.PersistentClient(path=DB_DIR)
+         vectordb = FAISS.from_documents(documents=docs, embedding=openai_embeddings)
+         # vectordb.persist()
+
+         retriever = vectordb.as_retriever()
+
+         # "stuff" concatenates all retrieved chunks into a single prompt
+         llm = ChatOpenAI(model_name='gpt-3.5-turbo', openai_api_key=OPENAI_API_KEY)
+         qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
+
+         response = qa(question)
+         st.write(response)
+
+
+ st.title('Chat with Website')
+
+ url = st.text_input('Enter Your URL here:')
+
+ if url:
+     assistant(url)