amezasor committed on
Commit
f1785a4
1 Parent(s): 2ab68a5

datasets update

Browse files
Files changed (1) hide show
  1. README.md +2 -97
README.md CHANGED
@@ -12,107 +12,12 @@ datasets:
12
  - bigcode/starcoderdata
13
  # - Stackexchange
14
  # - CommonCrawl
15
- # - open-web-math/open-web-math # Phase 1
16
- # - math-ai/StackMathQA # Phase 2
17
  # - Arxiv
18
  # - Wikipedia
19
  # - conceptofmind/FLAN_2022 # Original link is broken, we used IBM's filtered version | Phase 2
20
- # - bigcode/commitpackft # Phase 2
21
- # - bigcode/oasst-octopack # Phase 2
22
-
23
- # Phase 1 datasets
24
- - togethercomputer/RedPajama-Data-V2 # Common Crawl - CC (Redpajama v2)
25
- - togethercomputer/RedPajama-Data-1T # Books (Redpajama v1)
26
- - allenai/peS2o
27
- - open-web-math/open-web-math
28
- - EleutherAI/proof-pile-2 # Algebraic-stack (HF)
29
- # - Code pile v2 w/o GPL (dp08)
30
- # - Webhose (dp08)
31
- # - Patents (dp08)
32
- # - Arxiv (dp08)
33
- # - IEEE (dp08)
34
- # - DMMath (dp08)
35
- # - Financial research paper (dp08)
36
- # - Paper with code (dp08)
37
- # - Wikipedia (dp08)
38
- # - Stackexchange (dp08)
39
- # - doabooks (dp08)
40
- # - Freelaw (dp08)
41
- # - Pubmed (dp08)
42
- # - EDGAR (dp08)
43
- # - Secfiling (dp08)
44
- # - FIDC (dp08)
45
- # - Earning call transcript (dp08)
46
- #
47
- # Phase 2 datasets: add high quality + instruction tuning datasets into the mixture
48
- # High quality:
49
- # - sap_revised
50
- # - cybersecurity
51
- # - ibm-redbooks
52
- # - ibm.com
53
- # - superknowa
54
- # - multilingual – wikipedia + doabooks (de/es/fr/ja/pt/ar/cs/it/ko/nl/zh)
55
- # Instruction-tuning
56
  - nvidia/HelpSteer
57
- - garage-bAInd/Open-Platypus
58
- - mosaicml/dolly_hhrlhf
59
- - mosaicml/instruct-v3
60
- - conceptofmind/FLAN_2022
61
- - KnutJaegersberg/longinstruct
62
- - bigcode/oasst-octopack
63
- - CohereForAI/xP3x
64
- - math-ai/StackMathQA
65
- - math-ai/TemplateGSM
66
- - bugdaryan/sql-create-context-instruction
67
- - glaiveai/glaive-function-calling-v2
68
- - glaiveai/glaive-code-assistant-v3
69
- - cognitivecomputations/dolphin-coder
70
- - glaiveai/glaive-code-assistant
71
- - TokenBender/code_instructions_122k_alpaca_style
72
- - TIGER-Lab/MathInstruct
73
- - meta-math/MetaMathQA
74
- - tiedong/goat
75
- - CohereForAI/xP3x
76
- - bigcode/commitpack
77
- - bigcode/commitpackft
78
- - HuggingFaceTB/cosmopedia
79
- - deepmind/code_contests
80
- - ise-uiuc/Magicoder-Evol-Instruct-110K
81
- - ise-uiuc/Magicoder-OSS-Instruct-75K
82
- - theblackcat102/evol-codealpaca-v1
83
- - ajibawa-2023/Code-290k-ShareGPT
84
- - Locutusque/UltraTextbooks-2.0
85
- - teknium/OpenHermes-2.5
86
- - stingning/ultrachat
87
- # - API Blend
88
- #
89
- # DATASET LINKS
90
- # NL
91
- # - nvidia/HelpSteer
92
- # - garage-bAInd/Open-Platypus
93
- # - mosaicml/dolly_hhrlhf
94
- # - mosaicml/instruct-v3
95
- # - conceptofmind/FLAN_2022
96
- # - KnutJaegersberg/longinstruct
97
- # - CohereForAI/xP3x
98
- # - HuggingFaceTB/cosmopedia
99
- # - open-web-math/open-web-math
100
- # - EleutherAI/proof-pile-2
101
- # - math-ai/StackMathQA
102
- # - math-ai/TemplateGSM
103
- # - IBM ConvAI 0111
104
- # - IBM Forca 30K
105
- # - IBM Hardcoded
106
- # Code
107
- # - bugdaryan/sql-create-context-instruction
108
- # - glaiveai/glaive-function-calling-v2
109
- # - cognitivecomputations/dolphin-coder
110
- # - glaiveai/glaive-code-
111
- # - bigcode/commitpackft
112
- # - TIGER-Lab/MathInstruct
113
- # - meta-math/MetaMathQA
114
- # - tiedong/goat
115
- # - CohereForAI/xP3x
116
  metrics:
117
  - code_eval
118
  library_name: transformers
 
12
  - bigcode/starcoderdata
13
  # - Stackexchange
14
  # - CommonCrawl
15
+ - open-web-math/open-web-math
16
+ - math-ai/StackMathQA
17
  # - Arxiv
18
  # - Wikipedia
19
  # - conceptofmind/FLAN_2022 # Original link is broken, we used IBM's filtered version | Phase 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  - nvidia/HelpSteer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  metrics:
22
  - code_eval
23
  library_name: transformers