JonasGeiping committed on
Commit
795c6fd
·
verified ·
1 Parent(s): 823822c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +81 -81
README.md CHANGED
@@ -7,87 +7,87 @@ license: apache-2.0
7
  language:
8
  - en
9
  pipeline_tag: text-generation
10
- datasets:
11
- - HuggingFaceTB/smollm-corpus
12
- - jon-tow/starcoderdata-python-edu
13
- - ubaada/booksum-complete-cleaned
14
- - euirim/goodwiki
15
- - togethercomputer/RedPajama-Data-1T
16
- - allenai/dolma
17
- - bigcode/the-stack-v2-train-smol-ids
18
- - bigcode/starcoderdata
19
- - m-a-p/Matrix
20
- - cerebras/SlimPajama-627B
21
- - open-phi/textbooks
22
- - open-phi/textbooks_grounded
23
- - open-phi/programming_books_llama
24
- - nampdn-ai/tiny-strange-textbooks
25
- - nampdn-ai/tiny-textbooks
26
- - nampdn-ai/tiny-code-textbooks
27
- - nampdn-ai/tiny-orca-textbooks
28
- - SciPhi/textbooks-are-all-you-need-lite
29
- - vikp/textbook_quality_programming
30
- - EleutherAI/proof-pile-2
31
- - open-web-math/open-web-math
32
- - biglam/blbooks-parquet
33
- - storytracer/LoC-PD-Books
34
- - GAIR/MathPile
35
- - tomg-group-umd/CLRS-Text-train
36
- - math-ai/AutoMathText
37
- - bigcode/commitpackft
38
- - bigcode/stack-dedup-python-fns
39
- - vikp/python_code_instructions_filtered
40
- - mlabonne/chessllm
41
- - Waterhorse/chess_data
42
- - EleutherAI/lichess-puzzles
43
- - chargoddard/WebInstructSub-prometheus
44
- - Locutusque/hercules-v5.0
45
- - nvidia/OpenMathInstruct-1
46
- - meta-math/MetaMathQA
47
- - m-a-p/CodeFeedback-Filtered-Instruction
48
- - nvidia/Daring-Anteater
49
- - nvidia/sft_datablend_v1
50
- - BAAI/Infinity-Instruct
51
- - anthracite-org/Stheno-Data-Filtered
52
- - Nopm/Opus_WritingStruct
53
- - xinlai/Math-Step-DPO-10K
54
- - bigcode/self-oss-instruct-sc2-exec-filter-50k
55
- - HuggingFaceTB/everyday-conversations
56
- - hkust-nlp/gsm8k-fix
57
- - HuggingFaceH4/no_robots
58
- - THUDM/LongWriter-6k
59
- - THUDM/webglm-qa
60
- - AlgorithmicResearchGroup/ArXivDLInstruct
61
- - allenai/tulu-v2-sft-mixture-olmo-4096
62
- - bigscience/P3
63
- - Gryphe/Sonnet3.5-SlimOrcaDedupCleaned
64
- - Gryphe/Opus-WritingPrompts
65
- - nothingiisreal/Reddit-Dirty-And-WritingPrompts
66
- - nothingiisreal/Kalomaze-Opus-Instruct-25k-filtered
67
- - internlm/Lean-Github
68
- - pkuAI4M/LeanWorkbook
69
- - casey-martin/multilingual-mathematical-autoformalization
70
- - AI4M/leandojo-informalized
71
- - casey-martin/oa_cpp_annotate_gen
72
- - l3lab/ntp-mathlib-instruct-st
73
- - ajibawa-2023/Maths-College
74
- - ajibawa-2023/Maths-Grade-School
75
- - ajibawa-2023/General-Stories-Collection
76
- - XinyaoHu/AMPS_mathematica
77
- - XinyaoHu/AMPS_khan
78
- - Magpie-Align/Magpie-Pro-MT-300K-v0.1
79
- - Magpie-Align/Magpie-Reasoning-150K
80
- - gair-prox/FineWeb-pro
81
- - gair-prox/c4-pro
82
- - gair-prox/RedPajama-pro
83
- - gair-prox/open-web-math-pro
84
- - togethercomputer/Long-Data-Collections
85
- - emozilla/pg19
86
- - MathGenie/MathCode-Pile
87
- - KingNish/reasoning-base-20k
88
- - nvidia/OpenMathInstruct-2
89
- - LLM360/TxT360
90
- - neuralwork/arxiver
91
  ---
92
 
93
  # Huginn-0125
 
7
  language:
8
  - en
9
  pipeline_tag: text-generation
10
+ # datasets: # cannot order these nicely
11
+ # - HuggingFaceTB/smollm-corpus
12
+ # - jon-tow/starcoderdata-python-edu
13
+ # - ubaada/booksum-complete-cleaned
14
+ # - euirim/goodwiki
15
+ # - togethercomputer/RedPajama-Data-1T
16
+ # - allenai/dolma
17
+ # - bigcode/the-stack-v2-train-smol-ids
18
+ # - bigcode/starcoderdata
19
+ # - m-a-p/Matrix
20
+ # - cerebras/SlimPajama-627B
21
+ # - open-phi/textbooks
22
+ # - open-phi/textbooks_grounded
23
+ # - open-phi/programming_books_llama
24
+ # - nampdn-ai/tiny-strange-textbooks
25
+ # - nampdn-ai/tiny-textbooks
26
+ # - nampdn-ai/tiny-code-textbooks
27
+ # - nampdn-ai/tiny-orca-textbooks
28
+ # - SciPhi/textbooks-are-all-you-need-lite
29
+ # - vikp/textbook_quality_programming
30
+ # - EleutherAI/proof-pile-2
31
+ # - open-web-math/open-web-math
32
+ # - biglam/blbooks-parquet
33
+ # - storytracer/LoC-PD-Books
34
+ # - GAIR/MathPile
35
+ # - tomg-group-umd/CLRS-Text-train
36
+ # - math-ai/AutoMathText
37
+ # - bigcode/commitpackft
38
+ # - bigcode/stack-dedup-python-fns
39
+ # - vikp/python_code_instructions_filtered
40
+ # - mlabonne/chessllm
41
+ # - Waterhorse/chess_data
42
+ # - EleutherAI/lichess-puzzles
43
+ # - chargoddard/WebInstructSub-prometheus
44
+ # - Locutusque/hercules-v5.0
45
+ # - nvidia/OpenMathInstruct-1
46
+ # - meta-math/MetaMathQA
47
+ # - m-a-p/CodeFeedback-Filtered-Instruction
48
+ # - nvidia/Daring-Anteater
49
+ # - nvidia/sft_datablend_v1
50
+ # - BAAI/Infinity-Instruct
51
+ # - anthracite-org/Stheno-Data-Filtered
52
+ # - Nopm/Opus_WritingStruct
53
+ # - xinlai/Math-Step-DPO-10K
54
+ # - bigcode/self-oss-instruct-sc2-exec-filter-50k
55
+ # - HuggingFaceTB/everyday-conversations
56
+ # - hkust-nlp/gsm8k-fix
57
+ # - HuggingFaceH4/no_robots
58
+ # - THUDM/LongWriter-6k
59
+ # - THUDM/webglm-qa
60
+ # - AlgorithmicResearchGroup/ArXivDLInstruct
61
+ # - allenai/tulu-v2-sft-mixture-olmo-4096
62
+ # - bigscience/P3
63
+ # - Gryphe/Sonnet3.5-SlimOrcaDedupCleaned
64
+ # - Gryphe/Opus-WritingPrompts
65
+ # - nothingiisreal/Reddit-Dirty-And-WritingPrompts
66
+ # - nothingiisreal/Kalomaze-Opus-Instruct-25k-filtered
67
+ # - internlm/Lean-Github
68
+ # - pkuAI4M/LeanWorkbook
69
+ # - casey-martin/multilingual-mathematical-autoformalization
70
+ # - AI4M/leandojo-informalized
71
+ # - casey-martin/oa_cpp_annotate_gen
72
+ # - l3lab/ntp-mathlib-instruct-st
73
+ # - ajibawa-2023/Maths-College
74
+ # - ajibawa-2023/Maths-Grade-School
75
+ # - ajibawa-2023/General-Stories-Collection
76
+ # - XinyaoHu/AMPS_mathematica
77
+ # - XinyaoHu/AMPS_khan
78
+ # - Magpie-Align/Magpie-Pro-MT-300K-v0.1
79
+ # - Magpie-Align/Magpie-Reasoning-150K
80
+ # - gair-prox/FineWeb-pro
81
+ # - gair-prox/c4-pro
82
+ # - gair-prox/RedPajama-pro
83
+ # - gair-prox/open-web-math-pro
84
+ # - togethercomputer/Long-Data-Collections
85
+ # - emozilla/pg19
86
+ # - MathGenie/MathCode-Pile
87
+ # - KingNish/reasoning-base-20k
88
+ # - nvidia/OpenMathInstruct-2
89
+ # - LLM360/TxT360
90
+ # - neuralwork/arxiver
91
  ---
92
 
93
  # Huginn-0125