Siddharth63
committed on
Commit
·
2fa335f
1
Parent(s):
7674270
Update ul2_tasks.py
Browse files
- ul2_tasks.py +24 -12
ul2_tasks.py
CHANGED
@@ -69,34 +69,46 @@ dataset_shapes = {"train": dataset["train"].num_rows,
|
|
69 |
TaskRegistry.add(
|
70 |
"pretrain_biological_ul2",
|
71 |
source=seqio.FunctionDataSource(
|
72 |
-
dataset_fn=functools.partial(
|
|
|
|
|
73 |
splits=("train", "validation"),
|
74 |
caching_permitted=False,
|
75 |
-
num_input_examples=dataset_shapes,
|
76 |
),
|
77 |
preprocessors=[
|
78 |
functools.partial(
|
79 |
-
target_to_key,
|
80 |
-
|
81 |
-
"
|
82 |
-
|
|
|
|
|
|
|
83 |
seqio.preprocessors.tokenize,
|
84 |
functools.partial(
|
85 |
ul2_objective,
|
86 |
shard_ds=False,
|
87 |
use_prefix_lm_task=True, # use S-denoising
|
88 |
-
rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS)
|
89 |
-
|
|
|
|
|
|
|
90 |
mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
|
91 |
noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
|
92 |
-
optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]
|
93 |
-
|
|
|
|
|
94 |
reserved_for_packing=1, # make room for task prefix token
|
95 |
),
|
96 |
seqio.preprocessors.append_eos_after_trim,
|
97 |
],
|
98 |
-
output_features={
|
99 |
-
|
|
|
|
|
|
|
100 |
)
|
101 |
|
102 |
|
|
|
69 |
TaskRegistry.add(
|
70 |
"pretrain_biological_ul2",
|
71 |
source=seqio.FunctionDataSource(
|
72 |
+
dataset_fn=functools.partial(
|
73 |
+
dataset_fn, path="Siddharth63/biological_dataset",
|
74 |
+
),
|
75 |
splits=("train", "validation"),
|
76 |
caching_permitted=False,
|
|
|
77 |
),
|
78 |
preprocessors=[
|
79 |
functools.partial(
|
80 |
+
target_to_key,
|
81 |
+
key_map={
|
82 |
+
"inputs": "text",
|
83 |
+
"targets": "text",
|
84 |
+
},
|
85 |
+
target_key="targets",
|
86 |
+
),
|
87 |
seqio.preprocessors.tokenize,
|
88 |
functools.partial(
|
89 |
ul2_objective,
|
90 |
shard_ds=False,
|
91 |
use_prefix_lm_task=True, # use S-denoising
|
92 |
+
rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)] * len(R_DENOISER_SPAN_LENGTHS)
|
93 |
+
+ [0.4 / len(X_DENOISER_SPAN_LENGTHS)] * len(X_DENOISER_SPAN_LENGTHS)
|
94 |
+
+ [
|
95 |
+
0.2
|
96 |
+
], # equal total 40% rate for both R- and X-denoisers + 20% for S-denoising (suggested at the paper chapter 4.5)
|
97 |
mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
|
98 |
noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
|
99 |
+
optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]
|
100 |
+
* len(R_DENOISER_SPAN_LENGTHS)
|
101 |
+
+ [X_DENOISER_TOKEN_PREFIX] * len(X_DENOISER_SPAN_LENGTHS)
|
102 |
+
+ [S_DENOISER_TOKEN_PREFIX],
|
103 |
reserved_for_packing=1, # make room for task prefix token
|
104 |
),
|
105 |
seqio.preprocessors.append_eos_after_trim,
|
106 |
],
|
107 |
+
output_features={
|
108 |
+
"targets": DEFAULT_OUTPUT_FEATURES["targets"],
|
109 |
+
"inputs": seqio.Feature(vocabulary=vocabulary, add_eos=True),
|
110 |
+
},
|
111 |
+
metric_fns=[metrics.accuracy],
|
112 |
)
|
113 |
|
114 |
|