Siddharth63 committed on
Commit 2fa335f · 1 Parent(s): 7674270

Update ul2_tasks.py

Files changed (1)
  1. ul2_tasks.py +24 -12
ul2_tasks.py CHANGED
@@ -69,34 +69,46 @@ dataset_shapes = {"train": dataset["train"].num_rows,
 TaskRegistry.add(
     "pretrain_biological_ul2",
     source=seqio.FunctionDataSource(
-        dataset_fn=functools.partial(dataset_fn, dataset=dataset),
+        dataset_fn=functools.partial(
+            dataset_fn, path="Siddharth63/biological_dataset",
+        ),
         splits=("train", "validation"),
         caching_permitted=False,
-        num_input_examples=dataset_shapes,
     ),
     preprocessors=[
         functools.partial(
-            target_to_key, key_map={
-                "inputs": None,
-                "targets": None,
-            }, target_key="targets"),
+            target_to_key,
+            key_map={
+                "inputs": "text",
+                "targets": "text",
+            },
+            target_key="targets",
+        ),
         seqio.preprocessors.tokenize,
         functools.partial(
             ul2_objective,
             shard_ds=False,
             use_prefix_lm_task=True,  # use S-denoising
-            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS) + [
-                0.4 / len(X_DENOISER_SPAN_LENGTHS)]*len(X_DENOISER_SPAN_LENGTHS) + [0.2],  # 40% total rate each for the R- and X-denoisers + 20% for S-denoising (suggested in chapter 4.5 of the paper)
+            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)] * len(R_DENOISER_SPAN_LENGTHS)
+            + [0.4 / len(X_DENOISER_SPAN_LENGTHS)] * len(X_DENOISER_SPAN_LENGTHS)
+            + [
+                0.2
+            ],  # 40% total rate each for the R- and X-denoisers + 20% for S-denoising (suggested in chapter 4.5 of the paper)
             mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
             noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
-            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]*len(R_DENOISER_SPAN_LENGTHS) + [
-                X_DENOISER_TOKEN_PREFIX]*len(X_DENOISER_SPAN_LENGTHS) + [S_DENOISER_TOKEN_PREFIX],
+            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]
+            * len(R_DENOISER_SPAN_LENGTHS)
+            + [X_DENOISER_TOKEN_PREFIX] * len(X_DENOISER_SPAN_LENGTHS)
+            + [S_DENOISER_TOKEN_PREFIX],
             reserved_for_packing=1,  # make room for task prefix token
         ),
         seqio.preprocessors.append_eos_after_trim,
     ],
-    output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
-    metric_fns=[metrics.accuracy]
+    output_features={
+        "targets": DEFAULT_OUTPUT_FEATURES["targets"],
+        "inputs": seqio.Feature(vocabulary=vocabulary, add_eos=True),
+    },
+    metric_fns=[metrics.accuracy],
 )
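For readers following the rates and optional_task_prefixes arguments in the hunk above: the mixture gives 40% of the sampling budget to the R-denoiser configurations, 40% to the X-denoiser configurations, and 20% to S-denoising, and the prefix list must stay aligned with the rate list. A minimal sanity-check sketch, using illustrative placeholder values for the denoiser constants (the real constants are defined earlier in ul2_tasks.py and may differ):

# Placeholder values for illustration only; ul2_tasks.py defines its own constants.
R_DENOISER_SPAN_LENGTHS = [3.0, 8.0]
X_DENOISER_SPAN_LENGTHS = [3.0, 8.0, 64.0, 64.0]
R_DENOISER_TOKEN_PREFIX = "[NLU]"
X_DENOISER_TOKEN_PREFIX = "[NLG]"
S_DENOISER_TOKEN_PREFIX = "[S2S]"

rates = (
    [0.4 / len(R_DENOISER_SPAN_LENGTHS)] * len(R_DENOISER_SPAN_LENGTHS)
    + [0.4 / len(X_DENOISER_SPAN_LENGTHS)] * len(X_DENOISER_SPAN_LENGTHS)
    + [0.2]  # S-denoising (prefix-LM) share
)
optional_task_prefixes = (
    [R_DENOISER_TOKEN_PREFIX] * len(R_DENOISER_SPAN_LENGTHS)
    + [X_DENOISER_TOKEN_PREFIX] * len(X_DENOISER_SPAN_LENGTHS)
    + [S_DENOISER_TOKEN_PREFIX]
)
mean_noise_span_lengths = R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS

# One rate per task prefix; the S-denoising entry has no span-length/density pair,
# so rates and prefixes are one element longer than mean_noise_span_lengths.
assert len(rates) == len(optional_task_prefixes) == len(mean_noise_span_lengths) + 1
# 40% R + 40% X + 20% S should cover the whole sampling budget.
assert abs(sum(rates) - 1.0) < 1e-9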
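The source change is the substance of the commit: instead of receiving a pre-loaded dataset object (with num_input_examples=dataset_shapes), dataset_fn is now given the Hub path "Siddharth63/biological_dataset", and the key_map points both "inputs" and "targets" at the dataset's "text" column. A hypothetical sketch of a dataset_fn compatible with that call, assuming it streams the dataset from the Hugging Face Hub and wraps it as a tf.data.Dataset; the actual helper in ul2_tasks.py may differ:

import datasets
import tensorflow as tf

def dataset_fn(split, shuffle_files=False, seed=None, path=None):
    """Hypothetical helper: stream an HF dataset and yield {"text": ...} examples."""
    ds = datasets.load_dataset(path, split=split, streaming=True)
    if shuffle_files:
        ds = ds.shuffle(seed=seed, buffer_size=10_000)
    return tf.data.Dataset.from_generator(
        lambda: ({"text": example["text"]} for example in ds),
        output_signature={"text": tf.TensorSpec(shape=(), dtype=tf.string)},
    )

With a streaming source along these lines, dropping num_input_examples simply means seqio no longer reports fixed row counts for the splits, which is consistent with caching_permitted=False.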