How the evals were run (FLEURS)

#1
by RASMUS - opened

I cannot reproduce the FLEURS results with a simple Hugging Face transformers speech-recognition pipeline.
How were the evals run?

def log_results(result: Dataset):
    """Compute the word and character error rates of a dataset and print them.

    Args:
        result: Dataset whose "target" column is passed to the metrics as the
            references and whose "prediction" column is passed as the
            predictions.
    """
    # load metric
    wer = evaluate.load("wer")
    cer = evaluate.load("cer")

    # Read each column once so both metrics score exactly the same data.
    references = result["target"]
    predictions = result["prediction"]

    # compute metrics
    wer_result = wer.compute(references=references, predictions=predictions)
    cer_result = cer.compute(references=references, predictions=predictions)

    # print & log results
    result_str = f"WER: {wer_result}\nCER: {cer_result}"
    print(result_str)

model_name = "GetmanY1/wav2vec2-base-fi-150k-finetuned"
# NOTE(review): `model`, `config` and `processor` are not defined in this
# snippet; presumably they were loaded from `model_name` beforehand. Confirm.
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    config=config,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)

# Test split of the "fi_fi" configuration of FLEURS.
dataset = load_dataset("google/fleurs", "fi_fi", split="test")

def pred_row(row):
    """Transcribe one dataset row with the `asr` pipeline.

    Args:
        row: Dataset example; its ``row["audio"]["array"]`` value is passed
            to ``asr``.

    Returns:
        The same row, with the "text" field of the pipeline output stored
        under the "prediction" key.
    """
    row["prediction"] = asr(row["audio"]["array"])["text"]
    return row

# Add a prediction to every row (one row per call, a single process), then
# rename the reference column to the name log_results() reads.
dataset = dataset.map(pred_row, batched=False, num_proc=1).rename_column(
    "transcription", "target"
)

log_results(dataset)

Original approach (pipeline):

# Print WER and CER for the predictions stored in `dataset`.
log_results(dataset)

WER: 0.13945812130380966
CER: 0.062055555109157674

import torch

# Use the GPU when torch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def pred_row(row):
    """Transcribe one dataset row by running the model directly.

    The audio array is turned into model inputs by `processor`, passed
    through `model` without gradient tracking, and the argmax over the last
    logits dimension is decoded back to text.

    Args:
        row: Dataset example; its ``row["audio"]["array"]`` value is passed
            to ``processor``.

    Returns:
        The same row, with the first decoded transcription stored under the
        "prediction" key.
    """
    # Build the input tensor and move it to the selected device.
    # NOTE(review): `model` must be on the same device; that is not shown
    # in this snippet, so confirm it at the call site.
    input_values = processor(
        row["audio"]["array"],
        return_tensors="pt",
        padding="longest",
    ).input_values.to(device)

    # Retrieve logits
    with torch.no_grad():
        logits = model(input_values).logits

    # Take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    # Only one audio array went in, so the first decoded string is used.
    row["prediction"] = transcription[0]
    return row

# Apply the function to every row

# Add a prediction to every row (one row per call, a single process), then
# rename the reference column to "target".
dataset = dataset.map(pred_row, batched=False, num_proc=1).rename_column(
    "transcription", "target"
)

I copied the method from the repo, but the result still does not match the reported score (9.96):
WER: 0.10816944024205749
CER: 0.05846383775401155

Sign up or log in to comment