"""Compare sentence-level BLEU scores of two prediction sets.

The script reads two CSV files of predictions (column ``pred_label``) and
one CSV file of references (column ``ref``), normalizes every sentence,
computes one BLEU score per sentence for each prediction set, and then
picks a two-sample significance test based on Shapiro-Wilk normality and
Levene equal-variance checks.
"""
import string

import pandas as pd
# from evaluate import load
from scipy import stats
from nltk.translate.bleu_score import sentence_bleu

# Threshold used both for the assumption checks and for the final verdict.
SIGNIFICANCE_LEVEL = 0.05

# Devanagari danda (sentence-final mark). The previous literal was the
# three-character mojibake of this code point (its UTF-8 bytes decoded with
# a Cyrillic code page), which could never equal a single character.
DANDA = "\u0964"

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Load data from the CSV files
df1 = pd.read_csv('MT0_xxl_ape/result_mr')
df2 = pd.read_csv('MT0_xxl_ape/result_mr_50p')
df_reference = pd.read_csv('MT0_xxl_ape/result_mr')

# bleu = load("sacrebleu")

sentences1 = df1['pred_label']
sentences2 = df2['pred_label']
reference_sentences = df_reference['ref']


def process_sentence(sentence):
    """Normalize one sentence before scoring.

    Keeps only the first line, trims surrounding whitespace, lowercases,
    removes ASCII punctuation and strips a single trailing danda.

    Args:
        sentence: The raw cell value. Anything that is not a ``str``
            (for example a missing value) is treated as empty.

    Returns:
        The normalized sentence, or ``""`` for non-string or blank input.
    """
    if not isinstance(sentence, str):
        return ""

    # Only the first line of a multi-line value is kept.
    sentence = sentence.split('\n')[0].strip().lower()
    # Single pass instead of one ``replace`` per punctuation character.
    sentence = sentence.translate(_PUNCTUATION_TABLE).strip()

    # ``endswith`` is safe on the empty string, so no separate guard is
    # needed. Whitespace that preceded the danda is dropped as well.
    if sentence.endswith(DANDA):
        sentence = sentence[:-1].rstrip()
    return sentence


# Calculate BLEU scores
def calculate_bleu(sentences, reference):
    """Return one sentence-level BLEU score per hypothesis.

    Hypotheses and references are paired by position and split on
    whitespace, because ``sentence_bleu`` expects token lists; passing raw
    strings would make it score character n-grams.

    Args:
        sentences: Sized sequence of normalized hypothesis strings.
        reference: Sized sequence of normalized reference strings; extra
            trailing references are ignored.

    Returns:
        A list of BLEU scores, one per entry in ``sentences``.

    Raises:
        IndexError: If there are more hypotheses than references.
    """
    if len(sentences) > len(reference):
        raise IndexError("more hypotheses than references")
    return [
        sentence_bleu([ref.split()], hyp.split())
        for hyp, ref in zip(sentences, reference)
    ]


sentences1 = [process_sentence(s) for s in sentences1]
sentences2 = [process_sentence(s) for s in sentences2]
reference_sentences = [process_sentence(s) for s in reference_sentences]

bleu_scores1 = calculate_bleu(sentences1, reference_sentences)
bleu_scores2 = calculate_bleu(sentences2, reference_sentences)


# Check for normality
def check_normality(data):
    """Return True when the Shapiro-Wilk p-value exceeds the threshold."""
    _, p = stats.shapiro(data)
    return bool(p > SIGNIFICANCE_LEVEL)


is_normal1 = check_normality(bleu_scores1)
is_normal2 = check_normality(bleu_scores2)


# Check for equal variances
def check_variance(data1, data2):
    """Return True when the Levene p-value exceeds the threshold."""
    _, p = stats.levene(data1, data2)
    return bool(p > SIGNIFICANCE_LEVEL)


is_equal_var = check_variance(bleu_scores1, bleu_scores2)


# Decide and perform the significance test
def perform_significance_test():
    """Run the test selected by the module-level assumption checks.

    Reads ``is_normal1``, ``is_normal2``, ``is_equal_var``,
    ``bleu_scores1`` and ``bleu_scores2`` from module scope.

    Returns:
        A ``(test_name, p_value)`` tuple. The test is Mann-Whitney U unless
        both samples passed the normality check, in which case it is a
        t-test (equal variances) or Welch's t-test (unequal variances).
    """
    # NOTE(review): both prediction sets appear to be scored against the
    # same references, so the samples look paired; a paired test
    # (stats.ttest_rel / stats.wilcoxon) may be more appropriate - confirm.
    if not (is_normal1 and is_normal2):
        _, p = stats.mannwhitneyu(bleu_scores1, bleu_scores2)
        return "Mann-Whitney U test", p

    if is_equal_var:
        _, p = stats.ttest_ind(bleu_scores1, bleu_scores2)
        return "T-test", p

    _, p = stats.ttest_ind(bleu_scores1, bleu_scores2, equal_var=False)
    return "Welch's T-test", p


test_name, p_value = perform_significance_test()

# Output results
print(f"Test used: {test_name}")
print(f"P-value: {p_value}")
if p_value < SIGNIFICANCE_LEVEL:
    print("The difference in BLEU scores is statistically significant.")
else:
    print("The difference in BLEU scores is not statistically significant.")