# CoSTA/ST/inference/codes/wilcoxon.py
# Uploaded by bhavanishankarpullela ("Upload 9 files", commit 2dc7757, verified)
# File size: 1.69 kB
import csv
import string
from scipy.stats import wilcoxon
import numpy as np
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def process_sentence(sentence):
    """Normalize a sentence for exact-match comparison.

    Only the first line of ``sentence`` is kept. It is lower-cased, every
    character in ``string.punctuation`` is removed, a single trailing
    danda ('।') is dropped, and surrounding whitespace is stripped.

    Args:
        sentence: The raw text. Any non-``str`` value (e.g. ``None``) is
            treated as an empty sentence.

    Returns:
        The normalized sentence, or ``""`` when ``sentence`` is not a string.
    """
    if not isinstance(sentence, str):
        return ""

    first_line = sentence.split('\n')[0]
    normalized = first_line.lower().translate(_PUNCTUATION_TABLE).strip()

    if normalized.endswith('।'):
        # Strip again after dropping the danda: for text written as
        # "... ।" the space that preceded the danda would otherwise survive
        # and break the exact-match comparison.
        normalized = normalized[:-1].rstrip()
    return normalized
def _exact_match_scores(path):
    """Return one 0/1 exact-match score per row of the CSV file at ``path``.

    A row scores 1 when its ``pred_label`` and ``label`` columns are equal
    after ``process_sentence`` normalization, and 0 otherwise.
    """
    # The encoding is given explicitly so reading non-ASCII text does not
    # depend on the platform default; newline='' is what the csv module
    # documentation asks for when opening files for a reader.
    with open(path, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return [
            1 if process_sentence(row['pred_label']) == process_sentence(row['label']) else 0
            for row in reader
        ]


# Per-example exact match scores for Prompt A and Prompt B
scores_a = _exact_match_scores('MT0_xxl_results/result_vi')
scores_b = _exact_match_scores('MT0_xxl_results/result_vi_80p')

# Count the number of 1s in each list
count_a = scores_a.count(1)
count_b = scores_b.count(1)

# Print the counts
print(f"Number of exact matches for Prompt A: {count_a}")
print(f"Number of exact matches for Prompt B: {count_b}")

# The test is paired: score i of Prompt A is compared with score i of
# Prompt B, so both files must contain the same number of rows.
if len(scores_a) != len(scores_b):
    raise ValueError(
        "Cannot run a paired test: Prompt A has "
        f"{len(scores_a)} scores but Prompt B has {len(scores_b)}."
    )

if scores_a == scores_b:
    # Every paired difference is zero (or both files are empty). The Wilcoxon
    # test is undefined in that case; depending on the SciPy version it
    # raises or yields NaN, so it is skipped here. NaN compares False against
    # 0.05 below, so this is reported as "not statistically significant".
    w_stat = p_val = float('nan')
    print("Per-example scores are identical for both prompts; "
          "the Wilcoxon Signed Rank test is undefined.")
else:
    # Conduct Wilcoxon Signed Rank test
    w_stat, p_val = wilcoxon(scores_a, scores_b)

    # Print the results
    print(f"Wilcoxon Signed Rank statistic: {w_stat}")
    print(f"P-value: {p_val}")

if p_val < 0.05:
    print("The difference in score distributions between the prompts is statistically significant.")
else:
    print("The difference in score distributions between the prompts is not statistically significant.")