"""Standardize per-metric ROC curve CSVs and merge them with AUC summaries.

For every CSV in ``output_original/`` the script:

1. renames the three input columns to ``tpr``, ``new_tweets``, ``threshold``;
2. derives two false-positive-rate columns (``fpr`` normalized by the file's
   own maximum ``new_tweets``, ``fpr2`` normalized by ``theoretical``);
3. keeps, for each distinct ``tpr``, the row with the smallest ``new_tweets``
   and writes that table to ``output_standardized/<same file name>``;
4. attaches the summary columns ``metric``, ``roc1``, ``roc2``, ``roc95``,
   ``fpr3`` and ``harmonic``.

All per-file tables are finally concatenated into ``merged_outputs.csv``.
"""
import os
from glob import glob
from statistics import harmonic_mean

import pandas as pd
from sklearn import metrics

# Denominator used for the absolute false-positive rate ``fpr2``.
# NOTE(review): presumably the theoretical maximum of ``new_tweets`` -- confirm.
theoretical = 1357228

# Upper bound on ``fpr2`` for the partial AUC (``roc95``) and the scale of
# ``fpr3``; ``fpr2`` values are multiplied by ``1 / FPR_CAP`` so the cap maps to 1.
FPR_CAP = 0.016


def _partial_auc(curve, fpr_cap=FPR_CAP):
    """Return the area under ``curve`` restricted to ``fpr2 <= fpr_cap``.

    The retained ``fpr2`` values are rescaled by ``1 / fpr_cap``.  When the
    largest ``tpr`` inside the window is below 1, one extra point is appended
    at x = 1 whose height is that ``tpr`` multiplied by
    ``1 / max(rescaled fpr2)``; otherwise the window is integrated as is.

    Args:
        curve: DataFrame with ``tpr`` and ``fpr2`` columns, ordered so that
            ``fpr2`` is monotonic (``metrics.auc`` requires monotonic x).
        fpr_cap: Largest ``fpr2`` value kept in the window.

    Returns:
        The value of ``sklearn.metrics.auc`` over the (possibly extended)
        window.

    Raises:
        ValueError: Propagated from ``metrics.auc`` when fewer than two points
            remain or x is not monotonic.
    """
    window = curve[curve['fpr2'] <= fpr_cap]
    # Bind both axes up front so they exist whichever branch is taken below
    # (previously they were only assigned inside the ``tpr < 1`` branch).
    fpr = window['fpr2'] * (1 / fpr_cap)
    tpr = window['tpr']

    tpr_max = tpr.max()
    if tpr_max < 1:
        tpr_at_cap = tpr_max * (1 / fpr.max())
        # Build new Series instead of enlarging slices of the frame in place.
        fpr = pd.concat([fpr, pd.Series([1.0])], ignore_index=True)
        tpr = pd.concat([tpr, pd.Series([tpr_at_cap])], ignore_index=True)

    return metrics.auc(fpr, tpr)


# Sorted so the row order of merged_outputs.csv does not depend on the
# arbitrary order in which glob lists the directory.
files = sorted(glob('output_original/*.csv'))
if not files:
    raise FileNotFoundError("no input files match 'output_original/*.csv'")

os.makedirs('output_standardized', exist_ok=True)

dfs = []
for file in files:
    # basename/splitext are separator-agnostic, unlike splitting on '/'.
    filename = os.path.basename(file)

    df = pd.read_csv(file)
    # Input files must have exactly three columns, in this order.
    df.columns = ['tpr', 'new_tweets', 'threshold']
    df['fpr'] = df['new_tweets'] / df['new_tweets'].max()
    df['fpr2'] = df['new_tweets'] / theoretical

    # One row per tpr: the one with the fewest new_tweets.
    df = df.sort_values(by=['tpr', 'new_tweets'])
    df = df.drop_duplicates(subset=['tpr'], keep='first')

    # The standardized file is written before the summary columns are added.
    df.to_csv(os.path.join('output_standardized', filename), index=False)

    df['metric'] = os.path.splitext(filename)[0]

    roc1 = metrics.auc(df['fpr'], df['tpr'])
    roc2 = metrics.auc(df['fpr2'], df['tpr'])
    roc95 = _partial_auc(df)

    df['roc1'] = roc1
    df['roc2'] = roc2
    df['roc95'] = roc95
    df['fpr3'] = df['fpr2'] * (1 / FPR_CAP)
    df['harmonic'] = harmonic_mean([roc95, roc1])
    dfs.append(df)

df = pd.concat(dfs)
df.to_csv('merged_outputs.csv', index=False)