import pickle import gradio as gr import numpy as np import pandas as pd import plotly.express as px ###################################### # 1) LOAD DATA & MODELS ###################################### df = pd.read_csv("X_train_test_combined_dataset_Filtered_dataset.csv") model_filenames = [ "YOWRCONC.pkl", "YOSEEDOC.pkl", "YO_MDEA5.pkl", "YOWRLSIN.pkl", "YODPPROB.pkl", "YOWRPROB.pkl", "YODPR2WK.pkl", "YOWRDEPR.pkl", "YODPDISC.pkl", "YOLOSEV.pkl", "YOWRDCSN.pkl", "YODSMMDE.pkl", "YO_MDEA3.pkl", "YODPLSIN.pkl", "YOWRELES.pkl", "YOPB2WK.pkl" ] model_path = "models/" ###################################### # 2) MODEL PREDICTOR ###################################### class ModelPredictor: def __init__(self, model_path, model_filenames): self.model_path = model_path self.model_filenames = model_filenames self.models = self.load_models() self.prediction_map = { "YOWRCONC": ["Did not have difficulty concentrating", "Had difficulty concentrating"], "YOSEEDOC": ["Did not feel the need to see a doctor", "Felt the need to see a doctor"], "YOWRHRS": ["Did not have trouble sleeping", "Had trouble sleeping"], "YO_MDEA5": ["Others did not notice restlessness/lethargy", "Others noticed restlessness/lethargy"], "YOWRCHR": ["Did not feel so sad nothing could cheer up", "Felt so sad that nothing could cheer up"], "YOWRLSIN": ["Did not feel bored / lose interest", "Felt bored / lost interest"], "YODPPROB": ["No other problems for 2+ weeks", "Had other problems for 2+ weeks"], "YOWRPROB": ["No 'worst time ever' feeling", "Had 'worst time ever' feeling"], "YODPR2WK": ["No periods with depressed feelings lasting 2+ weeks", "Had depressed feelings 2+ weeks"], "YOWRDEPR": ["Did not feel sad/depressed mostly everyday", "Felt sad/depressed mostly everyday"], "YODPDISC": ["Overall mood not sad/depressed", "Overall mood was sad/depressed"], "YOLOSEV": ["Did not lose interest", "Lost interest in enjoyable things"], "YOWRDCSN": ["Was able to make decisions", "Was unable to make decisions"], "YODSMMDE": ["Never had 2 weeks depression symptoms", "Had 2+ weeks of depression symptoms"], "YO_MDEA3": ["No changes in appetite/weight", "Had changes in appetite/weight"], "YODPLSIN": ["Never lost interest / felt bored", "Lost interest/felt bored"], "YOWRELES": ["Did not eat less than usual", "Ate less than usual"], "YODSCEV": ["Fewer severe depression symptoms", "More severe depression symptoms"], "YOPB2WK": ["No uneasy feelings lasting 2+ weeks", "Uneasy feelings lasting 2+ weeks"], "YO_MDEA2": ["No physical/mental issues (2+ weeks)", "Had physical/mental issues (2+ weeks)"] } def load_models(self): loaded = [] for fname in self.model_filenames: with open(self.model_path + fname, "rb") as f: model = pickle.load(f) loaded.append(model) return loaded def make_predictions(self, user_input: pd.DataFrame): predictions = [] for model in self.models: out = model.predict(user_input) predictions.append(out.flatten()) return predictions def get_majority_vote(self, predictions): combined = np.concatenate(predictions) return np.bincount(combined).argmax() def evaluate_severity(self, count_ones: int) -> str: if count_ones >= 13: return "Mental Health Severity: Severe" elif count_ones >= 9: return "Mental Health Severity: Moderate" elif count_ones >= 5: return "Mental Health Severity: Low" else: return "Mental Health Severity: Very Low" predictor = ModelPredictor(model_path, model_filenames) ###################################### # 3) FEATURE CATEGORIES + MAPPING ###################################### categories_dict = { "1. Depression & Substance Use Diagnosis": [ "YMDESUD5ANYO", "YMDELT", "YMDEYR", "YMDERSUD5ANY", "YMSUD5YANY", "YMIUD5YANY", "YMIMS5YANY", "YMIMI5YANY" ], "2. Mental Health Treatment & Prof Consultation": [ "YMDEHPO", "YMDETXRX", "YMDEHARX", "YRXMDEYR", "YHLTMDE", "YTXMDEYR", "YDOCMDE", "YPSY2MDE", "YPSY1MDE", "YCOUNMDE" ], "3. Functional & Cognitive Impairment": [ "MDEIMPY", "LVLDIFMEM2" ], "4. Suicidal Thoughts & Behaviors": [ "YUSUITHK", "YUSUITHKYR", "YUSUIPLNYR", "YUSUIPLN" ] } # The numeric mappings for each of the 25 features input_mapping = { 'YMDESUD5ANYO': {"SUD only, no MDE": 1, "MDE only, no SUD": 2, "SUD and MDE": 3, "Neither SUD or MDE": 4}, 'YMDELT': {"Yes": 1, "No": 2}, 'YMDEYR': {"Yes": 1, "No": 2}, 'YMDERSUD5ANY': {"Yes": 1, "No": 0}, 'YMSUD5YANY': {"Yes": 1, "No": 0}, 'YMIUD5YANY': {"Yes": 1, "No": 0}, 'YMIMS5YANY': {"Yes": 1, "No": 0}, 'YMIMI5YANY': {"Yes": 1, "No": 0}, 'YMDEHPO': {"Yes": 1, "No": 0}, 'YMDETXRX': {"Yes": 1, "No": 0}, 'YMDEHARX': {"Yes": 1, "No": 0}, 'YRXMDEYR': {"Yes": 1, "No": 0}, 'YHLTMDE': {"Yes": 1, "No": 0}, 'YTXMDEYR': {"Yes": 1, "No": 0}, 'YDOCMDE': {"Yes": 1, "No": 0}, 'YPSY2MDE': {"Yes": 1, "No": 0}, 'YPSY1MDE': {"Yes": 1, "No": 0}, 'YCOUNMDE': {"Yes": 1, "No": 0}, 'MDEIMPY': {"Yes": 1, "No": 2}, 'LVLDIFMEM2': {"No Difficulty": 1, "Some difficulty": 2, "A lot of difficulty or cannot do at all": 3}, 'YUSUITHK': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4}, 'YUSUITHKYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4}, 'YUSUIPLNYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4}, 'YUSUIPLN': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4}, } def validate_inputs(*args): for arg in args: if not arg: # empty or None return False return True ###################################### # 4) NEAREST NEIGHBORS (Grouped) ###################################### def get_nearest_neighbors_info(user_input_df: pd.DataFrame, k=5): # Ensure columns exist in df user_cols = user_input_df.columns if not all(col in df.columns for col in user_cols): return "Cannot compute nearest neighbors. Some columns not found in df." # Subset df sub_df = df[list(user_cols)].copy() diffs = sub_df - user_input_df.iloc[0] dists = (diffs**2).sum(axis=1)**0.5 nn_indices = dists.nsmallest(k).index neighbors = df.loc[nn_indices] lines = [f"**Nearest Neighbors (k={k})**", f"Distances Range: {dists[nn_indices].min():.2f} to {dists[nn_indices].max():.2f}", ""] # Group the features by our categories_dict for cat_name, cat_feats in categories_dict.items(): lines.append(f"### {cat_name}") for feat in cat_feats: if feat not in neighbors.columns: continue # Count how many neighbors had each numeric value val_counts = neighbors[feat].value_counts().to_dict() # Build string like: "YMDESUD5ANYO => 3 had 1, 2 had 2..." parts = [] for val_, count_ in val_counts.items(): parts.append(f"{count_} had '{val_}'") joined = "; ".join(parts) lines.append(f"**{feat}** => {joined}") lines.append("") # blank line return "\n".join(lines) ###################################### # 5) PREDICT FUNCTION ###################################### def predict( # EXACTLY 25 features, matching categories_dict ordering. # We'll just list them in the dictionary order we want to show them: YMDESUD5ANYO, YMDELT, YMDEYR, YMDERSUD5ANY, YMSUD5YANY, YMIUD5YANY, YMIMS5YANY, YMIMI5YANY, YMDEHPO, YMDETXRX, YMDEHARX, YRXMDEYR, YHLTMDE, YTXMDEYR, YDOCMDE, YPSY2MDE, YPSY1MDE, YCOUNMDE, MDEIMPY, LVLDIFMEM2, YUSUITHK, YUSUITHKYR, YUSUIPLNYR, YUSUIPLN ): if not validate_inputs( YMDESUD5ANYO, YMDELT, YMDEYR, YMDERSUD5ANY, YMSUD5YANY, YMIUD5YANY, YMIMS5YANY, YMIMI5YANY, YMDEHPO, YMDETXRX, YMDEHARX, YRXMDEYR, YHLTMDE, YTXMDEYR, YDOCMDE, YPSY2MDE, YPSY1MDE, YCOUNMDE, MDEIMPY, LVLDIFMEM2, YUSUITHK, YUSUITHKYR, YUSUIPLNYR, YUSUIPLN ): return ( "Please select all required fields.", # 1) Prediction Results "Validation Error", # 2) Severity "No data", # 3) Total Count "No nearest neighbors info", # 4) NN Summary None, # 5) Bar chart (Input) None # 6) Bar chart (Labels) ) # 1) Map user-friendly -> numeric user_input_dict = { 'YMDESUD5ANYO': input_mapping['YMDESUD5ANYO'][YMDESUD5ANYO], 'YMDELT': input_mapping['YMDELT'][YMDELT], 'YMDEYR': input_mapping['YMDEYR'][YMDEYR], 'YMDERSUD5ANY': input_mapping['YMDERSUD5ANY'][YMDERSUD5ANY], 'YMSUD5YANY': input_mapping['YMSUD5YANY'][YMSUD5YANY], 'YMIUD5YANY': input_mapping['YMIUD5YANY'][YMIUD5YANY], 'YMIMS5YANY': input_mapping['YMIMS5YANY'][YMIMS5YANY], 'YMIMI5YANY': input_mapping['YMIMI5YANY'][YMIMI5YANY], 'YMDEHPO': input_mapping['YMDEHPO'][YMDEHPO], 'YMDETXRX': input_mapping['YMDETXRX'][YMDETXRX], 'YMDEHARX': input_mapping['YMDEHARX'][YMDEHARX], 'YRXMDEYR': input_mapping['YRXMDEYR'][YRXMDEYR], 'YHLTMDE': input_mapping['YHLTMDE'][YHLTMDE], 'YTXMDEYR': input_mapping['YTXMDEYR'][YTXMDEYR], 'YDOCMDE': input_mapping['YDOCMDE'][YDOCMDE], 'YPSY2MDE': input_mapping['YPSY2MDE'][YPSY2MDE], 'YPSY1MDE': input_mapping['YPSY1MDE'][YPSY1MDE], 'YCOUNMDE': input_mapping['YCOUNMDE'][YCOUNMDE], 'MDEIMPY': input_mapping['MDEIMPY'][MDEIMPY], 'LVLDIFMEM2': input_mapping['LVLDIFMEM2'][LVLDIFMEM2], 'YUSUITHK': input_mapping['YUSUITHK'][YUSUITHK], 'YUSUITHKYR': input_mapping['YUSUITHKYR'][YUSUITHKYR], 'YUSUIPLNYR': input_mapping['YUSUIPLNYR'][YUSUIPLNYR], 'YUSUIPLN': input_mapping['YUSUIPLN'][YUSUIPLN] } user_df = pd.DataFrame(user_input_dict, index=[0]) # 2) Predict predictions = predictor.make_predictions(user_df) all_preds = np.concatenate(predictions) count_ones = sum(all_preds == 1) severity_msg = predictor.evaluate_severity(count_ones) # 3) Grouped textual results groups = { "Concentration_and_Decision_Making": ["YOWRCONC", "YOWRDCSN"], "Sleep_and_Energy_Levels": ["YOWRHRS", "YO_MDEA5", "YOWRELES", "YO_MDEA2"], "Mood_and_Emotional_State": [ "YOWRCHR", "YOWRLSIN", "YOWRDEPR", "YODPDISC", "YOLOSEV", "YODPLSIN", "YODSCEV" ], "Appetite_and_Weight_Changes": ["YO_MDEA3", "YOWRELES"], "Duration_and_Severity_of_Depression_Symptoms": [ "YODPPROB", "YOWRPROB", "YODPR2WK", "YODSMMDE", "YOPB2WK" ] } group_text = {g: [] for g in groups} # The model_filenames order determines which label is i for i, arr in enumerate(predictions): label_col = model_filenames[i].split('.')[0] # e.g. "YOWRCONC" val = arr[0] # If we have a textual map, use it if label_col in predictor.prediction_map and val in range(len(predictor.prediction_map[label_col])): text_label = predictor.prediction_map[label_col][val] else: text_label = f"Prediction={val}" # Put in whichever group for group_name, cols_ in groups.items(): if label_col in cols_: group_text[group_name].append(f"{label_col} => {text_label}") break final_str_parts = [] for gname, lines in group_text.items(): if lines: gtitle = gname.replace("_", " ") final_str_parts.append(f"**{gtitle}**") final_str_parts.append("\n".join(lines)) final_str_parts.append("") if not final_str_parts: final_str = "No predictions made or no matching group columns." else: final_str = "\n".join(final_str_parts) # 4) Additional info total_count = len(df) total_count_md = f"We have **{total_count}** patients in the dataset." # 5) Nearest Neighbors nn_md = get_nearest_neighbors_info(user_df, k=5) # 6) Bar chart for input features input_counts = {} for col, val_ in user_input_dict.items(): matched = len(df[df[col] == val_]) input_counts[col] = matched bar_in_df = pd.DataFrame({"Feature": list(input_counts.keys()), "Count": list(input_counts.values())}) fig_in = px.bar( bar_in_df, x="Feature", y="Count", title="Number of Patients with the Same Input Feature Values" ) fig_in.update_layout(width=1200, height=400) # 7) Bar chart for predicted labels label_counts = {} for i, arr in enumerate(predictions): lbl = model_filenames[i].split('.')[0] pred_val = arr[0] if lbl in df.columns: label_counts[lbl] = len(df[df[lbl] == pred_val]) if label_counts: bar_lbl_df = pd.DataFrame({ "Label": list(label_counts.keys()), "Count": list(label_counts.values()) }) fig_lbl = px.bar( bar_lbl_df, x="Label", y="Count", title="Number of Patients with the Same Predicted Label" ) fig_lbl.update_layout(width=1200, height=400) else: fig_lbl = px.bar(title="No valid predicted labels to display.") fig_lbl.update_layout(width=1200, height=400) return ( final_str, # 1) Prediction Results severity_msg, # 2) Mental Health Severity total_count_md, # 3) Total Patient Count nn_md, # 4) Nearest Neighbors Summary fig_in, # 5) Bar Chart (input features) fig_lbl # 6) Bar Chart (labels) ) ###################################### # 6) EXTRA TABS / FUNCTIONS ###################################### def distribution_plot(feature_col, label_col): if not feature_col or not label_col: return px.bar(title="Please select both Feature and Label.") if (feature_col not in df.columns) or (label_col not in df.columns): return px.bar(title="Selected columns not found in the dataset.") grouped = df.groupby([feature_col, label_col]).size().reset_index(name="count") fig = px.bar( grouped, x=feature_col, y="count", color=label_col, title=f"Distribution of {feature_col} vs {label_col}" ) fig.update_layout(width=1200, height=600) return fig def co_occurrence_plot(feature1, feature2, label_col): if not feature1 or not feature2 or not label_col: return px.bar(title="Please select all three fields.") if feature1 not in df.columns or feature2 not in df.columns or label_col not in df.columns: return px.bar(title="Selected columns not found in the dataset.") grouped = df.groupby([feature1, feature2, label_col]).size().reset_index(name="count") fig = px.bar( grouped, x=feature1, y="count", color=label_col, facet_col=feature2, title=f"Co-occurrence: {feature1}, {feature2} vs {label_col}" ) fig.update_layout(width=1200, height=600) return fig ###################################### # 7) BUILD GRADIO UI ###################################### with gr.Blocks(css=".gradio-container {max-width: 1200px;}") as demo: # ======== TAB 1: PREDICTION ======== with gr.Tab("Prediction"): gr.Markdown( """ ### Please provide inputs in each of the four categories below. *All fields are required.* """ ) # For clarity, we define an ordered list of the features in the exact sequence # matching our predict() function. We’ll group them under the same headings. cat1_col_labels = [ ("YMDESUD5ANYO", "YMDESUD5ANYO: ONLY MDE, ONLY SUD, BOTH, OR NEITHER"), ("YMDELT", "YMDELT: Had major depressive episode in lifetime"), ("YMDEYR", "YMDEYR: Past-year major depressive episode"), ("YMDERSUD5ANY", "YMDERSUD5ANY: MDE or substance use disorder - past year"), ("YMSUD5YANY", "YMSUD5YANY: Past-year MDE & substance use disorder"), ("YMIUD5YANY", "YMIUD5YANY: Past-year MDE & illicit drug use disorder"), ("YMIMS5YANY", "YMIMS5YANY: Past-year MDE + severe impairment + substance use"), ("YMIMI5YANY", "YMIMI5YANY: Past-year MDE with severe impairment & illicit drug use") ] cat2_col_labels = [ ("YMDEHPO", "YMDEHPO: Saw health prof only for MDE in past year"), ("YMDETXRX", "YMDETXRX: Received treatment/counseling if saw doc/prof for MDE"), ("YMDEHARX", "YMDEHARX: Saw health professional & received medication for MDE"), ("YRXMDEYR", "YRXMDEYR: Used received medication for MDE in past years"), ("YHLTMDE", "YHLTMDE: Saw/talked to health professional about MDE in past year"), ("YTXMDEYR", "YTXMDEYR: Saw or talked to doc/health prof for MDE in past year"), ("YDOCMDE", "YDOCMDE: Saw/talked to general practitioner/family MD about MDE"), ("YPSY2MDE", "YPSY2MDE: Saw/talked to psychiatrist about MDE"), ("YPSY1MDE", "YPSY1MDE: Saw/talked to psychologist about MDE"), ("YCOUNMDE", "YCOUNMDE: Saw/talked to counselor about MDE") ] cat3_col_labels = [ ("MDEIMPY", "MDEIMPY: MDE with severe role impairment"), ("LVLDIFMEM2", "LVLDIFMEM2: Difficulty remembering/concentrating") ] cat4_col_labels = [ ("YUSUITHK", "YUSUITHK: Youth seriously think about killing self in past 12 months"), ("YUSUITHKYR", "YUSUITHKYR: Seriously thought about killing self"), ("YUSUIPLNYR", "YUSUIPLNYR: Made plans to kill self in past year"), ("YUSUIPLN", "YUSUIPLN: Made plans to kill yourself in past 12 months") ] # Category 1 block gr.Markdown("#### 1. Depression & Substance Use Diagnosis") cat1_inputs = [] for col, label_text in cat1_col_labels: dd = gr.Dropdown( choices=list(input_mapping[col].keys()), label=label_text ) cat1_inputs.append(dd) # Category 2 block gr.Markdown("#### 2. Mental Health Treatment & Professional Consultation") cat2_inputs = [] for col, label_text in cat2_col_labels: dd = gr.Dropdown( choices=list(input_mapping[col].keys()), label=label_text ) cat2_inputs.append(dd) # Category 3 block gr.Markdown("#### 3. Functional & Cognitive Impairment") cat3_inputs = [] for col, label_text in cat3_col_labels: dd = gr.Dropdown( choices=list(input_mapping[col].keys()), label=label_text ) cat3_inputs.append(dd) # Category 4 block gr.Markdown("#### 4. Suicidal Thoughts & Behaviors") cat4_inputs = [] for col, label_text in cat4_col_labels: dd = gr.Dropdown( choices=list(input_mapping[col].keys()), label=label_text ) cat4_inputs.append(dd) # The overall input list must match the order in `predict()` all_inputs = cat1_inputs + cat2_inputs + cat3_inputs + cat4_inputs predict_btn = gr.Button("Predict") # 6 outputs out_pred_res = gr.Textbox(label="Prediction Results", lines=8) out_sev = gr.Textbox(label="Mental Health Severity", lines=2) out_count = gr.Markdown(label="Total Patient Count") out_nn = gr.Markdown(label="Nearest Neighbors Summary (Grouped by Category)") out_bar_input= gr.Plot(label="Input Feature Counts") out_bar_label= gr.Plot(label="Predicted Label Counts") predict_btn.click( fn=predict, inputs=all_inputs, outputs=[ out_pred_res, # 1 out_sev, # 2 out_count, # 3 out_nn, # 4 out_bar_input, # 5 out_bar_label # 6 ] ) # ======== TAB 2: Distribution Analysis ======== with gr.Tab("Distribution Analysis"): gr.Markdown("## Distribution Plot\nSelect one feature and one label column to see bar counts.") list_of_features = sorted(input_mapping.keys()) list_of_labels = sorted(predictor.prediction_map.keys()) feat_dd = gr.Dropdown(choices=list_of_features, label="Feature Column") lbl_dd = gr.Dropdown(choices=list_of_labels, label="Label Column") generate_dist_btn = gr.Button("Generate Distribution Plot") dist_output = gr.Plot() generate_dist_btn.click( fn=distribution_plot, inputs=[feat_dd, lbl_dd], outputs=dist_output ) # ======== TAB 3: Co-occurrence ======== with gr.Tab("Co-occurrence"): gr.Markdown("## Co-Occurrence Plot\nSelect two features + one label to see a 3-way distribution.") feat1_dd = gr.Dropdown(choices=list_of_features, label="Feature 1") feat2_dd = gr.Dropdown(choices=list_of_features, label="Feature 2") label_dd = gr.Dropdown(choices=list_of_labels, label="Label Column") generate_btn = gr.Button("Generate Co-occurrence Plot") co_occ_output = gr.Plot() generate_btn.click( fn=co_occurrence_plot, inputs=[feat1_dd, feat2_dd, label_dd], outputs=co_occ_output ) # Finally, launch demo.launch()