pantdipendra committed on
Commit b782f65 · verified · 1 Parent(s): 51455ff

Update app.py

Files changed (1)
  1. app.py +237 -238
app.py CHANGED
@@ -1,5 +1,6 @@
- import gradio as gr
  import pickle
  import numpy as np
  import pandas as pd
  import plotly.express as px
@@ -7,15 +8,13 @@ import plotly.express as px
  # Load the training CSV once (outside the functions so it is read only once).
  df = pd.read_csv("X_train_Y_Train_merged_train.csv")

- ##############################################################################
- # MODEL PREDICTOR CLASS
- ##############################################################################
-
  class ModelPredictor:
  def __init__(self, model_path, model_filenames):
  self.model_path = model_path
  self.model_filenames = model_filenames
  self.models = self.load_models()
  self.prediction_map = {
  "YOWRCONC": ["Did not have difficulty concentrating", "Had difficulty concentrating"],
  "YOSEEDOC": ["Did not feel the need to see a doctor", "Felt the need to see a doctor"],
@@ -57,7 +56,10 @@ class ModelPredictor:
  return models

  def make_predictions(self, user_input):
- """Returns a list of numpy arrays, each array is [0] or [1]."""
  predictions = []
  for model in self.models:
  pred = model.predict(user_input)
@@ -68,13 +70,17 @@ class ModelPredictor:
  def get_majority_vote(self, predictions):
  """
  Flatten all predictions from all models, combine them into a single array,
- then find the majority class (0 or 1).
  """
  combined_predictions = np.concatenate(predictions)
  majority_vote = np.bincount(combined_predictions).argmax()
  return majority_vote

- # Severity interpretation (same as before)
  def evaluate_severity(self, majority_vote_count):
  if majority_vote_count >= 13:
  return "Mental health severity: Severe"
@@ -85,6 +91,7 @@ class ModelPredictor:
  else:
  return "Mental health severity: Very Low"

  model_filenames = [
  "YOWRCONC.pkl", "YOSEEDOC.pkl", "YO_MDEA5.pkl", "YOWRLSIN.pkl",
  "YODPPROB.pkl", "YOWRPROB.pkl", "YODPR2WK.pkl", "YOWRDEPR.pkl",
@@ -94,21 +101,12 @@ model_filenames = [
  model_path = "models/"
  predictor = ModelPredictor(model_path, model_filenames)

- ##############################################################################
- # INPUT VALIDATION
- ##############################################################################
-
  def validate_inputs(*args):
- """Return False if any argument is blank or None."""
  for arg in args:
- if arg == '' or arg is None:
  return False
  return True

- ##############################################################################
- # MAIN PREDICT FUNCTION
- ##############################################################################
-
  def predict(
  YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
  YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
@@ -116,6 +114,20 @@ def predict(
  YHLTMDE, YDOCMDE, YTXMDEYR, YUSUITHKYR, YUSUIPLNYR, YUSUITHK,
  YUSUIPLN, MDEIMPY, LVLDIFMEM2, YMSUD5YANY, YRXMDEYR
  ):
  # Prepare user_input dataframe for prediction
  user_input_data = {
  'YNURSMDE': [int(YNURSMDE)],
@@ -150,18 +162,21 @@ def predict(
  }
  user_input = pd.DataFrame(user_input_data)

- # 1) Make predictions for each of the 16 models
  predictions = predictor.make_predictions(user_input)
- # 2) Majority vote across all models
  majority_vote = predictor.get_majority_vote(predictions)
- # 3) Count how many 1's in all predictions
  majority_vote_count = sum([1 for pred in np.concatenate(predictions) if pred == 1])
  # 4) Evaluate severity
  severity = predictor.evaluate_severity(majority_vote_count)

- ############################################################################
- # (A) Summarize per-model predictions
- ############################################################################
  results = {
  "Concentration_and_Decision_Making": [],
  "Sleep_and_Energy_Levels": [],
@@ -180,221 +195,73 @@ def predict(
  "YODPR2WK", "YODSMMDE",
  "YOPB2WK"]
  }
-
  for i, pred in enumerate(predictions):
  model_name = model_filenames[i].split('.')[0] # e.g. 'YOWRCONC'
  pred_value = pred[0]
  if model_name in predictor.prediction_map and pred_value in [0, 1]:
  result_text = f"Model {model_name}: {predictor.prediction_map[model_name][pred_value]}"
  else:
- result_text = f"Model {model_name}: Unknown or out-of-range prediction {pred_value}"

  found_group = False
  for group_name, group_models in prediction_groups.items():
  if model_name in group_models:
  results[group_name].append(result_text)
  found_group = True
  break

  formatted_results = []
  for group, preds in results.items():
  if preds:
  formatted_results.append(f"Group {group.replace('_', ' ')}:")
  formatted_results.append("\n".join(preds))
- formatted_results.append("")
- if not formatted_results:
- formatted_results = ["No predictions made. Please check your inputs."]
-
- prediction_summary_text = "\n".join(formatted_results).strip()

- ############################################################################
- # (B) Show "Total Patient Count" (replacing old matched-vs-total)
- ############################################################################
  total_patients = len(df)
- total_patients_text = (
  "### Total Patient Count\n"
- f"This dataset contains **{total_patients}** patient records.\n\n"
- "In the next sections, we explore how the features and labels are distributed in these records."
- )
-
- ############################################################################
- # (C) CROSS-TABULATION & GROUPED BAR CHART (EXAMPLE)
- # We'll demonstrate with one feature (e.g., 'YMDEYR') vs. the actual label 'YOWRCONC'
- ############################################################################
- # Explanation:
- cross_tab_explanation = (
- "### Cross-Tabulation & Grouped Bar Chart\n"
- "This chart shows how often each category of a given feature (X-axis) co-occurs with each **actual label** (0 or 1). "
- "Interpreting this helps clinicians see which categories have a higher proportion of positive vs. negative outcomes. "
- "For instance, if 'Yes' in YMDEYR heavily corresponds to label=1, that suggests a stronger link between that feature and the mental health outcome."
- )
-
- if "YOWRCONC" in df.columns and "YMDEYR" in df.columns:
- # Make sure we actually have the columns needed
- ctab = pd.crosstab(df["YMDEYR"], df["YOWRCONC"])
- # ctab might have column names [0,1] for the label
- ctab.reset_index(inplace=True)
- # rename for clarity
- ctab.columns = ["YMDEYR_Value", "Label0_Count", "Label1_Count"]
-
- fig_crosstab = px.bar(
- ctab,
- x="YMDEYR_Value",
- y=["Label0_Count", "Label1_Count"],
- barmode="group",
- title="YMDEYR vs. YOWRCONC (Actual Label)",
- labels={
- "YMDEYR_Value": "YMDEYR Feature Categories",
- "value": "Count of Patients",
- "variable": "Label"
- }
- )
- else:
- # fallback if we don't have those columns
- fig_crosstab = px.bar(
- x=["Data Error"], y=[0],
- title="Could not generate cross-tab: 'YOWRCONC' or 'YMDEYR' not in df"
- )
-
- ############################################################################
- # (D) "SIMILAR PATIENT" / NEAREST-NEIGHBORS DEMO
- # We'll pick a small set of "key features", measure Hamming distance,
- # and find the top-K closest rows. Then we'll show how many had label=1.
- ############################################################################
- similar_explanation = (
- "### Similar Patients (Nearest Neighbors)\n"
- "Here we define a small set of key features and use a simple Hamming distance "
- "(count of mismatched categories) to find patients who are 'closest' to the current input. "
- "This helps clinicians see how similar patients were labeled or what interventions they needed."
  )

- # Example "key features" (choose whichever are most clinically relevant)
- key_features = ["YMDEYR", "YMDERSUD5ANY", "YMSUD5YANY", "LVLDIFMEM2"]
- if all(kf in df.columns for kf in key_features) and "YOWRCONC" in df.columns:
- # Compute distance for each row
- user_vector = [user_input_data[kf][0] for kf in key_features]
- distances = []
- for idx, row in df[key_features].iterrows():
- # Compare row to user_vector
- row_vector = row.values
- # Hamming distance = sum(row_vector[i] != user_vector[i])
- dist = sum(rv != uv for rv, uv in zip(row_vector, user_vector))
- distances.append(dist)
-
- # Add distances to a copy of df
- temp_df = df.copy()
- temp_df["HammingDist"] = distances
- # Sort ascending by distance, take top-K (e.g., 20)
- top_k = temp_df.nsmallest(20, "HammingDist")
- # Count how many have label=1 in top_k
- if "YOWRCONC" in top_k.columns:
- similar_label_1_count = (top_k["YOWRCONC"] == 1).sum()
- similar_label_0_count = (top_k["YOWRCONC"] == 0).sum()
- similar_text = (
- f"Out of the 20 most similar patients:\n"
- f"- {similar_label_1_count} had label=1\n"
- f"- {similar_label_0_count} had label=0\n"
- f"(Distances ranged from {top_k['HammingDist'].min()} to {top_k['HammingDist'].max()})."
- )
- else:
- similar_text = "Label column 'YOWRCONC' missing in dataset."
- else:
- similar_text = "Cannot compute nearest neighbors: some key features or label column are missing."
-
- ############################################################################
- # (E) CO-OCCURRENCE PLOT (TWO FEATURES) vs. LABEL
- ############################################################################
- cooccurrence_explanation = (
- "### Co-Occurrence of Two Features vs. Label\n"
- "This shows how two categorical features combine, and how many patients in each combination are labeled 0 or 1. "
- "Clinicians can spot if certain feature-combinations are particularly high-risk or high-incidence of label=1."
- )
-
- # Example: co-occurrence of 'YMDEYR' and 'YMDERSUD5ANY' vs. 'YOWRCONC'
- if all(col in df.columns for col in ["YMDEYR", "YMDERSUD5ANY", "YOWRCONC"]):
- co_tab = pd.crosstab([df["YMDEYR"], df["YMDERSUD5ANY"]], df["YOWRCONC"])
- co_tab.reset_index(inplace=True)
- # co_tab columns: ["YMDEYR", "YMDERSUD5ANY", "0", "1"]
- co_tab.columns = ["YMDEYR", "YMDERSUD5ANY", "Label0", "Label1"]
-
- # We'll create a stacked or grouped bar. Let's do grouped by label.
- # Construct a single column "Count" and a single column "Label" to let plotly group them
- data_list = []
- for i, row in co_tab.iterrows():
- data_list.append({
- "YMDEYR_Val": row["YMDEYR"],
- "YMDERSUD5ANY_Val": row["YMDERSUD5ANY"],
- "Label": "Label=0",
- "Count": row["Label0"]
- })
- data_list.append({
- "YMDEYR_Val": row["YMDEYR"],
- "YMDERSUD5ANY_Val": row["YMDERSUD5ANY"],
- "Label": "Label=1",
- "Count": row["Label1"]
- })
- df_co = pd.DataFrame(data_list)
-
- fig_cooccur = px.bar(
- df_co,
- x="YMDEYR_Val",
- y="Count",
- color="Label",
- facet_col="YMDERSUD5ANY_Val", # separate subplots by second feature
- barmode="group",
- title="Co-Occurrence: YMDEYR & YMDERSUD5ANY vs. YOWRCONC",
- labels={"YMDEYR_Val": "YMDEYR", "YMDERSUD5ANY_Val": "YMDERSUD5ANY"}
- )
- fig_cooccur.update_layout(
- legend_title_text="Actual Label",
- xaxis_title="YMDEYR Categories",
- yaxis_title="Number of Patients"
- )
- else:
- fig_cooccur = px.bar(
- x=["Data Error"], y=[0],
- title="Could not generate co-occurrence chart: missing columns"
- )
-
- #------------------------------------------------------------------------------
- # RETURN / RENDER
- #------------------------------------------------------------------------------
- # We have 6 outputs total (the code is set up for that).
- # We'll map them as follows:
- # 1) "Prediction Results" (Textbox)
- # 2) "Mental Health Severity" (Textbox)
- # 3) A Markdown that combines: total_patients_text + cross_tab_explanation + similar_explanation + cooccurrence_explanation + the nearest-neighbors result
- # 4) Cross-Tab Bar Chart
- # 5) "Number of Patients with the Same Value for Each Input Feature"
- # 6) "Number of Patients with Predicted Labels"
-
- # (i) Provide text results for the user’s predictions
- # (ii) Provide severity
-
- # Build the big markdown text for (3)
- big_markdown = (
- total_patients_text
- + "\n\n"
- + cross_tab_explanation
- + "\n\n"
- + f"**Crosstab Example**: See the bar chart below comparing 'YMDEYR' vs. actual label 'YOWRCONC'.\n\n"
- + similar_explanation
- + "\n\n"
- + similar_text
- + "\n\n"
- + cooccurrence_explanation
- + "\n\n"
- + "See the final chart below for how 'YMDEYR' & 'YMDERSUD5ANY' co-occur with label 'YOWRCONC'."
- )
-
- # (F) Bar Chart for each input feature
- # We'll keep the logic for counting how many in df have the same value for each feature
  input_counts = {}
- for col, val_list in user_input_data.items():
- val = val_list[0]
  same_val_count = len(df[df[col] == val])
  input_counts[col] = same_val_count

  bar_input_data = pd.DataFrame({
  "Feature": list(input_counts.keys()),
  "Count": list(input_counts.values())
@@ -408,13 +275,14 @@ def predict(
  )
  fig_bar_input.update_layout(xaxis={'categoryorder':'total descending'})

- # (G) Bar Chart for predicted labels
- # We'll skip "matched vs total" or "exact matching."
  label_counts = {}
  for i, pred in enumerate(predictions):
  model_name = model_filenames[i].split('.')[0]
  pred_value = pred[0]
- if pred_value in [0, 1] and model_name in df.columns:
  label_counts[model_name] = len(df[df[model_name] == pred_value])

  if len(label_counts) > 0:
@@ -426,12 +294,12 @@ def predict(
  bar_label_data,
  x="Model",
  y="Count",
- title="Number of Patients with the Same Predicted Label by Model",
  labels={"Model": "Predicted Column", "Count": "Number of Patients"}
  )
  fig_bar_labels.update_layout(xaxis={'categoryorder':'total descending'})
  else:
- # fallback
  bar_label_data = pd.DataFrame({"Model": [], "Count": []})
  fig_bar_labels = px.bar(
  bar_label_data,
@@ -440,20 +308,128 @@ def predict(
  title="No valid predicted labels to display"
  )

- # Finally return the updated outputs
- return (
- prediction_summary_text, # (1) Prediction Results
- severity, # (2) Mental Health Severity
- big_markdown, # (3) Our large Markdown with headings & explanations
- fig_crosstab, # (4) Cross-Tab Bar Chart
- fig_bar_input, # (5) Input Feature Bar Chart
- fig_bar_labels # (6) Predicted Labels Bar Chart
  )

- ##############################################################################
- # INPUT MAPPING & GRADIO INTERFACE
- ##############################################################################

  input_mapping = {
  'YNURSMDE': {"Yes": 1, "No": 0},
  'YMDEYR': {"Yes": 1, "No": 2},
@@ -486,8 +462,23 @@ input_mapping = {
  'YMDELT': {"Yes": 1, "No": 2}
  }

  # Define the "inputs" in the same order used in the function signature
  inputs = [
  gr.Dropdown(list(input_mapping['YMDEYR'].keys()), label="YMDEYR: PAST YEARS MAJOR DEPRESSIVE EPISODE"),
  gr.Dropdown(list(input_mapping['YMDERSUD5ANY'].keys()), label="YMDERSUD5ANY: MDE OR SUBSTANCE USE DISORDER - ANY"),
  gr.Dropdown(list(input_mapping['YMDEIMAD5YR'].keys()), label="YMDEIMAD5YR: MDE WITH SEV. IMP + ALCOHOL USE DISORDER"),
@@ -501,6 +492,8 @@ inputs = [
  gr.Dropdown(list(input_mapping['YMIMI5YANY'].keys()), label="YMIMI5YANY: MDE W/ ILL DRUG USE DISORDER"),
  gr.Dropdown(list(input_mapping['YMIUD5YANY'].keys()), label="YMIUD5YANY: MDE + ILL DRUG USE DISORDER"),
  gr.Dropdown(list(input_mapping['YMDESUD5ANYO'].keys()), label="YMDESUD5ANYO: MDE vs. SUD vs. BOTH vs. NEITHER"),
  gr.Dropdown(list(input_mapping['YNURSMDE'].keys()), label="YNURSMDE: SAW/TALK TO NURSE/OT ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YSOCMDE'].keys()), label="YSOCMDE: SAW/TALK TO SOCIAL WORKER ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YCOUNMDE'].keys()), label="YCOUNMDE: SAW/TALK TO COUNSELOR ABOUT MDE"),
@@ -509,22 +502,28 @@ inputs = [
  gr.Dropdown(list(input_mapping['YHLTMDE'].keys()), label="YHLTMDE: SAW/TALK TO HEALTH PROFESSIONAL ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YDOCMDE'].keys()), label="YDOCMDE: SAW/TALK TO GP/FAMILY MD ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YTXMDEYR'].keys()), label="YTXMDEYR: SAW/TALK DOCTOR/HEALTH PROF FOR MDE"),
  gr.Dropdown(list(input_mapping['YUSUITHKYR'].keys()), label="YUSUITHKYR: SERIOUSLY THOUGHT ABOUT KILLING SELF"),
  gr.Dropdown(list(input_mapping['YUSUIPLNYR'].keys()), label="YUSUIPLNYR: MADE PLANS TO KILL SELF"),
  gr.Dropdown(list(input_mapping['YUSUITHK'].keys()), label="YUSUITHK: THINK ABOUT KILLING SELF (12 MONTHS)"),
  gr.Dropdown(list(input_mapping['YUSUIPLN'].keys()), label="YUSUIPLN: MADE PLANS TO KILL SELF (12 MONTHS)"),
  gr.Dropdown(list(input_mapping['MDEIMPY'].keys()), label="MDEIMPY: MDE W/ SEVERE ROLE IMPAIRMENT"),
  gr.Dropdown(list(input_mapping['LVLDIFMEM2'].keys()), label="LVLDIFMEM2: LEVEL OF DIFFICULTY REMEMBERING/CONCENTRATING"),
  gr.Dropdown(list(input_mapping['YMSUD5YANY'].keys()), label="YMSUD5YANY: MDE + SUBSTANCE USE DISORDER - ANY"),
  gr.Dropdown(list(input_mapping['YRXMDEYR'].keys()), label="YRXMDEYR: USED MEDS FOR MDE IN PAST YEAR"),
  ]

- # We have 6 outputs now:
  outputs = [
  gr.Textbox(label="Prediction Results", lines=30),
  gr.Textbox(label="Mental Health Severity", lines=4),
- gr.Markdown(), # Combined heading & explanations for cross-tab, similar patients, co-occurrence
- gr.Plot(label="Cross-Tab (Feature vs. Actual Label)"),
  gr.Plot(label="Number of Patients per Input Feature"),
  gr.Plot(label="Number of Patients with Predicted Labels")
  ]
@@ -545,10 +544,14 @@ def predict_with_text(
  YUSUIPLN, MDEIMPY, LVLDIFMEM2, YMSUD5YANY, YRXMDEYR
  ):
  return (
- "Please select all required fields.", # Pred result
  "Validation Error", # Severity
- "", # Markdown
- None, None, None # Plots
  )

  # Map from user-friendly text to int
@@ -587,7 +590,6 @@ def predict_with_text(
  # Pass our mapped values into the original 'predict' function
  return predict(**user_inputs)

-
  # Custom CSS (optional)
  custom_css = """
  .gradio-container * {
@@ -606,10 +608,7 @@ custom_css = """
  }
  """

- ##############################################################################
- # LAUNCH INTERFACE
- ##############################################################################
-
  interface = gr.Interface(
  fn=predict_with_text,
  inputs=inputs,
 
 
  import pickle
+
+ import gradio as gr
  import numpy as np
  import pandas as pd
  import plotly.express as px

  # Load the training CSV once (outside the functions so it is read only once).
  df = pd.read_csv("X_train_Y_Train_merged_train.csv")

  class ModelPredictor:
  def __init__(self, model_path, model_filenames):
  self.model_path = model_path
  self.model_filenames = model_filenames
  self.models = self.load_models()
+ # For readability, you might want to keep only a few keys here if you want
+ # to demonstrate partial cross-tabs, etc.
  self.prediction_map = {
  "YOWRCONC": ["Did not have difficulty concentrating", "Had difficulty concentrating"],
  "YOSEEDOC": ["Did not feel the need to see a doctor", "Felt the need to see a doctor"],
 
  return models

  def make_predictions(self, user_input):
+ """
+ Returns a list of numpy arrays, each array is [0] or [1].
+ The i-th array corresponds to the i-th model in self.models.
+ """
  predictions = []
  for model in self.models:
  pred = model.predict(user_input)
 
  def get_majority_vote(self, predictions):
  """
  Flatten all predictions from all models, combine them into a single array,
+ then find the majority class (0 or 1) across all of them.
  """
  combined_predictions = np.concatenate(predictions)
  majority_vote = np.bincount(combined_predictions).argmax()
  return majority_vote

+ # Based on Equal Interval and Percentage-Based Method
+ # Severe: 13 to 16 votes (upper 25%)
+ # Moderate: 9 to 12 votes (upper-middle 25%)
+ # Low: 5 to 8 votes (lower-middle 25%)
+ # Very Low: 0 to 4 votes (lower 25%)
  def evaluate_severity(self, majority_vote_count):
  if majority_vote_count >= 13:
  return "Mental health severity: Severe"
 
  else:
  return "Mental health severity: Very Low"

+ # List of model filenames
  model_filenames = [
  "YOWRCONC.pkl", "YOSEEDOC.pkl", "YO_MDEA5.pkl", "YOWRLSIN.pkl",
  "YODPPROB.pkl", "YOWRPROB.pkl", "YODPR2WK.pkl", "YOWRDEPR.pkl",
 
  model_path = "models/"
  predictor = ModelPredictor(model_path, model_filenames)

  def validate_inputs(*args):
  for arg in args:
+ if arg == '' or arg is None: # Assuming empty string or None as unselected
  return False
  return True

  def predict(
  YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
  YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
 
  YHLTMDE, YDOCMDE, YTXMDEYR, YUSUITHKYR, YUSUIPLNYR, YUSUITHK,
  YUSUIPLN, MDEIMPY, LVLDIFMEM2, YMSUD5YANY, YRXMDEYR
  ):
+ """
+ Core prediction function that:
+ 1) Predicts with each model
+ 2) Aggregates results
+ 3) Produces an overall 'severity'
+ 4) Returns detailed per-model predictions
+ 5) Returns bar charts about how many in the dataset share the same inputs/predicted labels
+ 6) ***Now includes custom sections for:
+ - Total patient count (markdown)
+ - Cross-tab & grouped bar chart
+ - Similar Patient (Nearest Neighbors)
+ - Co-occurrence plot
+ """
+
  # Prepare user_input dataframe for prediction
  user_input_data = {
  'YNURSMDE': [int(YNURSMDE)],
 
  }
  user_input = pd.DataFrame(user_input_data)

+ # -----------------------
+ # 1) Make predictions
+ # -----------------------
  predictions = predictor.make_predictions(user_input)
+
+ # 2) Calculate majority vote (0 or 1) across all models
  majority_vote = predictor.get_majority_vote(predictions)
+
+ # 3) Count how many 1's in all predictions combined
  majority_vote_count = sum([1 for pred in np.concatenate(predictions) if pred == 1])
+
  # 4) Evaluate severity
  severity = predictor.evaluate_severity(majority_vote_count)

+ # 5) Prepare detailed results for each model group
  results = {
  "Concentration_and_Decision_Making": [],
  "Sleep_and_Energy_Levels": [],
 
  "YODPR2WK", "YODSMMDE",
  "YOPB2WK"]
  }
+
  for i, pred in enumerate(predictions):
  model_name = model_filenames[i].split('.')[0] # e.g. 'YOWRCONC'
  pred_value = pred[0]
+ # Map the prediction value to a human-readable string
  if model_name in predictor.prediction_map and pred_value in [0, 1]:
  result_text = f"Model {model_name}: {predictor.prediction_map[model_name][pred_value]}"
+ elif model_name in predictor.prediction_map:
+ # Out of known range => "Unknown"
+ result_text = f"Model {model_name}: Unknown prediction value {pred_value}"
  else:
+ result_text = f"Model {model_name}: Unknown model"

+ # Append to the appropriate group
  found_group = False
  for group_name, group_models in prediction_groups.items():
  if model_name in group_models:
  results[group_name].append(result_text)
  found_group = True
  break
+ if not found_group:
+ # If model doesn't match any group, skip or store it in a catch-all
+ pass

+ # 6) Nicely format the results
  formatted_results = []
  for group, preds in results.items():
  if preds:
  formatted_results.append(f"Group {group.replace('_', ' ')}:")
  formatted_results.append("\n".join(preds))
+ formatted_results.append("\n")

+ formatted_results = "\n".join(formatted_results).strip()
+
+ if len(formatted_results) == 0:
+ formatted_results = "No predictions made. Please check your inputs."
+
+ # Heuristic: if too many unknown predictions, append note
+ num_unknown = len([
+ pred for group, preds in results.items()
+ for pred in preds if "Unknown prediction value" in pred or "Unknown model" in pred
+ ])
+ if num_unknown > len(model_filenames) / 2:
+ severity += " (Unknown prediction count is high. Please consult with a human.)"
+
+ # ------------------------
+ # ADDITIONAL FEATURES
+ # ------------------------
+
+ # A) Total Patient Count (instead of the old "Pie" chart)
  total_patients = len(df)
+ total_patient_count_markdown = (
  "### Total Patient Count\n"
+ f"There are **{total_patients}** total patients in the dataset.\n\n"
+ "This count can help you understand the overall dataset size. "
+ "All subsequent analyses are relative to these patients."
  )

+ # B) Analyze Each Input Feature
+ # For each feature in user_input, compute how many patients have that same value.
  input_counts = {}
+ for col in user_input_data.keys():
+ val = user_input_data[col][0]
  same_val_count = len(df[df[col] == val])
  input_counts[col] = same_val_count

+ # Plot: Bar Chart for each input feature
  bar_input_data = pd.DataFrame({
  "Feature": list(input_counts.keys()),
  "Count": list(input_counts.values())
 
  )
  fig_bar_input.update_layout(xaxis={'categoryorder':'total descending'})

+ # C) Analyze Predicted Labels
+ # For each model's predicted label (0 or 1), count how many patients in the CSV
+ # have that label. We skip unknown if pred_value not in [0, 1].
  label_counts = {}
  for i, pred in enumerate(predictions):
  model_name = model_filenames[i].split('.')[0]
  pred_value = pred[0]
+ if pred_value in [0, 1]:
  label_counts[model_name] = len(df[df[model_name] == pred_value])

  if len(label_counts) > 0:
 
  bar_label_data,
  x="Model",
  y="Count",
+ title="Number of Patients with the Predicted Label (0 or 1) by Model",
  labels={"Model": "Predicted Column", "Count": "Number of Patients"}
  )
  fig_bar_labels.update_layout(xaxis={'categoryorder':'total descending'})
  else:
+ # If everything was unknown, produce an empty figure or a fallback message
  bar_label_data = pd.DataFrame({"Model": [], "Count": []})
  fig_bar_labels = px.bar(
  bar_label_data,
 
  title="No valid predicted labels to display"
  )

+ # D) Cross-Tabulation & Grouped Bar Chart
+ # Example: Show how a single input feature (YMDEYR) relates to one actual label (YOWRCONC).
+ # For demonstration only — in practice you might do this for multiple features/labels.
+ # NOTE: If the columns don't exist in the dataset (some code merges them differently),
+ # you might adapt accordingly.
+ if "YMDEYR" in df.columns and "YOWRCONC" in df.columns:
+ cross_tab_data = df.groupby(["YMDEYR", "YOWRCONC"]).size().reset_index(name="count")
+ fig_cross_tab = px.bar(
+ cross_tab_data,
+ x="YMDEYR",
+ y="count",
+ color="YOWRCONC",
+ barmode="group",
+ title="Cross-Tab: YMDEYR vs YOWRCONC (Grouped Bar Chart)",
+ labels={"YMDEYR": "Feature: YMDEYR", "YOWRCONC": "Label: YOWRCONC"}
+ )
+ else:
+ # Provide a fallback message if columns not found
+ fig_cross_tab = px.bar(title="YMDEYR or YOWRCONC not found in dataset. Cross-tab not available.")
+
+ # E) Similar Patient (Nearest Neighbors) via simple Hamming distance
+ # We'll pick K=5 neighbors. Then see how many had label=0 vs label=1 for
+ # one example label: YOWRCONC.
+ # (You can adapt to do multiple labels, but that can get lengthy.)
+ def hamming_distance(row, user_row):
+ dist = 0
+ for c in user_row.index:
+ if row[c] != user_row[c]:
+ dist += 1
+ return dist
+
+ # Create a single row for easy iteration
+ user_series = user_input.iloc[0]
+
+ # We'll compute distance for all rows in df on the same features
+ # that were used in the user_input.
+ # NOTE: In real usage, confirm these columns exist in df.
+ # If df lacks them or is encoded differently, you'd adapt.
+ features_to_compare = list(user_input.columns)
+ # For Hamming, ensure we pick only the columns present in df
+ features_to_compare = [f for f in features_to_compare if f in df.columns]
+
+ # Build a DataFrame we can safely compare
+ subset_df = df[features_to_compare].copy()
+
+ # Calculate distances
+ distances = []
+ for idx, row in subset_df.iterrows():
+ d = 0
+ for col in features_to_compare:
+ if row[col] != user_series[col]:
+ d += 1
+ distances.append(d)
+
+ # Attach distances
+ df_with_dist = df.copy()
+ df_with_dist["distance"] = distances
+
+ # Sort by distance ascending, pick top K=5
+ K = 5
+ nearest_neighbors = df_with_dist.sort_values("distance", ascending=True).head(K)
+
+ # For demonstration, let's show how many had YOWRCONC=0 vs. 1
+ nn_label_0 = nn_label_1 = 0
+ if "YOWRCONC" in nearest_neighbors.columns:
+ nn_label_0 = len(nearest_neighbors[nearest_neighbors["YOWRCONC"] == 0])
+ nn_label_1 = len(nearest_neighbors[nearest_neighbors["YOWRCONC"] == 1])
+
+ # Summarize in markdown
+ similar_patient_markdown = (
+ "### Nearest Neighbors (Simple Hamming Distance)\n"
+ f"We searched for the top **{K}** patients in the dataset whose categorical features "
+ "most closely match your input (Hamming distance).\n\n"
+ "**For the label `YOWRCONC`** among these neighbors:\n"
+ f"- {nn_label_0} had label=0\n"
+ f"- {nn_label_1} had label=1\n\n"
+ "(This is a simple illustration. In real practice, you'd refine which columns to use, "
+ "how to encode them, and how many neighbors to consider.)"
  )

+ # F) Co-Occurrence Plot
+ # Example: How two features (YMDEYR, YMDERSUD5ANY) combine with label (YOWRCONC).
+ # We'll produce a multi-way distribution using facet_col.
+ if all(col in df.columns for col in ["YMDEYR", "YMDERSUD5ANY", "YOWRCONC"]):
+ co_occ_data = df.groupby(["YMDEYR", "YMDERSUD5ANY", "YOWRCONC"]).size().reset_index(name="count")
+ fig_co_occ = px.bar(
+ co_occ_data,
+ x="YMDEYR",
+ y="count",
+ color="YOWRCONC",
+ facet_col="YMDERSUD5ANY",
+ title="Co-Occurrence Plot: YMDEYR and YMDERSUD5ANY vs YOWRCONC"
+ )
+ else:
+ fig_co_occ = px.bar(title="Co-occurrence plot not available (columns not found).")
+
+ # ------------------------
+ # Return everything
+ # ------------------------
+ # We now have 8 items to return:
+ # 1) Prediction Results (Textbox)
+ # 2) Mental Health Severity (Textbox)
+ # 3) Total Patient Count (Markdown)
+ # 4) Cross-Tab & Grouped Bar Chart (Plot)
+ # 5) Nearest Neighbors Summary (Markdown)
+ # 6) Co-Occurrence Plot (Plot)
+ # 7) Bar Chart for input features (Plot)
+ # 8) Bar Chart for predicted labels (Plot)
+ return (
+ formatted_results,
+ severity,
+ total_patient_count_markdown,
+ fig_cross_tab,
+ similar_patient_markdown,
+ fig_co_occ,
+ fig_bar_input,
+ fig_bar_labels
+ )

+ # -----------------------------------------------------------------------------
+ # MAPPING user-friendly text => numeric values
+ # -----------------------------------------------------------------------------
  input_mapping = {
  'YNURSMDE': {"Yes": 1, "No": 0},
  'YMDEYR': {"Yes": 1, "No": 2},
 
  'YMDELT': {"Yes": 1, "No": 2}
  }

+ # -----------------------------------------------------------------------------
+ # Create the Gradio interface
+ # -----------------------------------------------------------------------------
+ # We have 8 outputs now:
+ # 1) Prediction Results (Textbox)
+ # 2) Mental Health Severity (Textbox)
+ # 3) Total Patient Count (Markdown)
+ # 4) Cross-Tab & Grouped Bar Chart (Plot)
+ # 5) Nearest Neighbors Summary (Markdown)
+ # 6) Co-Occurrence Plot (Plot)
+ # 7) Bar Chart for input features (Plot)
+ # 8) Bar Chart for predicted labels (Plot)
+
  # Define the "inputs" in the same order used in the function signature
  inputs = [
+ ################# Ordered and grouped ##########################
+ # Questions related to Major Depressive Episode (MDE) and related impairments or disorders
  gr.Dropdown(list(input_mapping['YMDEYR'].keys()), label="YMDEYR: PAST YEARS MAJOR DEPRESSIVE EPISODE"),
  gr.Dropdown(list(input_mapping['YMDERSUD5ANY'].keys()), label="YMDERSUD5ANY: MDE OR SUBSTANCE USE DISORDER - ANY"),
  gr.Dropdown(list(input_mapping['YMDEIMAD5YR'].keys()), label="YMDEIMAD5YR: MDE WITH SEV. IMP + ALCOHOL USE DISORDER"),

  gr.Dropdown(list(input_mapping['YMIMI5YANY'].keys()), label="YMIMI5YANY: MDE W/ ILL DRUG USE DISORDER"),
  gr.Dropdown(list(input_mapping['YMIUD5YANY'].keys()), label="YMIUD5YANY: MDE + ILL DRUG USE DISORDER"),
  gr.Dropdown(list(input_mapping['YMDESUD5ANYO'].keys()), label="YMDESUD5ANYO: MDE vs. SUD vs. BOTH vs. NEITHER"),
+
+ # Questions related to consultations with professionals about MDE
  gr.Dropdown(list(input_mapping['YNURSMDE'].keys()), label="YNURSMDE: SAW/TALK TO NURSE/OT ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YSOCMDE'].keys()), label="YSOCMDE: SAW/TALK TO SOCIAL WORKER ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YCOUNMDE'].keys()), label="YCOUNMDE: SAW/TALK TO COUNSELOR ABOUT MDE"),

  gr.Dropdown(list(input_mapping['YHLTMDE'].keys()), label="YHLTMDE: SAW/TALK TO HEALTH PROFESSIONAL ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YDOCMDE'].keys()), label="YDOCMDE: SAW/TALK TO GP/FAMILY MD ABOUT MDE"),
  gr.Dropdown(list(input_mapping['YTXMDEYR'].keys()), label="YTXMDEYR: SAW/TALK DOCTOR/HEALTH PROF FOR MDE"),
+
+ # Questions related to suicidal thoughts and plans
  gr.Dropdown(list(input_mapping['YUSUITHKYR'].keys()), label="YUSUITHKYR: SERIOUSLY THOUGHT ABOUT KILLING SELF"),
  gr.Dropdown(list(input_mapping['YUSUIPLNYR'].keys()), label="YUSUIPLNYR: MADE PLANS TO KILL SELF"),
  gr.Dropdown(list(input_mapping['YUSUITHK'].keys()), label="YUSUITHK: THINK ABOUT KILLING SELF (12 MONTHS)"),
  gr.Dropdown(list(input_mapping['YUSUIPLN'].keys()), label="YUSUIPLN: MADE PLANS TO KILL SELF (12 MONTHS)"),
+
+ # Questions related to impairment due to MDE
  gr.Dropdown(list(input_mapping['MDEIMPY'].keys()), label="MDEIMPY: MDE W/ SEVERE ROLE IMPAIRMENT"),
  gr.Dropdown(list(input_mapping['LVLDIFMEM2'].keys()), label="LVLDIFMEM2: LEVEL OF DIFFICULTY REMEMBERING/CONCENTRATING"),
  gr.Dropdown(list(input_mapping['YMSUD5YANY'].keys()), label="YMSUD5YANY: MDE + SUBSTANCE USE DISORDER - ANY"),
  gr.Dropdown(list(input_mapping['YRXMDEYR'].keys()), label="YRXMDEYR: USED MEDS FOR MDE IN PAST YEAR"),
  ]

+ # We now have 8 outputs in total:
  outputs = [
  gr.Textbox(label="Prediction Results", lines=30),
  gr.Textbox(label="Mental Health Severity", lines=4),
+ gr.Markdown(label="Total Patient Count"),
+ gr.Plot(label="Cross-Tab & Grouped Bar Chart"),
+ gr.Markdown(label="Nearest Neighbors Summary"),
+ gr.Plot(label="Co-Occurrence Plot"),
  gr.Plot(label="Number of Patients per Input Feature"),
  gr.Plot(label="Number of Patients with Predicted Labels")
  ]
 
  YUSUIPLN, MDEIMPY, LVLDIFMEM2, YMSUD5YANY, YRXMDEYR
  ):
  return (
+ "Please select all required fields.", # Prediction Results
  "Validation Error", # Severity
+ "No data", # Total Patient Count
+ None, # Cross-Tab figure
+ "No data", # Nearest Neighbors
+ None, # Co-Occurrence
+ None, # Input Features Bar
+ None # Predicted Labels Bar
  )

  # Map from user-friendly text to int
 
  # Pass our mapped values into the original 'predict' function
  return predict(**user_inputs)

  # Custom CSS (optional)
  custom_css = """
  .gradio-container * {
 
  }
  """

+ # Finally, launch the app with 8 outputs
  interface = gr.Interface(
  fn=predict_with_text,
  inputs=inputs,
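
For reference, below is a minimal, self-contained sketch (not part of the commit) of how the pieces added in this diff fit together: the per-model [0]/[1] predictions, the positive-vote count that evaluate_severity buckets (13+, 9+, 5+, else), and the Hamming-distance "similar patient" lookup. DummyModel and the toy DataFrames are hypothetical stand-ins for the pickled models in models/ and the training CSV; they are only meant to illustrate the flow.

import numpy as np
import pandas as pd

class DummyModel:
    """Hypothetical stand-in for one of the 16 pickled binary classifiers in models/."""
    def __init__(self, label):
        self.label = label

    def predict(self, X):
        # Each real model returns an array like [0] or [1] for a one-row input.
        return np.array([self.label])

models = [DummyModel(label=i % 2) for i in range(16)]            # toy ensemble
user_input = pd.DataFrame({"YMDEYR": [1], "YMDERSUD5ANY": [2]})  # toy one-row input

# 1) One [0]/[1] prediction per model, as in ModelPredictor.make_predictions
predictions = [m.predict(user_input) for m in models]

# 2) Count positive votes and bucket them, mirroring evaluate_severity
positive_votes = int(np.concatenate(predictions).sum())
if positive_votes >= 13:
    severity = "Severe"
elif positive_votes >= 9:
    severity = "Moderate"
elif positive_votes >= 5:
    severity = "Low"
else:
    severity = "Very Low"
print(positive_votes, severity)  # prints "8 Low" for this toy ensemble

# 3) Hamming-distance nearest neighbours over the shared categorical columns,
#    as in the "Similar Patient" section (toy df stands in for the training CSV)
df = pd.DataFrame({"YMDEYR": [1, 2, 1, 2], "YMDERSUD5ANY": [2, 2, 1, 1]})
shared = [c for c in user_input.columns if c in df.columns]
distances = (df[shared] != user_input.iloc[0][shared]).sum(axis=1)
nearest = df.assign(distance=distances).nsmallest(2, "distance")
print(nearest)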