Spaces:

pantdipendra
/

AdolescentsMentalHealthPrediction

Running

App Files Files Community

pantdipendra committed 19 days ago

Commit

67c356a

verified ·

1 Parent(s): b782f65

Update app.py

Browse files

Files changed (1) hide show

app.py +276 -297

app.py CHANGED Viewed

@@ -8,13 +8,17 @@ import plotly.express as px
 # Load the training CSV once (outside the functions so it is read only once).
 df = pd.read_csv("X_train_Y_Train_merged_train.csv")
 class ModelPredictor:
     def __init__(self, model_path, model_filenames):
         self.model_path = model_path
         self.model_filenames = model_filenames
         self.models = self.load_models()
-        # For readability, you might want to keep only a few keys here if you want
-        # to demonstrate partial cross-tabs, etc.
         self.prediction_map = {
             "YOWRCONC": ["Did not have difficulty concentrating", "Had difficulty concentrating"],
             "YOSEEDOC": ["Did not feel the need to see a doctor", "Felt the need to see a doctor"],
@@ -91,7 +95,9 @@ class ModelPredictor:
         else:
             return "Mental health severity: Very Low"
-# List of model filenames
 model_filenames = [
     "YOWRCONC.pkl", "YOSEEDOC.pkl", "YO_MDEA5.pkl", "YOWRLSIN.pkl",
     "YODPPROB.pkl", "YOWRPROB.pkl", "YODPR2WK.pkl", "YOWRDEPR.pkl",
@@ -101,12 +107,60 @@ model_filenames = [
 model_path = "models/"
 predictor = ModelPredictor(model_path, model_filenames)
 def validate_inputs(*args):
     """Report whether every supplied field has been filled in.

     A field counts as unselected when it equals the empty string or is None.

     Returns:
         True when no argument is unselected (also when called with no
         arguments), otherwise False.
     """
     # any() stops at the first unselected field, like an early-return loop.
     return not any(arg == '' or arg is None for arg in args)
 def predict(
     YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
     YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
@@ -120,15 +174,11 @@ def predict(
       2) Aggregates results
       3) Produces an overall 'severity'
       4) Returns detailed per-model predictions
-      5) Returns bar charts about how many in the dataset share the same inputs/predicted labels
-      6) ***Now includes custom sections for:
-         - Total patient count (markdown)
-         - Cross-tab & grouped bar chart
-         - Similar Patient (Nearest Neighbors)
-         - Co-occurrence plot
     """
-    # Prepare user_input dataframe for prediction
     user_input_data = {
         'YNURSMDE': [int(YNURSMDE)],
         'YMDEYR': [int(YMDEYR)],
@@ -162,21 +212,20 @@ def predict(
     }
     user_input = pd.DataFrame(user_input_data)
-    # -----------------------
-    # 1) Make predictions
-    # -----------------------
     predictions = predictor.make_predictions(user_input)
-    # 2) Calculate majority vote (0 or 1) across all models
     majority_vote = predictor.get_majority_vote(predictions)
-    # 3) Count how many 1's in all predictions combined
     majority_vote_count = sum([1 for pred in np.concatenate(predictions) if pred == 1])
-    # 4) Evaluate severity
     severity = predictor.evaluate_severity(majority_vote_count)
-    # 5) Prepare detailed results for each model group
     results = {
         "Concentration_and_Decision_Making": [],
         "Sleep_and_Energy_Levels": [],
@@ -196,17 +245,15 @@ def predict(
                                                          "YOPB2WK"]
     }
     for i, pred in enumerate(predictions):
-        model_name = model_filenames[i].split('.')[0]  # e.g. 'YOWRCONC'
         pred_value = pred[0]
         # Map the prediction value to a human-readable string
         if model_name in predictor.prediction_map and pred_value in [0, 1]:
             result_text = f"Model {model_name}: {predictor.prediction_map[model_name][pred_value]}"
-        elif model_name in predictor.prediction_map:
-            # Out of known range => "Unknown"
-            result_text = f"Model {model_name}: Unknown prediction value {pred_value}"
         else:
-            result_text = f"Model {model_name}: Unknown model"
         # Append to the appropriate group
         found_group = False
@@ -216,318 +263,195 @@ def predict(
                 found_group = True
                 break
         if not found_group:
-            # If model doesn't match any group, skip or store it in a catch-all
             pass
-    # 6) Nicely format the results
     formatted_results = []
     for group, preds in results.items():
         if preds:
             formatted_results.append(f"Group {group.replace('_', ' ')}:")
             formatted_results.append("\n".join(preds))
             formatted_results.append("\n")
     formatted_results = "\n".join(formatted_results).strip()
     if len(formatted_results) == 0:
         formatted_results = "No predictions made. Please check your inputs."
-    # Heuristic: if too many unknown predictions, append note
-    num_unknown = len([
-        pred for group, preds in results.items()
-        for pred in preds if "Unknown prediction value" in pred or "Unknown model" in pred
-    ])
-    if num_unknown > len(model_filenames) / 2:
-        severity += " (Unknown prediction count is high. Please consult with a human.)"
-    # ------------------------
-    # ADDITIONAL FEATURES
-    # ------------------------
-    # A) Total Patient Count (instead of the old "Pie" chart)
     total_patients = len(df)
     total_patient_count_markdown = (
         "### Total Patient Count\n"
         f"There are **{total_patients}** total patients in the dataset.\n\n"
-        "This count can help you understand the overall dataset size. "
-        "All subsequent analyses are relative to these patients."
-    )
-    # B) Analyze Each Input Feature
-    #    For each feature in user_input, compute how many patients have that same value.
-    input_counts = {}
-    for col in user_input_data.keys():
-        val = user_input_data[col][0]
-        same_val_count = len(df[df[col] == val])
-        input_counts[col] = same_val_count
-    # Plot: Bar Chart for each input feature
-    bar_input_data = pd.DataFrame({
-        "Feature": list(input_counts.keys()),
-        "Count": list(input_counts.values())
-    })
-    fig_bar_input = px.bar(
-        bar_input_data,
-        x="Feature",
-        y="Count",
-        title="Number of Patients with the Same Value for Each Input Feature",
-        labels={"Feature": "Input Feature", "Count": "Number of Patients"}
     )
-    fig_bar_input.update_layout(xaxis={'categoryorder':'total descending'})
-    # C) Analyze Predicted Labels
-    # For each model's predicted label (0 or 1), count how many patients in the CSV
-    # have that label. We skip unknown if pred_value not in [0, 1].
-    label_counts = {}
-    for i, pred in enumerate(predictions):
-        model_name = model_filenames[i].split('.')[0]
-        pred_value = pred[0]
-        if pred_value in [0, 1]:
-            label_counts[model_name] = len(df[df[model_name] == pred_value])
-    if len(label_counts) > 0:
-        bar_label_data = pd.DataFrame({
-            "Model": list(label_counts.keys()),
-            "Count": list(label_counts.values())
-        })
-        fig_bar_labels = px.bar(
-            bar_label_data,
-            x="Model",
-            y="Count",
-            title="Number of Patients with the Predicted Label (0 or 1) by Model",
-            labels={"Model": "Predicted Column", "Count": "Number of Patients"}
-        )
-        fig_bar_labels.update_layout(xaxis={'categoryorder':'total descending'})
-    else:
-        # If everything was unknown, produce an empty figure or a fallback message
-        bar_label_data = pd.DataFrame({"Model": [], "Count": []})
-        fig_bar_labels = px.bar(
-            bar_label_data,
-            x="Model",
-            y="Count",
-            title="No valid predicted labels to display"
         )
-    # D) Cross-Tabulation & Grouped Bar Chart
-    #    Example: Show how a single input feature (YMDEYR) relates to one actual label (YOWRCONC).
-    #    For demonstration only — in practice you might do this for multiple features/labels.
-    # NOTE: If the columns don't exist in the dataset (some code merges them differently),
-    # you might adapt accordingly.
-    if "YMDEYR" in df.columns and "YOWRCONC" in df.columns:
-        cross_tab_data = df.groupby(["YMDEYR", "YOWRCONC"]).size().reset_index(name="count")
-        fig_cross_tab = px.bar(
-            cross_tab_data,
-            x="YMDEYR",
             y="count",
-            color="YOWRCONC",
-            barmode="group",
-            title="Cross-Tab: YMDEYR vs YOWRCONC (Grouped Bar Chart)",
-            labels={"YMDEYR": "Feature: YMDEYR", "YOWRCONC": "Label: YOWRCONC"}
         )
     else:
-        # Provide a fallback message if columns not found
-        fig_cross_tab = px.bar(title="YMDEYR or YOWRCONC not found in dataset. Cross-tab not available.")
-    # E) Similar Patient (Nearest Neighbors) via simple Hamming distance
-    #    We'll pick K=5 neighbors. Then see how many had label=0 vs label=1 for
-    #    one example label: YOWRCONC.
-    #    (You can adapt to do multiple labels, but that can get lengthy.)
-    def hamming_distance(row, user_row):
-        dist = 0
-        for c in user_row.index:
-            if row[c] != user_row[c]:
-                dist += 1
-        return dist
-    # Create a single row for easy iteration
     user_series = user_input.iloc[0]
-    # We'll compute distance for all rows in df on the same features
-    # that were used in the user_input.
-    # NOTE: In real usage, confirm these columns exist in df.
-    # If df lacks them or is encoded differently, you'd adapt.
-    features_to_compare = list(user_input.columns)
-    # For Hamming, ensure we pick only the columns present in df
-    features_to_compare = [f for f in features_to_compare if f in df.columns]
-    # Build a DataFrame we can safely compare
-    subset_df = df[features_to_compare].copy()
-    # Calculate distances
     distances = []
-    for idx, row in subset_df.iterrows():
         d = 0
         for col in features_to_compare:
             if row[col] != user_series[col]:
                 d += 1
         distances.append(d)
-    # Attach distances
     df_with_dist = df.copy()
     df_with_dist["distance"] = distances
-    # Sort by distance ascending, pick top K=5
     K = 5
     nearest_neighbors = df_with_dist.sort_values("distance", ascending=True).head(K)
-    # For demonstration, let's show how many had YOWRCONC=0 vs. 1
-    nn_label_0 = nn_label_1 = 0
-    if "YOWRCONC" in nearest_neighbors.columns:
-        nn_label_0 = len(nearest_neighbors[nearest_neighbors["YOWRCONC"] == 0])
-        nn_label_1 = len(nearest_neighbors[nearest_neighbors["YOWRCONC"] == 1])
-    # Summarize in markdown
     similar_patient_markdown = (
         "### Nearest Neighbors (Simple Hamming Distance)\n"
-        f"We searched for the top **{K}** patients in the dataset whose categorical features "
-        "most closely match your input (Hamming distance).\n\n"
-        "**For the label `YOWRCONC`** among these neighbors:\n"
-        f"- {nn_label_0} had label=0\n"
-        f"- {nn_label_1} had label=1\n\n"
-        "(This is a simple illustration. In real practice, you'd refine which columns to use, "
-        "how to encode them, and how many neighbors to consider.)"
     )
-    # F) Co-Occurrence Plot
-    #    Example: How two features (YMDEYR, YMDERSUD5ANY) combine with label (YOWRCONC).
-    #    We'll produce a multi-way distribution using facet_col.
-    if all(col in df.columns for col in ["YMDEYR", "YMDERSUD5ANY", "YOWRCONC"]):
-        co_occ_data = df.groupby(["YMDEYR", "YMDERSUD5ANY", "YOWRCONC"]).size().reset_index(name="count")
-        fig_co_occ = px.bar(
-            co_occ_data,
-            x="YMDEYR",
-            y="count",
-            color="YOWRCONC",
-            facet_col="YMDERSUD5ANY",
-            title="Co-Occurrence Plot: YMDEYR and YMDERSUD5ANY vs YOWRCONC"
-        )
-    else:
-        fig_co_occ = px.bar(title="Co-occurrence plot not available (columns not found).")
-    # ------------------------
-    # Return everything
-    # ------------------------
-    # We now have 8 items to return:
-    #  1) Prediction Results (Textbox)
-    #  2) Mental Health Severity (Textbox)
-    #  3) Total Patient Count (Markdown)
-    #  4) Cross-Tab & Grouped Bar Chart (Plot)
-    #  5) Nearest Neighbors Summary (Markdown)
-    #  6) Co-Occurrence Plot (Plot)
-    #  7) Bar Chart for input features (Plot)
-    #  8) Bar Chart for predicted labels (Plot)
     return (
-        formatted_results,
-        severity,
-        total_patient_count_markdown,
-        fig_cross_tab,
-        similar_patient_markdown,
-        fig_co_occ,
-        fig_bar_input,
-        fig_bar_labels
     )
-# -----------------------------------------------------------------------------
-# MAPPING user-friendly text => numeric values
-# -----------------------------------------------------------------------------
-input_mapping = {
-    'YNURSMDE': {"Yes": 1, "No": 0},
-    'YMDEYR': {"Yes": 1, "No": 2},
-    'YSOCMDE': {"Yes": 1, "No": 0},
-    'YMDESUD5ANYO': {"SUD only, no MDE": 1, "MDE only, no SUD": 2, "SUD and MDE": 3, "Neither SUD or MDE": 4},
-    'YMSUD5YANY': {"Yes": 1, "No": 0},
-    'YUSUITHK': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
-    'YMDETXRX': {"Yes": 1, "No": 0},
-    'YUSUITHKYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
-    'YMDERSUD5ANY': {"Yes": 1, "No": 0},
-    'YUSUIPLNYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
-    'YCOUNMDE': {"Yes": 1, "No": 0},
-    'YPSY1MDE': {"Yes": 1, "No": 0},
-    'YHLTMDE': {"Yes": 1, "No": 0},
-    'YDOCMDE': {"Yes": 1, "No": 0},
-    'YPSY2MDE': {"Yes": 1, "No": 0},
-    'YMDEHARX': {"Yes": 1, "No": 0},
-    'LVLDIFMEM2': {"No Difficulty": 1, "Some difficulty": 2, "A lot of difficulty or cannot do at all": 3},
-    'MDEIMPY': {"Yes": 1, "No": 2},
-    'YMDEHPO': {"Yes": 1, "No": 0},
-    'YMIMS5YANY': {"Yes": 1, "No": 0},
-    'YMDEIMAD5YR': {"Yes": 1, "No": 0},
-    'YMIUD5YANY': {"Yes": 1, "No": 0},
-    'YMDEHPRX': {"Yes": 1, "No": 0},
-    'YMIMI5YANY': {"Yes": 1, "No": 0},
-    'YUSUIPLN': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
-    'YTXMDEYR': {"Yes": 1, "No": 0},
-    'YMDEAUD5YR': {"Yes": 1, "No": 0},
-    'YRXMDEYR': {"Yes": 1, "No": 0},
-    'YMDELT': {"Yes": 1, "No": 2}
-}
-# -----------------------------------------------------------------------------
-# Create the Gradio interface
-# -----------------------------------------------------------------------------
-# We have 8 outputs now:
-#   1) Prediction Results (Textbox)
-#   2) Mental Health Severity (Textbox)
-#   3) Total Patient Count (Markdown)
-#   4) Cross-Tab & Grouped Bar Chart (Plot)
-#   5) Nearest Neighbors Summary (Markdown)
-#   6) Co-Occurrence Plot (Plot)
-#   7) Bar Chart for input features (Plot)
-#   8) Bar Chart for predicted labels (Plot)
-# Define the "inputs" in the same order used in the function signature
-inputs = [
-    ################# Ordered and grouped ##########################
-    # Questions related to Major Depressive Episode (MDE) and related impairments or disorders
-    gr.Dropdown(list(input_mapping['YMDEYR'].keys()), label="YMDEYR: PAST YEARS MAJOR DEPRESSIVE EPISODE"),
-    gr.Dropdown(list(input_mapping['YMDERSUD5ANY'].keys()), label="YMDERSUD5ANY: MDE OR SUBSTANCE USE DISORDER - ANY"),
-    gr.Dropdown(list(input_mapping['YMDEIMAD5YR'].keys()), label="YMDEIMAD5YR: MDE WITH SEV. IMP + ALCOHOL USE DISORDER"),
-    gr.Dropdown(list(input_mapping['YMIMS5YANY'].keys()), label="YMIMS5YANY: MDE W/ SEV. IMP + SUBSTANCE USE DISORDER"),
-    gr.Dropdown(list(input_mapping['YMDELT'].keys()), label="YMDELT: HAD MAJOR DEPRESSIVE EPISODE IN LIFETIME"),
-    gr.Dropdown(list(input_mapping['YMDEHARX'].keys()), label="YMDEHARX: SAW HEALTH PROF + MEDS FOR MDE"),
-    gr.Dropdown(list(input_mapping['YMDEHPRX'].keys()), label="YMDEHPRX: SAW HEALTH PROF OR MEDS FOR MDE"),
-    gr.Dropdown(list(input_mapping['YMDETXRX'].keys()), label="YMDETXRX: RECEIVED TREATMENT/COUNSELING FOR MDE"),
-    gr.Dropdown(list(input_mapping['YMDEHPO'].keys()), label="YMDEHPO: SAW HEALTH PROF ONLY FOR MDE"),
-    gr.Dropdown(list(input_mapping['YMDEAUD5YR'].keys()), label="YMDEAUD5YR: MDE + ALCOHOL USE DISORDER"),
-    gr.Dropdown(list(input_mapping['YMIMI5YANY'].keys()), label="YMIMI5YANY: MDE W/ ILL DRUG USE DISORDER"),
-    gr.Dropdown(list(input_mapping['YMIUD5YANY'].keys()), label="YMIUD5YANY: MDE + ILL DRUG USE DISORDER"),
-    gr.Dropdown(list(input_mapping['YMDESUD5ANYO'].keys()), label="YMDESUD5ANYO: MDE vs. SUD vs. BOTH vs. NEITHER"),
-    # Questions related to consultations with professionals about MDE
-    gr.Dropdown(list(input_mapping['YNURSMDE'].keys()), label="YNURSMDE: SAW/TALK TO NURSE/OT ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YSOCMDE'].keys()), label="YSOCMDE: SAW/TALK TO SOCIAL WORKER ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YCOUNMDE'].keys()), label="YCOUNMDE: SAW/TALK TO COUNSELOR ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YPSY1MDE'].keys()), label="YPSY1MDE: SAW/TALK TO PSYCHOLOGIST ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YPSY2MDE'].keys()), label="YPSY2MDE: SAW/TALK TO PSYCHIATRIST ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YHLTMDE'].keys()), label="YHLTMDE: SAW/TALK TO HEALTH PROFESSIONAL ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YDOCMDE'].keys()), label="YDOCMDE: SAW/TALK TO GP/FAMILY MD ABOUT MDE"),
-    gr.Dropdown(list(input_mapping['YTXMDEYR'].keys()), label="YTXMDEYR: SAW/TALK DOCTOR/HEALTH PROF FOR MDE"),
-    # Questions related to suicidal thoughts and plans
-    gr.Dropdown(list(input_mapping['YUSUITHKYR'].keys()), label="YUSUITHKYR: SERIOUSLY THOUGHT ABOUT KILLING SELF"),
-    gr.Dropdown(list(input_mapping['YUSUIPLNYR'].keys()), label="YUSUIPLNYR: MADE PLANS TO KILL SELF"),
-    gr.Dropdown(list(input_mapping['YUSUITHK'].keys()), label="YUSUITHK: THINK ABOUT KILLING SELF (12 MONTHS)"),
-    gr.Dropdown(list(input_mapping['YUSUIPLN'].keys()), label="YUSUIPLN: MADE PLANS TO KILL SELF (12 MONTHS)"),
-    # Questions related to impairment due to MDE
-    gr.Dropdown(list(input_mapping['MDEIMPY'].keys()), label="MDEIMPY: MDE W/ SEVERE ROLE IMPAIRMENT"),
-    gr.Dropdown(list(input_mapping['LVLDIFMEM2'].keys()), label="LVLDIFMEM2: LEVEL OF DIFFICULTY REMEMBERING/CONCENTRATING"),
-    gr.Dropdown(list(input_mapping['YMSUD5YANY'].keys()), label="YMSUD5YANY: MDE + SUBSTANCE USE DISORDER - ANY"),
-    gr.Dropdown(list(input_mapping['YRXMDEYR'].keys()), label="YRXMDEYR: USED MEDS FOR MDE IN PAST YEAR"),
-]
-# We now have 8 outputs in total:
-outputs = [
-    gr.Textbox(label="Prediction Results", lines=30),
-    gr.Textbox(label="Mental Health Severity", lines=4),
-    gr.Markdown(label="Total Patient Count"),
-    gr.Plot(label="Cross-Tab & Grouped Bar Chart"),
-    gr.Markdown(label="Nearest Neighbors Summary"),
-    gr.Plot(label="Co-Occurrence Plot"),
-    gr.Plot(label="Number of Patients per Input Feature"),
-    gr.Plot(label="Number of Patients with Predicted Labels")
-]
 def predict_with_text(
     YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
     YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
@@ -547,11 +471,9 @@ def predict_with_text(
             "Please select all required fields.",  # Prediction Results
             "Validation Error",                    # Severity
             "No data",                             # Total Patient Count
-            None,                                  # Cross-Tab figure
             "No data",                             # Nearest Neighbors
-            None,                                  # Co-Occurrence
-            None,                                  # Input Features Bar
-            None                                   # Predicted Labels Bar
         )
     # Map from user-friendly text to int
@@ -590,6 +512,63 @@ def predict_with_text(
     # Pass our mapped values into the original 'predict' function
     return predict(**user_inputs)
 # Custom CSS (optional)
 custom_css = """
     .gradio-container * {
@@ -608,13 +587,13 @@ custom_css = """
     }
 """
-# Finally, launch the app with 8 outputs
 interface = gr.Interface(
-    fn=predict_with_text,
-    inputs=inputs,
-    outputs=outputs,
-    title="Adolescents with Substance Use Mental Health Screening (NSDUH Data)",
-    css=custom_css
 )
 if __name__ == "__main__":

 # Load the training CSV once (outside the functions so it is read only once).
 df = pd.read_csv("X_train_Y_Train_merged_train.csv")
+###############################################################################
+# 1) Model Predictor class
+###############################################################################
 class ModelPredictor:
     def __init__(self, model_path, model_filenames):
         self.model_path = model_path
         self.model_filenames = model_filenames
         self.models = self.load_models()
+        # For each model name, define the mapping from 0->..., 1->...
+        # If you have more labels, expand this dictionary accordingly.
         self.prediction_map = {
             "YOWRCONC": ["Did not have difficulty concentrating", "Had difficulty concentrating"],
             "YOSEEDOC": ["Did not feel the need to see a doctor", "Felt the need to see a doctor"],
         else:
             return "Mental health severity: Very Low"
+###############################################################################
+# 2) Model Filenames & Predictor
+###############################################################################
 model_filenames = [
     "YOWRCONC.pkl", "YOSEEDOC.pkl", "YO_MDEA5.pkl", "YOWRLSIN.pkl",
     "YODPPROB.pkl", "YOWRPROB.pkl", "YODPR2WK.pkl", "YOWRDEPR.pkl",
 model_path = "models/"
 predictor = ModelPredictor(model_path, model_filenames)
+###############################################################################
+# 3) Validate Inputs
+###############################################################################
 def validate_inputs(*args):
     """Report whether every supplied field has been filled in.

     A field counts as unselected when it equals the empty string or is None.

     Returns:
         True when no argument is unselected (also when called with no
         arguments), otherwise False.
     """
     # any() stops at the first unselected field, like an early-return loop.
     return not any(arg == '' or arg is None for arg in args)
+###############################################################################
+# 4) Reverse Lookup (numeric -> user-friendly text) for input columns
+###############################################################################
+# We'll define the forward mapping here. The reverse mapping is constructed below.
+input_mapping = {
+    'YNURSMDE': {"Yes": 1, "No": 0},
+    'YMDEYR': {"Yes": 1, "No": 2},
+    'YSOCMDE': {"Yes": 1, "No": 0},
+    'YMDESUD5ANYO': {"SUD only, no MDE": 1, "MDE only, no SUD": 2, "SUD and MDE": 3, "Neither SUD or MDE": 4},
+    'YMSUD5YANY': {"Yes": 1, "No": 0},
+    'YUSUITHK': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
+    'YMDETXRX': {"Yes": 1, "No": 0},
+    'YUSUITHKYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
+    'YMDERSUD5ANY': {"Yes": 1, "No": 0},
+    'YUSUIPLNYR': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
+    'YCOUNMDE': {"Yes": 1, "No": 0},
+    'YPSY1MDE': {"Yes": 1, "No": 0},
+    'YHLTMDE': {"Yes": 1, "No": 0},
+    'YDOCMDE': {"Yes": 1, "No": 0},
+    'YPSY2MDE': {"Yes": 1, "No": 0},
+    'YMDEHARX': {"Yes": 1, "No": 0},
+    'LVLDIFMEM2': {"No Difficulty": 1, "Some difficulty": 2, "A lot of difficulty or cannot do at all": 3},
+    'MDEIMPY': {"Yes": 1, "No": 2},
+    'YMDEHPO': {"Yes": 1, "No": 0},
+    'YMIMS5YANY': {"Yes": 1, "No": 0},
+    'YMDEIMAD5YR': {"Yes": 1, "No": 0},
+    'YMIUD5YANY': {"Yes": 1, "No": 0},
+    'YMDEHPRX': {"Yes": 1, "No": 0},
+    'YMIMI5YANY': {"Yes": 1, "No": 0},
+    'YUSUIPLN': {"Yes": 1, "No": 2, "I'm not sure": 3, "I don't want to answer": 4},
+    'YTXMDEYR': {"Yes": 1, "No": 0},
+    'YMDEAUD5YR': {"Yes": 1, "No": 0},
+    'YRXMDEYR': {"Yes": 1, "No": 0},
+    'YMDELT': {"Yes": 1, "No": 2}
+}
+# Build reverse mapping: { "YNURSMDE": {1: "Yes", 0: "No"}, ... } etc.
+reverse_mapping = {}
+for col, mapping_dict in input_mapping.items():
+    rev = {v: k for k, v in mapping_dict.items()}  # invert dict
+    reverse_mapping[col] = rev
+###############################################################################
+# 5) Main Predict Function
+###############################################################################
 def predict(
     YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
     YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
       2) Aggregates results
       3) Produces an overall 'severity'
       4) Returns detailed per-model predictions
+      5) Creates a distribution plot for ALL input features vs. a chosen label
+      6) Nearest neighbor logic (with disclaimers), mapping numeric -> user text
     """
+    # 1) Prepare user_input dataframe
     user_input_data = {
         'YNURSMDE': [int(YNURSMDE)],
         'YMDEYR': [int(YMDEYR)],
     }
     user_input = pd.DataFrame(user_input_data)
+    # 2) Make predictions
     predictions = predictor.make_predictions(user_input)
+    # 3) Calculate majority vote (0 or 1) across all models
     majority_vote = predictor.get_majority_vote(predictions)
+    # 4) Count how many 1's in all predictions combined
     majority_vote_count = sum([1 for pred in np.concatenate(predictions) if pred == 1])
+    # 5) Evaluate severity
     severity = predictor.evaluate_severity(majority_vote_count)
+    # 6) Prepare per-model predictions
+    #    We'll group them just like before
     results = {
         "Concentration_and_Decision_Making": [],
         "Sleep_and_Energy_Levels": [],
                                                          "YOPB2WK"]
     }
+    # We'll keep a record of which model => which predicted label
     for i, pred in enumerate(predictions):
+        model_name = predictor.model_filenames[i].split('.')[0]
         pred_value = pred[0]
         # Map the prediction value to a human-readable string
         if model_name in predictor.prediction_map and pred_value in [0, 1]:
             result_text = f"Model {model_name}: {predictor.prediction_map[model_name][pred_value]}"
         else:
+            result_text = f"Model {model_name}: Unknown or out-of-range"
         # Append to the appropriate group
         found_group = False
                 found_group = True
                 break
         if not found_group:
+            # If no group matches, skip or store in "Other"
             pass
+    # 7) Nicely format the results
     formatted_results = []
     for group, preds in results.items():
         if preds:
             formatted_results.append(f"Group {group.replace('_', ' ')}:")
             formatted_results.append("\n".join(preds))
             formatted_results.append("\n")
     formatted_results = "\n".join(formatted_results).strip()
     if len(formatted_results) == 0:
         formatted_results = "No predictions made. Please check your inputs."
+    # 8) Additional disclaimers if there's a large fraction of unknown
+    num_unknown = sum(1 for group, preds in results.items() if any("Unknown or out-of-range" in p for p in preds))
+    if num_unknown > len(predictor.model_filenames) / 2:
+        severity += " (Unknown prediction count is high. Please consult with a human.)"
+    ############################################################################
+    # A) Total Patient Count
+    ############################################################################
     total_patients = len(df)
     total_patient_count_markdown = (
         "### Total Patient Count\n"
         f"There are **{total_patients}** total patients in the dataset.\n\n"
+        "This number helps you understand the size of the dataset used."
     )
+    ############################################################################
+    # B) Distribution Plot: All Input Features vs. a single predicted label
+    ############################################################################
+    # For demonstration, let's pick "YOWRCONC" if it exists in df:
+    # We'll melt the dataset so that each input feature is in a "FeatureName" column,
+    # and each distinct category is in "FeatureValue". We'll group by those + label to get counts.
+    chosen_label = "YOWRCONC"
+    if chosen_label in df.columns:
+        # 1) Narrow down to the columns of interest
+        #    We'll only use the input features that exist in df
+        input_cols_in_df = [c for c in user_input_data.keys() if c in df.columns]
+        # 2) We'll create a "melted" version of these input features
+        #    i.e., row per (patient_id, FeatureName, FeatureValue)
+        sub_df = df[input_cols_in_df + [chosen_label]].copy()
+        # Melt them
+        melted = sub_df.melt(
+            id_vars=[chosen_label],
+            var_name="FeatureName",
+            value_name="FeatureValue"
         )
+        # 3) Group by (FeatureName, FeatureValue, chosen_label) to get size
+        dist_data = melted.groupby(["FeatureName", "FeatureValue", chosen_label]).size().reset_index(name="count")
+        # 4) We'll try to map FeatureValue from numeric -> user-friendly text if possible
+        #    We'll do it only if FeatureName is in reverse_mapping.
+        def map_value(row):
+            fn = row["FeatureName"]
+            fv = row["FeatureValue"]
+            if fn in reverse_mapping:
+                if fv in reverse_mapping[fn]:
+                    return reverse_mapping[fn][fv]  # e.g. 1->"Yes"
+            return fv  # fallback
+        dist_data["FeatureValueText"] = dist_data.apply(map_value, axis=1)
+        # 5) Similarly, map chosen_label (0 or 1) to text if in predictor.prediction_map
+        if chosen_label in predictor.prediction_map:
+            def map_label(val):
+                if val in [0, 1]:
+                    return predictor.prediction_map[chosen_label][val]
+                return f"Unknown label {val}"
+            dist_data["LabelText"] = dist_data[chosen_label].apply(map_label)
+        else:
+            dist_data["LabelText"] = dist_data[chosen_label].astype(str)
+        # 6) Now produce a bar chart with facet_col = FeatureName
+        fig_distribution = px.bar(
+            dist_data,
+            x="FeatureValueText",
             y="count",
+            color="LabelText",
+            facet_col="FeatureName",
+            facet_col_wrap=4,     # how many facets per row
+            title=f"Distribution of All Input Features vs. {chosen_label}",
+            height=800
         )
+        fig_distribution.update_layout(legend=dict(title=chosen_label))
+        # (Optional) Adjust layout or text angle if you have many categories
+        fig_distribution.update_xaxes(tickangle=45)
     else:
+        # Fallback
+        fig_distribution = px.bar(title=f"Label {chosen_label} not found in dataset. Distribution not available.")
+    ############################################################################
+    # C) Nearest Neighbors (Hamming Distance) with disclaimers & user-friendly text
+    ############################################################################
+    # "Nearest neighbor" methods for high-dimensional or purely categorical data can be non-trivial.
+    # This demo uses a Hamming distance over all input features, picks K=5.
+    # In real practice, you'd refine which features to use, how to encode them, etc.
+    # 1) Build a DataFrame to compare with the user_input
+    features_to_compare = [col for col in user_input_data if col in df.columns]
     user_series = user_input.iloc[0]
+    # 2) Compute distances
     distances = []
+    for idx, row in df[features_to_compare].iterrows():
         d = 0
         for col in features_to_compare:
             if row[col] != user_series[col]:
                 d += 1
         distances.append(d)
     df_with_dist = df.copy()
     df_with_dist["distance"] = distances
+    # 3) Sort and pick top K=5
     K = 5
     nearest_neighbors = df_with_dist.sort_values("distance", ascending=True).head(K)
+    # 4) Show how many had the chosen_label=0 vs 1, but also map them
+    #    We'll also demonstrate showing user-friendly text for each neighbor's feature values.
+    #    However, if you have large K or many features, this can be big.
+    if chosen_label in nearest_neighbors.columns:
+        nn_label_0 = len(nearest_neighbors[nearest_neighbors[chosen_label] == 0])
+        nn_label_1 = len(nearest_neighbors[nearest_neighbors[chosen_label] == 1])
+        if chosen_label in predictor.prediction_map:
+            label0_text = predictor.prediction_map[chosen_label][0]
+            label1_text = predictor.prediction_map[chosen_label][1]
+        else:
+            label0_text = "Label=0"
+            label1_text = "Label=1"
+    else:
+        nn_label_0 = nn_label_1 = 0
+        label0_text = "Label=0"
+        label1_text = "Label=1"
+    # 5) Build an example table of those neighbors in user-friendly text
+    neighbor_text_rows = []
+    for idx, nn_row in nearest_neighbors.iterrows():
+        # For each feature, map numeric -> user text
+        row_str_parts = []
+        row_str_parts.append(f"distance={nn_row['distance']}")
+        for fcol in features_to_compare:
+            val = nn_row[fcol]
+            # try to map
+            if fcol in reverse_mapping and val in reverse_mapping[fcol]:
+                val_str = reverse_mapping[fcol][val]
+            else:
+                val_str = str(val)
+            row_str_parts.append(f"{fcol}={val_str}")
+        # For the label
+        if chosen_label in nn_row:
+            lbl_val = nn_row[chosen_label]
+            if chosen_label in predictor.prediction_map and lbl_val in [0, 1]:
+                lbl_str = predictor.prediction_map[chosen_label][lbl_val]
+            else:
+                lbl_str = str(lbl_val)
+            row_str_parts.append(f"{chosen_label}={lbl_str}")
+        neighbor_text_rows.append(" | ".join(row_str_parts))
+    neighbor_text_block = "\n".join(neighbor_text_rows)
     similar_patient_markdown = (
         "### Nearest Neighbors (Simple Hamming Distance)\n"
+        "“Nearest neighbor” methods for high-dimensional or purely categorical data can be non-trivial. "
+        "This demo simply uses a Hamming distance over all input features and picks **K=5** neighbors.\n\n"
+        "In a real application, you would refine which features are most relevant, how to encode them, "
+        "and how many neighbors to select.\n\n"
+        f"Among these **{K}** nearest neighbors:\n"
+        f"- **{nn_label_0}** had {label0_text}\n"
+        f"- **{nn_label_1}** had {label1_text}\n\n"
+        "Below is a breakdown of each neighbor's key features in user-friendly text:\n\n"
+        f"```\n{neighbor_text_block}\n```"
     )
+    ############################################################################
+    # Return 8 outputs
+    ############################################################################
     return (
+        formatted_results,              # 1) Prediction results (Textbox)
+        severity,                       # 2) Mental Health Severity (Textbox)
+        total_patient_count_markdown,   # 3) Total Patient Count (Markdown)
+        fig_distribution,               # 4) Distribution Plot (Plot)
+        similar_patient_markdown,       # 5) Nearest Neighbor Summary (Markdown)
+        None,                           # 6) Placeholder if you need more plots
+        None,                           # 7) Another placeholder
+        None                            # 8) Another placeholder
     )
+###############################################################################
+# 6) Gradio Interface: We'll keep 8 outputs, but only use some in this demo
+###############################################################################
 def predict_with_text(
     YMDEYR, YMDERSUD5ANY, YMDEIMAD5YR, YMIMS5YANY, YMDELT, YMDEHARX,
     YMDEHPRX, YMDETXRX, YMDEHPO, YMDEAUD5YR, YMIMI5YANY, YMIUD5YANY,
             "Please select all required fields.",  # Prediction Results
             "Validation Error",                    # Severity
             "No data",                             # Total Patient Count
+            None,                                  # Distribution Plot
             "No data",                             # Nearest Neighbors
+            None, None, None                       # Placeholders
         )
     # Map from user-friendly text to int
     # Pass our mapped values into the original 'predict' function
     return predict(**user_inputs)
+###############################################################################
+# 7) Define and Launch Gradio Interface
+###############################################################################
+import sys  # NOTE(review): no use of sys is visible in this part of the file; confirm it is needed
+# Eight output components, in the same order as the 8-tuple returned by predict()/predict_with_text(); the last three are placeholders that receive None.
+outputs = [
+    gr.Textbox(label="Prediction Results", lines=30),  # 1) formatted prediction results
+    gr.Textbox(label="Mental Health Severity", lines=4),  # 2) overall severity text
+    gr.Markdown(label="Total Patient Count"),  # 3) total patient count markdown
+    gr.Plot(label="Distribution of All Input Features vs. One Label"),  # 4) faceted distribution bar chart
+    gr.Markdown(label="Nearest Neighbors Summary"),  # 5) Hamming-distance nearest-neighbor summary
+    gr.Plot(label="Placeholder Plot"),  # 6) unused placeholder
+    gr.Plot(label="Placeholder Plot"),  # 7) unused placeholder
+    gr.Plot(label="Placeholder Plot")  # 8) unused placeholder
+]
+# Input components: one dropdown per feature, with choices taken from the keys of input_mapping[<feature>]. Values are passed positionally, so presumably this order must match the parameters of predict_with_text (confirm).
+inputs = [
+    # Major Depressive Episode (MDE) questions
+    gr.Dropdown(list(input_mapping['YMDEYR'].keys()), label="YMDEYR: PAST YEAR MDE?"),
+    gr.Dropdown(list(input_mapping['YMDERSUD5ANY'].keys()), label="YMDERSUD5ANY: MDE OR SUBSTANCE USE DISORDER - ANY"),
+    gr.Dropdown(list(input_mapping['YMDEIMAD5YR'].keys()), label="YMDEIMAD5YR: MDE + ALCOHOL USE DISORDER?"),  # NOTE(review): near-identical label text to YMDEAUD5YR below; confirm wording
+    gr.Dropdown(list(input_mapping['YMIMS5YANY'].keys()), label="YMIMS5YANY: MDE + SUBSTANCE USE DISORDER?"),  # NOTE(review): same label text as YMSUD5YANY below; confirm wording
+    gr.Dropdown(list(input_mapping['YMDELT'].keys()), label="YMDELT: EVER HAD MDE LIFETIME?"),
+    gr.Dropdown(list(input_mapping['YMDEHARX'].keys()), label="YMDEHARX: SAW HEALTH PROF + MEDS FOR MDE"),
+    gr.Dropdown(list(input_mapping['YMDEHPRX'].keys()), label="YMDEHPRX: SAW HEALTH PROF OR MEDS FOR MDE"),
+    gr.Dropdown(list(input_mapping['YMDETXRX'].keys()), label="YMDETXRX: TREATMENT/COUNSELING FOR MDE"),
+    gr.Dropdown(list(input_mapping['YMDEHPO'].keys()), label="YMDEHPO: HEALTH PROF ONLY FOR MDE"),
+    gr.Dropdown(list(input_mapping['YMDEAUD5YR'].keys()), label="YMDEAUD5YR: MDE + ALCOHOL USE DISORDER"),
+    gr.Dropdown(list(input_mapping['YMIMI5YANY'].keys()), label="YMIMI5YANY: MDE + ILL DRUG USE DISORDER"),  # NOTE(review): same label text as YMIUD5YANY on the next line; possibly a copy-paste, confirm
+    gr.Dropdown(list(input_mapping['YMIUD5YANY'].keys()), label="YMIUD5YANY: MDE + ILL DRUG USE DISORDER"),
+    gr.Dropdown(list(input_mapping['YMDESUD5ANYO'].keys()), label="YMDESUD5ANYO: MDE vs. SUD vs. BOTH vs. NEITHER"),
+    # Consultations (which kind of professional was seen for MDE)
+    gr.Dropdown(list(input_mapping['YNURSMDE'].keys()), label="YNURSMDE: NURSE / OT FOR MDE"),
+    gr.Dropdown(list(input_mapping['YSOCMDE'].keys()), label="YSOCMDE: SOCIAL WORKER FOR MDE"),
+    gr.Dropdown(list(input_mapping['YCOUNMDE'].keys()), label="YCOUNMDE: COUNSELOR FOR MDE"),
+    gr.Dropdown(list(input_mapping['YPSY1MDE'].keys()), label="YPSY1MDE: PSYCHOLOGIST FOR MDE"),
+    gr.Dropdown(list(input_mapping['YPSY2MDE'].keys()), label="YPSY2MDE: PSYCHIATRIST FOR MDE"),
+    gr.Dropdown(list(input_mapping['YHLTMDE'].keys()), label="YHLTMDE: HEALTH PROF FOR MDE"),
+    gr.Dropdown(list(input_mapping['YDOCMDE'].keys()), label="YDOCMDE: GP/FAMILY MD FOR MDE"),
+    gr.Dropdown(list(input_mapping['YTXMDEYR'].keys()), label="YTXMDEYR: DOCTOR/HEALTH PROF FOR MDE THIS YEAR"),
+    # Suicidal thoughts / plans
+    gr.Dropdown(list(input_mapping['YUSUITHKYR'].keys()), label="YUSUITHKYR: SERIOUSLY THOUGHT ABOUT KILLING SELF"),
+    gr.Dropdown(list(input_mapping['YUSUIPLNYR'].keys()), label="YUSUIPLNYR: MADE PLANS TO KILL SELF"),
+    gr.Dropdown(list(input_mapping['YUSUITHK'].keys()), label="YUSUITHK: THINK ABOUT KILLING SELF (12 MONTHS)"),
+    gr.Dropdown(list(input_mapping['YUSUIPLN'].keys()), label="YUSUIPLN: MADE PLANS TO KILL SELF (12 MONTHS)"),
+    # Impairment (this group also holds the remaining substance-use and medication dropdowns)
+    gr.Dropdown(list(input_mapping['MDEIMPY'].keys()), label="MDEIMPY: MDE WITH SEVERE ROLE IMPAIRMENT?"),
+    gr.Dropdown(list(input_mapping['LVLDIFMEM2'].keys()), label="LVLDIFMEM2: DIFFICULTY REMEMBERING/CONCENTRATING"),
+    gr.Dropdown(list(input_mapping['YMSUD5YANY'].keys()), label="YMSUD5YANY: MDE + SUBSTANCE USE DISORDER?"),
+    gr.Dropdown(list(input_mapping['YRXMDEYR'].keys()), label="YRXMDEYR: USED MEDS FOR MDE IN PAST YEAR?")
+]
 # Custom CSS (optional); passed to gr.Interface through its css argument below
 custom_css = """
     .gradio-container * {
     }
 """
+# Build the Gradio interface: predict_with_text receives the dropdown values from `inputs`, and its 8 return values fill the components in `outputs`.
 interface = gr.Interface(
+    fn=predict_with_text,  # callback wrapping predict(); returns an 8-tuple
+    inputs=inputs,  # dropdown components defined above
+    outputs=outputs,  # 8 output components (3 are placeholders)
+    title="Adolescents with Substance Use Mental Health Screening (NSDUH Data)",
+    css=custom_css,  # custom stylesheet string defined above
 )
 if __name__ == "__main__":