import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import folium
from streamlit_folium import st_folium
from datetime import timedelta

# ----------------------------------------------------
# 1. Load data
# ----------------------------------------------------
@st.cache_data
def load_data():
    """Read the daily and monthly datasets from local CSV files.

    The result is cached by Streamlit via ``st.cache_data``.

    Returns:
        A ``(daily, monthly)`` tuple of DataFrames. The daily frame has its
        "date" column parsed as datetimes; the monthly frame is returned
        exactly as read (no "date" column is added here).
    """
    daily = pd.read_csv("daily_data_2013_2024.csv", parse_dates=["date"])
    monthly = pd.read_csv("monthly_data_2013_2024.csv")
    return daily, monthly


daily_data, monthly_data = load_data()

# Latitude/longitude of every location, keyed by location name, so that
# each one can be placed on the map.
LOCATIONS = {
    name: {"lat": lat, "lon": lon}
    for name, lat, lon in (
        ("Karagwe", -1.7718, 30.9876),
        ("Masasi", -10.7167, 38.8000),
        ("Igunga", -4.2833, 33.8833),
    )
}

# ----------------------------------------------------
# 2.
# Streamlit UI Layout
# ----------------------------------------------------
st.title("Malaria & Dengue Outbreak Analysis (2013–2024)")
st.sidebar.header("Filters & Options")

# Disease to focus on.
disease_choice = st.sidebar.radio("Select Disease", ["Malaria", "Dengue"], index=0)

# Data granularity: decides whether the monthly or the daily dataset is used.
data_choice = st.sidebar.radio("Data Granularity", ["Monthly", "Daily"], index=0)

# Location filter; every known location is selected by default.
location_list = list(LOCATIONS)
selected_locations = st.sidebar.multiselect(
    "Select Location(s)", location_list, default=location_list
)

if data_choice == "Monthly":
    # Monthly data is narrowed by an inclusive year range.
    year_min = int(monthly_data["year"].min())
    year_max = int(monthly_data["year"].max())
    year_range = st.sidebar.slider(
        "Select Year Range",
        min_value=year_min,
        max_value=year_max,
        value=(year_min, year_max),
        step=1,
    )
else:
    # Daily data is narrowed by an inclusive date range.
    date_min = daily_data["date"].min()
    date_max = daily_data["date"].max()
    date_range = st.sidebar.date_input(
        "Select Date Range",
        [date_min, date_max],
        min_value=date_min,
        max_value=date_max,
    )
    # In range mode the widget hands back fewer than two dates while the
    # user is still picking the end date; indexing date_range[1] would then
    # raise IndexError. Pause the script until the range is complete.
    if not isinstance(date_range, (list, tuple)) or len(date_range) != 2:
        st.sidebar.info("Select both a start and an end date to continue.")
        st.stop()

# ----------------------------------------------------
# 3. Filter data based on user input
# ----------------------------------------------------
if data_choice == "Monthly":
    # Combine both masks and copy once, so the "date" assignment below
    # writes to an independent frame instead of a slice of a slice (which
    # can trigger pandas' SettingWithCopyWarning).
    in_locations = monthly_data["location"].isin(selected_locations)
    in_years = (monthly_data["year"] >= year_range[0]) & (
        monthly_data["year"] <= year_range[1]
    )
    df = monthly_data[in_locations & in_years].copy()
    # Build a "date" column (first day of each month) for plotting.
    df["date"] = pd.to_datetime(
        df["year"].astype(str) + "-" + df["month"].astype(str) + "-01"
    )
else:
    range_start = pd.to_datetime(date_range[0])
    range_end = pd.to_datetime(date_range[1])
    in_locations = daily_data["location"].isin(selected_locations)
    in_dates = (daily_data["date"] >= range_start) & (daily_data["date"] <= range_end)
    df = daily_data[in_locations & in_dates].copy()

# Nothing below is meaningful without rows (e.g. when every location has
# been deselected), so tell the user and stop instead of drawing empty charts.
if df.empty:
    st.warning("No data matches the current filters. Adjust the sidebar selection.")
    st.stop()

# ----------------------------------------------------
# 4.
# Interactive Plotly Time-Series (Original)
# ----------------------------------------------------
st.subheader(f"{data_choice} {disease_choice} Risk & Climate Parameters")

# Column names that depend on the sidebar choices. They are resolved once
# here and reused by every section below.
is_monthly = data_choice == "Monthly"
risk_col = "malaria_risk" if disease_choice == "Malaria" else "dengue_risk"
flag_col = "malaria_outbreak" if disease_choice == "Malaria" else "dengue_outbreak"
outbreak_flag_col = flag_col
rain_col = "monthly_rainfall_mm" if is_monthly else "daily_rainfall_mm"

if is_monthly:
    risk_title = f"{disease_choice} Risk Over Time ({data_choice})"
    temp_title = "Average Temperature (°C)"
    rain_title = "Monthly Rainfall (mm)"
else:
    risk_title = f"{disease_choice} Daily Risk Over Time (2013–2024)"
    temp_title = "Daily Avg Temperature (°C)"
    rain_title = "Daily Rainfall (mm)"

# Risk over time, one line per location.
fig = px.line(df, x="date", y=risk_col, color="location", title=risk_title)
fig.update_layout(yaxis_title="Risk (0–1)")
st.plotly_chart(fig, use_container_width=True)

# Temperature and rainfall side by side.
col1, col2 = st.columns(2)
with col1:
    fig_temp = px.line(df, x="date", y="temp_avg", color="location", title=temp_title)
    st.plotly_chart(fig_temp, use_container_width=True)
with col2:
    fig_rain = px.line(df, x="date", y=rain_col, color="location", title=rain_title)
    st.plotly_chart(fig_rain, use_container_width=True)

# Outbreak table (monthly view only). The flag column is checked first,
# matching the guard used by the Outbreak Statistics section, so a dataset
# without outbreak flags shows a message instead of raising KeyError.
if is_monthly:
    if flag_col in df.columns:
        outbreak_months = df[df[flag_col].eq(True)]
        if not outbreak_months.empty:
            st.write(f"**Months with likely {disease_choice} outbreak:**")
            st.dataframe(
                outbreak_months[
                    [
                        "location",
                        "year",
                        "month",
                        "temp_avg",
                        "humidity",
                        "monthly_rainfall_mm",
                        flag_col,
                    ]
                ]
            )
        else:
            st.write(
                f"No months meet the {disease_choice} outbreak criteria in this selection."
            )
    else:
        st.write(f"No outbreak flag column found for {disease_choice}.")

# ----------------------------------------------------
# 5. Correlation Heatmap (Original)
# ----------------------------------------------------
st.subheader(f"Correlation Heatmap - {data_choice} Data")

# Correlation method; also reused by the time-lag analysis further down.
corr_method = st.selectbox("Correlation Method", ["pearson", "spearman"], index=0)

# Weather and risk columns for the active granularity.
subset_cols = ["temp_avg", "humidity", rain_col, "malaria_risk", "dengue_risk"]
corr_df = df[subset_cols].corr(method=corr_method)
fig_corr = px.imshow(
    corr_df,
    text_auto=True,
    aspect="auto",
    title=f"Correlation Matrix of Weather & Risk ({corr_method.capitalize()})",
)
st.plotly_chart(fig_corr, use_container_width=True)

# ----------------------------------------------------
# 6. Interactive Map (Original)
# ----------------------------------------------------
st.subheader("Interactive Map")
st.markdown(
    "**Note**: We only have 3 locations. Each marker popup shows some "
    "aggregated stats for the displayed data range."
)

# Base map centered roughly in Tanzania.
m = folium.Map(location=[-6.0, 35.0], zoom_start=6)

# One marker per selected location, with averages over the filtered rows.
rain_label = "Avg Rainfall (mm)" if is_monthly else "Avg Rain (mm/day)"
for loc in selected_locations:
    loc_df = df[df["location"] == loc]
    if loc_df.empty:
        continue
    loc_info = LOCATIONS[loc]
    avg_risk = loc_df[risk_col].mean()
    avg_temp = loc_df["temp_avg"].mean()
    avg_rain = loc_df[rain_col].mean()
    # The popup is rendered as HTML, where plain newlines collapse into
    # spaces; explicit <br> tags keep one statistic per line.
    popup_html = (
        f"<b>{loc}</b><br>"
        f"Disease: {disease_choice}<br>"
        f"Avg Risk (in selection): {avg_risk:.2f}<br>"
        f"Avg Temp (°C): {avg_temp:.2f}<br>"
        f"{rain_label}: {avg_rain:.2f}"
    )
    folium.Marker(
        location=[loc_info["lat"], loc_info["lon"]],
        popup=popup_html,
        tooltip=f"{loc} ({disease_choice})",
    ).add_to(m)

# Render the Folium map in Streamlit.
st_data = st_folium(m, width=700, height=500)

# ----------------------------------------------------
# 7. Additional Explorations (New Features)
# ----------------------------------------------------
st.header("Additional Explorations")

###############################################################################
# 7.1 Compare Malaria & Dengue Risk Side-by-Side (same chart) for the same data
###############################################################################
st.subheader("Compare Malaria & Dengue Risk Over Time")
compare_both = st.checkbox("Compare Both Diseases on One Plot")
if compare_both:
    # Reshape to long format (date, location, disease, risk) so both risk
    # series can share one figure. df is already filtered, and the reshape
    # is the same for monthly and daily data.
    df_long = df.melt(
        id_vars=["date", "location", "temp_avg", "humidity"],
        value_vars=["malaria_risk", "dengue_risk"],
        var_name="disease",
        value_name="risk",
    )
    title_str = "Malaria vs. Dengue Risk"
    fig_compare = px.line(
        df_long,
        x="date",
        y="risk",
        color="location",
        line_dash="disease",
        title=title_str,
    )
    fig_compare.update_layout(yaxis_title="Risk (0–1)")
    st.plotly_chart(fig_compare, use_container_width=True)

##################################################
# 7.2 Scatter Matrix (Pairwise relationships)
##################################################
st.subheader("Scatter Matrix of Risk & Weather Parameters")
scatter_cols = st.multiselect(
    "Choose additional columns to include in Scatter Matrix (besides risk & weather).",
    [
        "temp_avg",
        "humidity",
        "monthly_rainfall_mm",
        "daily_rainfall_mm",
        "malaria_risk",
        "dengue_risk",
    ],
    default=["temp_avg", "humidity", "malaria_risk", "dengue_risk"],
)

# The option list offers both rainfall columns, but df only carries the one
# for the active granularity. Indexing df with an absent column raises
# KeyError, so keep only the selected columns that actually exist.
available_cols = [col for col in scatter_cols if col in df.columns]
missing_cols = [col for col in scatter_cols if col not in df.columns]
if missing_cols:
    st.info(
        f"Ignoring columns not present in the {data_choice.lower()} data: "
        + ", ".join(missing_cols)
    )

# Columns holding no values at all cannot be plotted; drop them as well.
sm_df = df[available_cols].dropna(axis=1, how="all")
if sm_df.shape[1] < 2:
    st.warning("Please select at least two columns to generate a scatter matrix.")
else:
    fig_sm = px.scatter_matrix(
        sm_df,
        dimensions=sm_df.columns,
        title="Scatter Matrix",
        color_discrete_sequence=["#636EFA"],  # Adjust color if you like
    )
    fig_sm.update_layout(width=800, height=800)
    st.plotly_chart(fig_sm, use_container_width=True)

##################################################
# 7.3 Simple Time-Lag Correlation (Example)
##################################################
st.subheader("Time-Lag Correlation (Experimental)")
st.markdown(
    "Experiment with a simple lag analysis. For example, check how temperature "
    "or rainfall in previous weeks/months correlates with **current** "
    "Malaria/Dengue risk."
)
time_lag = st.slider(
    "Select Lag (days) to shift weather parameters",
    min_value=0,
    max_value=60,
    value=0,
    step=5,
)

# The slider is in days. Daily data is shifted by that many rows per
# location; monthly data converts days to whole months (~30 days each), so
# any lag below 30 days results in no shift.
# NOTE(review): a row shift equals a calendar shift only if each location
# has exactly one row per period with no gaps - confirm against the data.
lag_steps = time_lag // 30 if is_monthly else time_lag

df_lag = df.copy()
if lag_steps > 0:
    df_lag = df_lag.sort_values("date")  # shifting relies on chronological order
    by_location = df_lag.groupby("location")
    df_lag["temp_avg_lag"] = by_location["temp_avg"].shift(lag_steps)
    df_lag["rain_lag"] = by_location[rain_col].shift(lag_steps)
    # The first lag_steps rows of each location have no lagged value.
    df_lag = df_lag.dropna(subset=["temp_avg_lag", "rain_lag"])

# Correlate current risk with the lagged weather columns, if they exist.
if "temp_avg_lag" in df_lag.columns and "rain_lag" in df_lag.columns:
    lag_corr_temp = df_lag[risk_col].corr(df_lag["temp_avg_lag"], method=corr_method)
    lag_corr_rain = df_lag[risk_col].corr(df_lag["rain_lag"], method=corr_method)
    st.write(
        f"**Correlation between {disease_choice} Risk and lagged Temperature**: "
        f"{lag_corr_temp:.3f}"
    )
    st.write(
        f"**Correlation between {disease_choice} Risk and lagged Rainfall**: "
        f"{lag_corr_rain:.3f}"
    )
else:
    st.write("No lag columns or lag is set to 0. Increase the lag to see results.")

##################################################
# 7.4 Outbreak Statistics
##################################################
st.subheader("Outbreak Statistics - ⚠️ NEEDS NIMR Data to work")
st.markdown(
    "This section will show the **count** of outbreak periods based on selection "
    "and some summary statistics, once we have overlayed NIMR Data with the "
    "Existing Weather Data"
)

if outbreak_flag_col in df.columns:
    # Rows flagged True / False are selected separately, so rows with a
    # missing flag fall into neither group.
    outbreak_df = df[df[outbreak_flag_col].eq(True)]
    non_outbreak_df = df[df[outbreak_flag_col].eq(False)]

    # Number of flagged rows per location.
    outbreak_count_by_loc = (
        outbreak_df.groupby("location").size().reset_index(name="outbreak_count")
    )
    st.write("**Number of outbreak instances (in current selection) by location:**")
    st.dataframe(outbreak_count_by_loc)

    # Average temperature, humidity and rainfall during outbreak vs
    # non-outbreak periods.
    with st.expander("Compare Weather Averages During Outbreak vs. Non-Outbreak"):
        if not outbreak_df.empty:
            for period_label, period_df in (
                ("Outbreak Periods", outbreak_df),
                ("Non-Outbreak Periods", non_outbreak_df),
            ):
                st.write(f"**{period_label}** ({disease_choice}):")
                st.write(f"- Avg Temperature: {period_df['temp_avg'].mean():.2f} °C")
                st.write(f"- Avg Humidity: {period_df['humidity'].mean():.2f}%")
                st.write(f"- Avg Rainfall: {period_df[rain_col].mean():.2f} mm")
        else:
            st.write(f"No {disease_choice} outbreaks found in the current selection.")
else:
    st.write(f"No outbreak flag column found for {disease_choice}.")