import functools
import json
import os
import re
import string
from io import StringIO

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering,k_means
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from streamlit import session_state
# Make sure the NLTK resources used by this script are installed.
# NOTE(review): the two original statements were truncated to
# `..."stopwords")` and `...'punkt')`; they are reconstructed here as
# nltk.download(...) calls — confirm against the original script.
# The lookup runs first so that a rerun of the script does not contact the
# download server when the resource is already present locally.
for _resource_name, _resource_path in (
    ("stopwords", "corpora/stopwords"),
    ("punkt", "tokenizers/punkt"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_resource_name)
# Text preprocessing function
_BRACKETED_RE = re.compile(r"\[(.*?)\]")  # e.g. "[+XYZ chars]"
_ELLIPSIS_RE = re.compile(r"\w+…|…")  # ellipsis plus the word it truncates
_INTRA_WORD_DASH_RE = re.compile(r"(?<=\w)-(?=\w)")  # dash between word chars
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded once."""
    return frozenset(stopwords.words("english"))


def clean_text_1(text, stop_words=None):
    """Normalise *text* for embedding or clustering.

    Steps, in order: lower-case, drop ``[...]`` spans, drop ellipses (together
    with the word they cut off), split dash-joined words, remove stop words,
    strip ASCII punctuation, and collapse whitespace.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stop_words: Optional collection of lower-case words to remove.
            Defaults to NLTK's English stop-word list.

    Returns:
        The cleaned, single-space-separated string.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Lower-case before filtering: the stop-word list is lower-case, so
    # filtering the raw text would let "The" or "And" through.
    text = str(text).lower()
    text = _BRACKETED_RE.sub(" ", text)
    text = _ELLIPSIS_RE.sub(" ", text)
    text = _INTRA_WORD_DASH_RE.sub(" ", text)

    # Filter before punctuation is stripped so contractions such as "it's"
    # still match; surrounding punctuation is ignored for the comparison so
    # that "the," is recognised as "the".
    kept = [
        token
        for token in text.split()
        if token.strip(string.punctuation) not in stop_words
    ]
    without_punctuation = " ".join(kept).translate(_PUNCTUATION_TABLE)

    # Collapse whitespace last: the substitutions above insert spaces and
    # punctuation-only tokens become empty once stripped.
    return " ".join(without_punctuation.split())
@st.cache_resource
def _load_embedding_model():
    """Build the sentence-transformers model once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without the cache the model would be re-instantiated each time.
    """
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Hugging Face sentence-transformers model used for every embedding below.
model = _load_embedding_model()


def get_embedding(text):
    """Return ``model.encode(text)`` for the given text.

    The text is encoded as-is; ``clean_text_1`` is not applied here.
    """
    return model.encode(text)
# Streamlit UI configuration

# How many example texts from each cluster are shown to the chat model.
REVIEWS_PER_CLUSTER = 3
# NOTE(review): the model name was not recoverable from the source (the
# chat-completion call was missing entirely); this value is a placeholder.
THEME_MODEL = "gpt-4"


@st.cache_data(show_spinner=False)
def _embed_texts(texts):
    """Embed every text with get_embedding and stack the results, one row per text.

    Cached so that moving the slider (which reruns the script) does not
    re-encode the whole file.
    """
    return np.vstack([get_embedding(text) for text in texts])


@st.cache_data(show_spinner=False)
def _project_to_2d(matrix):
    """Reduce the embedding matrix to two dimensions with t-SNE."""
    # scikit-learn rejects perplexity >= n_samples, so small uploads need a
    # value below the usual 15.
    perplexity = min(15, matrix.shape[0] - 1)
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        random_state=42,
        init="random",
        learning_rate=200,
    )
    return tsne.fit_transform(matrix)


def _plot_clusters(points, cluster_labels):
    """Scatter the 2-D points coloured per cluster and mark each cluster mean with an x."""
    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
    # One evenly spaced viridis colour per cluster.
    colors = plt.get_cmap("viridis")(np.linspace(0, 1, len(unique_clusters)))

    fig, ax = plt.subplots()
    for label, color, size in zip(unique_clusters, colors, cluster_counts):
        members = points[cluster_labels == label]
        xs, ys = members[:, 0], members[:, 1]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {label} (Size: {size})')
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
    ax.set_title("Clusters identified visualized in language 2D using t-SNE")
    return fig


def _sample_cluster_texts(df, cluster_id):
    """Return up to REVIEWS_PER_CLUSTER texts of one cluster, joined by newlines."""
    cluster_texts = (
        df.loc[df["Cluster"] == cluster_id, "text"]
        .str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ": ", regex=False)
    )
    # Series.sample raises when asked for more rows than exist, which happens
    # for clusters smaller than REVIEWS_PER_CLUSTER.
    sample_size = min(REVIEWS_PER_CLUSTER, len(cluster_texts))
    return "\n".join(cluster_texts.sample(sample_size, random_state=42).values)


def _describe_cluster_theme(reviews):
    """Ask the chat model what the sampled texts have in common."""
    messages = [
        {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
    ]
    # NOTE(review): the right-hand side of `response =` was missing from the
    # source. The call below is reconstructed from how `response` is consumed
    # (`.choices[0].message.content`), which matches an OpenAI-style chat
    # client. `client` is not defined in the visible part of this file and
    # must be provided — confirm the client, model and parameters.
    response = client.chat.completions.create(model=THEME_MODEL, messages=messages)
    return (response.choices[0].message.content or "").replace("\n", "")


# Upload file
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file:
    # Read data from file
    df = pd.read_csv(uploaded_file)
    if "text" not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column.")
        st.stop()

    # Clean data
    df = df[df["text"].notna()].reset_index(drop=True)
    df["text"] = df["text"].astype(str)
    if len(df) < 2:
        st.warning("At least two non-empty 'text' rows are needed for clustering.")
        st.stop()

    # Get embeddings
    matrix = _embed_texts(tuple(df["text"]))
    df["embedding"] = list(matrix)

    # Distance threshold slider
    distance_threshold = st.slider(
        "Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1
    )

    # Perform clustering
    agg_clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage='ward'
    )
    cluster_labels = agg_clustering.fit_predict(matrix)
    df["Cluster"] = cluster_labels

    # Visualize clusters with t-SNE
    vis_dims2 = _project_to_2d(matrix)
    fig = _plot_clusters(vis_dims2, cluster_labels)

    # Display the plot in Streamlit, then release the figure so reruns do not
    # accumulate open matplotlib figures.
    st.pyplot(fig)
    plt.close(fig)

    unique_labels = np.unique(cluster_labels)
    st.text_area("Number of Cluster Labels", value=str(len(unique_labels)))

    # Read a few texts from each cluster and ask the chat model for a theme.
    for i in unique_labels:
        print(f"Cluster {i} Theme:", end=" ")
        reviews = _sample_cluster_texts(df, i)
        theme = _describe_cluster_theme(reviews)
        print(theme)
        st.text_area(f"Cluster {i} Theme", value=theme)