Spaces:
Running
Running
fix code 6
Browse files
- app.py +11 -15
- constants.py +14 -0
- preprocess.py +32 -28
app.py
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import numpy as np
|
4 |
import redis
|
5 |
import streamlit as st
|
6 |
-
from dotenv import load_dotenv
|
7 |
from langchain import HuggingFaceHub
|
8 |
from langchain.chains import LLMChain
|
9 |
from langchain.chat_models import ChatOpenAI
|
@@ -17,18 +14,17 @@ from constants import (
|
|
17 |
FALCON_MAX_TOKENS,
|
18 |
FALCON_REPO_ID,
|
19 |
FALCON_TEMPERATURE,
|
|
|
|
|
|
|
20 |
OPENAI_MODEL_NAME,
|
21 |
OPENAI_TEMPERATURE,
|
22 |
TEMPLATE_1,
|
23 |
TEMPLATE_2,
|
|
|
24 |
)
|
25 |
from database import create_redis
|
26 |
|
27 |
-
load_dotenv()
|
28 |
-
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
29 |
-
ITEM_KEYWORD_EMBEDDING = "item_vector"
|
30 |
-
TOPK = 5
|
31 |
-
|
32 |
|
33 |
# connect to redis database
|
34 |
@st.cache_resource()
|
@@ -54,15 +50,12 @@ def encode_keywords_chain():
|
|
54 |
|
55 |
|
56 |
# the present products chain
|
57 |
-
@st.cache_resource()
|
58 |
def present_products_chain():
|
59 |
template = TEMPLATE_2
|
60 |
memory = ConversationBufferMemory(memory_key="chat_history")
|
61 |
prompt = PromptTemplate(input_variables=["chat_history", "user_msg"], template=template)
|
62 |
chain = LLMChain(
|
63 |
-
llm=ChatOpenAI(
|
64 |
-
openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=OPENAI_TEMPERATURE, model=OPENAI_MODEL_NAME
|
65 |
-
),
|
66 |
prompt=prompt,
|
67 |
verbose=False,
|
68 |
memory=memory,
|
@@ -81,7 +74,11 @@ def main():
|
|
81 |
st.caption("🤖 Powered by Falcon Open Source AI model")
|
82 |
redis_conn = connect_to_redis()
|
83 |
keywords_chain = encode_keywords_chain()
|
84 |
-
|
|
|
|
|
|
|
|
|
85 |
embedding_model = instance_embedding_model()
|
86 |
|
87 |
if "messages" not in st.session_state:
|
@@ -102,7 +99,6 @@ def main():
|
|
102 |
query_vector = embedding_model.encode(keywords)
|
103 |
query_vector_bytes = np.array(query_vector).astype(np.float32).tobytes()
|
104 |
# prepare the query
|
105 |
-
|
106 |
q = (
|
107 |
Query(f"*=>[KNN {TOPK} @{ITEM_KEYWORD_EMBEDDING} $vec_param AS vector_score]")
|
108 |
.sort_by("vector_score")
|
@@ -116,7 +112,7 @@ def main():
|
|
116 |
result_output = ""
|
117 |
for product in results.docs:
|
118 |
result_output += f"product_name:{product.item_name}, product_description:{product.item_keywords} \n"
|
119 |
-
result = chat_chain.predict(user_msg=f"{result_output}\n{prompt}")
|
120 |
st.session_state.messages.append({"role": "assistant", "content": result})
|
121 |
st.chat_message("assistant").write(result)
|
122 |
|
|
|
|
|
|
|
1 |
import numpy as np
|
2 |
import redis
|
3 |
import streamlit as st
|
|
|
4 |
from langchain import HuggingFaceHub
|
5 |
from langchain.chains import LLMChain
|
6 |
from langchain.chat_models import ChatOpenAI
|
|
|
14 |
FALCON_MAX_TOKENS,
|
15 |
FALCON_REPO_ID,
|
16 |
FALCON_TEMPERATURE,
|
17 |
+
HUGGINGFACEHUB_API_TOKEN,
|
18 |
+
ITEM_KEYWORD_EMBEDDING,
|
19 |
+
OPENAI_API_KEY,
|
20 |
OPENAI_MODEL_NAME,
|
21 |
OPENAI_TEMPERATURE,
|
22 |
TEMPLATE_1,
|
23 |
TEMPLATE_2,
|
24 |
+
TOPK,
|
25 |
)
|
26 |
from database import create_redis
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# connect to redis database
|
30 |
@st.cache_resource()
|
|
|
50 |
|
51 |
|
52 |
# the present products chain
|
|
|
53 |
def present_products_chain():
|
54 |
template = TEMPLATE_2
|
55 |
memory = ConversationBufferMemory(memory_key="chat_history")
|
56 |
prompt = PromptTemplate(input_variables=["chat_history", "user_msg"], template=template)
|
57 |
chain = LLMChain(
|
58 |
+
llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=OPENAI_TEMPERATURE, model=OPENAI_MODEL_NAME),
|
|
|
|
|
59 |
prompt=prompt,
|
60 |
verbose=False,
|
61 |
memory=memory,
|
|
|
74 |
st.caption("🤖 Powered by Falcon Open Source AI model")
|
75 |
redis_conn = connect_to_redis()
|
76 |
keywords_chain = encode_keywords_chain()
|
77 |
+
|
78 |
+
if "window_refreshed" not in st.session_state:
|
79 |
+
st.session_state.window_refreshed = True
|
80 |
+
st.session_state.chat_chain = present_products_chain()
|
81 |
+
|
82 |
embedding_model = instance_embedding_model()
|
83 |
|
84 |
if "messages" not in st.session_state:
|
|
|
99 |
query_vector = embedding_model.encode(keywords)
|
100 |
query_vector_bytes = np.array(query_vector).astype(np.float32).tobytes()
|
101 |
# prepare the query
|
|
|
102 |
q = (
|
103 |
Query(f"*=>[KNN {TOPK} @{ITEM_KEYWORD_EMBEDDING} $vec_param AS vector_score]")
|
104 |
.sort_by("vector_score")
|
|
|
112 |
result_output = ""
|
113 |
for product in results.docs:
|
114 |
result_output += f"product_name:{product.item_name}, product_description:{product.item_keywords} \n"
|
115 |
+
result = st.session_state.chat_chain.predict(user_msg=f"{result_output}\n{prompt}")
|
116 |
st.session_state.messages.append({"role": "assistant", "content": result})
|
117 |
st.chat_message("assistant").write(result)
|
118 |
|
constants.py
CHANGED
@@ -1,11 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
FALCON_REPO_ID = "tiiuae/falcon-7b-instruct"
|
2 |
FALCON_TEMPERATURE = 0.1
|
3 |
FALCON_MAX_TOKENS = 500
|
4 |
|
5 |
OPENAI_MODEL_NAME = "gpt-3.5-turbo"
|
6 |
OPENAI_TEMPERATURE = 0.8
|
|
|
7 |
|
8 |
EMBEDDING_MODEL_NAME = "sentence-transformers/all-distilroberta-v1"
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
TEMPLATE_1 = "Create comma separated product keywords to perform a query on amazon dataset for this user input: {product_description}"
|
11 |
TEMPLATE_2 = """You are a salesman.Present the given product results in a nice way as answer to the user_msg. Don't ask questions back,
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
|
5 |
+
load_dotenv()
|
6 |
+
|
7 |
+
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
8 |
FALCON_REPO_ID = "tiiuae/falcon-7b-instruct"
|
9 |
FALCON_TEMPERATURE = 0.1
|
10 |
FALCON_MAX_TOKENS = 500
|
11 |
|
12 |
OPENAI_MODEL_NAME = "gpt-3.5-turbo"
|
13 |
OPENAI_TEMPERATURE = 0.8
|
14 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
15 |
|
16 |
EMBEDDING_MODEL_NAME = "sentence-transformers/all-distilroberta-v1"
|
17 |
+
ITEM_KEYWORD_EMBEDDING = "item_vector"
|
18 |
+
TOPK = 5
|
19 |
+
NUMBER_PRODUCTS = 1000
|
20 |
+
MAX_TEXT_LENGTH = 512
|
21 |
+
TEXT_EMBEDDING_DIMENSION = 768
|
22 |
+
DATA_PATH = "product_data.csv"
|
23 |
|
24 |
TEMPLATE_1 = "Create comma separated product keywords to perform a query on amazon dataset for this user input: {product_description}"
|
25 |
TEMPLATE_2 = """You are a salesman.Present the given product results in a nice way as answer to the user_msg. Don't ask questions back,
|
preprocess.py
CHANGED
@@ -3,42 +3,46 @@ import pandas as pd
|
|
3 |
import redis
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from database import create_redis
|
7 |
from utils import create_flat_index, load_vectors
|
8 |
|
9 |
pool = create_redis()
|
10 |
redis_conn = redis.Redis(connection_pool=pool)
|
11 |
-
# set maximum length for text fields
|
12 |
-
MAX_TEXT_LENGTH = 512
|
13 |
-
TEXT_EMBEDDING_DIMENSION = 768
|
14 |
-
NUMBER_PRODUCTS = 10000
|
15 |
|
16 |
|
17 |
def auto_truncate(text: str):
|
18 |
return text[0:MAX_TEXT_LENGTH]
|
19 |
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
data
|
27 |
-
data
|
28 |
-
data
|
29 |
-
data.
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
#
|
41 |
-
redis_conn
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
3 |
import redis
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
|
6 |
+
from constants import (
|
7 |
+
DATA_PATH,
|
8 |
+
MAX_TEXT_LENGTH,
|
9 |
+
NUMBER_PRODUCTS,
|
10 |
+
TEXT_EMBEDDING_DIMENSION,
|
11 |
+
)
|
12 |
from database import create_redis
|
13 |
from utils import create_flat_index, load_vectors
|
14 |
|
15 |
pool = create_redis()
|
16 |
redis_conn = redis.Redis(connection_pool=pool)
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
def auto_truncate(text: str):
|
20 |
return text[0:MAX_TEXT_LENGTH]
|
21 |
|
22 |
|
23 |
+
def data_preprocessing_and_loading():
|
24 |
+
data = pd.read_csv(
|
25 |
+
DATA_PATH,
|
26 |
+
converters={"bullet_point": auto_truncate, "item_keywords": auto_truncate, "item_name": auto_truncate},
|
27 |
+
)
|
28 |
+
data["primary_key"] = data["item_id"] + "-" + data["domain_name"]
|
29 |
+
data.drop(columns=["item_id", "domain_name"], inplace=True)
|
30 |
+
data["item_keywords"].replace("", np.nan, inplace=True)
|
31 |
+
data.dropna(subset=["item_keywords"], inplace=True)
|
32 |
+
data.reset_index(drop=True, inplace=True)
|
33 |
+
data_metadata = data.head(NUMBER_PRODUCTS).to_dict(orient="index")
|
34 |
+
|
35 |
+
# generate embeddings (vectors) for the item keywords
|
36 |
+
embedding_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
|
37 |
+
# get the item keywords attribute for each product and encode them into vector embeddings
|
38 |
+
item_keywords = [data_metadata[i]["item_keywords"] for i in data_metadata.keys()]
|
39 |
+
item_keywords_vectors = [embedding_model.encode(item) for item in item_keywords]
|
40 |
+
# flush all data
|
41 |
+
redis_conn.flushall()
|
42 |
+
# create flat index & load vectors
|
43 |
+
create_flat_index(redis_conn, NUMBER_PRODUCTS, TEXT_EMBEDDING_DIMENSION, "COSINE")
|
44 |
+
load_vectors(redis_conn, data_metadata, item_keywords_vectors)
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == "__main__":
|
48 |
+
data_preprocessing_and_loading()
|