Spaces:

thefish1
/

space02

Running

App Files Community

thefish1 committed on Aug 1, 2024

Commit

57c1fc4

1 Parent(s): 212abdf

update

Browse files

Files changed (1) hide show

app.py +52 -10

app.py CHANGED Viewed

@@ -317,7 +317,7 @@ def get_candidates_from_db(keywords_dict, class_name,limit=3):
     for embedding in embeddings:
         response = (
             weaviate_client.query
-            .get(class_name, ['keywords', 'summary'])
             .with_near_vector({'vector': embedding})
             .with_limit(limit)
             .with_additional(['distance'])
@@ -331,8 +331,11 @@ def get_candidates_from_db(keywords_dict, class_name,limit=3):
             for result in results:
                 candidate_list.append({
                     'distance': result['_additional']['distance'],
                     'summary': result['summary'],
-                    'keywords': result['keywords']
                 })
     return candidate_list
@@ -353,7 +356,7 @@ def keyword_match(keywords_dict,candidates):
                 return candidate['distance'],candidate['summary'],candidate['keywords']
     return 1000,None,None
-def chatbot_response(message, history, window_size, threshold, user_weight, triggered_weight,candidate_length,api_key):
     #初始化openai client
     initialize_openai_client(api_key)
@@ -379,22 +382,60 @@ def chatbot_response(message, history, window_size, threshold, user_weight, trig
     #数据库检索，双方平均方式
     # distance, ad_summary, ad_keywords = get_response_from_db(keywords_dict, class_name="ad_DB02")
     #数据库索引，数据库关键词平均方式
-    candidates=get_candidates_from_db(keywords_dict, class_name="Ad_DB03",limit=candidate_length)
     #先对候选集的distance进行筛选，保留小于threshold的
     candidates.sort(key=lambda x:x['distance'])
     candidates=[candidate for candidate in candidates if candidate['distance']<threshold]
     print("----------------------------------------------------------------------")
     print(f"keywords:{keywords_dict.keys()}")
     print(f"candidates:{candidates}")
     if(candidates):
-        distance, ad_summary, ad_keywords=keyword_match(keywords_dict,candidates)
-    else:
-        distance=1000
-    #判断相似度
-    if distance and distance < threshold:
         brands=['腾讯','阿里巴巴','百度','京东','华为','小米','苹果','微软','谷歌','亚马逊']
         brand=random.choice(brands)
         ad_message = f"{message} <sep>品牌{brand}<sep>{ad_summary}"
@@ -439,6 +480,7 @@ demo = gr.ChatInterface(
         # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
         gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Window size"),
         gr.Slider(minimum=0.01, maximum=0.25, value=0.10, step=0.01, label="Distance threshold"),
         gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Weight of keywords from users"),
         gr.Slider(minimum=0, maximum=2, value=0.5, step=0.5, label="Weight of triggered keywords"),
         gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of candidates"),

     for embedding in embeddings:
         response = (
             weaviate_client.query
+            .get(class_name, ['group_id','keyword_list','keyword', 'summary'])
             .with_near_vector({'vector': embedding})
             .with_limit(limit)
             .with_additional(['distance'])
             for result in results:
                 candidate_list.append({
                     'distance': result['_additional']['distance'],
+                    'group_id': result['group_id'],
+                    'keyword_list':result['keyword_list'],
                     'summary': result['summary'],
+                    'keyword': result['keyword']
                 })
     return candidate_list
                 return candidate['distance'],candidate['summary'],candidate['keywords']
     return 1000,None,None
+def chatbot_response(message, history, window_size, threshold, score_threshold,user_weight, triggered_weight,candidate_length,api_key):
     #初始化openai client
     initialize_openai_client(api_key)
     #数据库检索，双方平均方式
     # distance, ad_summary, ad_keywords = get_response_from_db(keywords_dict, class_name="ad_DB02")
     #数据库索引，数据库关键词平均方式
+    candidates=get_candidates_from_db(keywords_dict, class_name="Ad_DB05",limit=candidate_length)
+    # #对类别进行判断加权
+    # for candidate in candidates:
+    #     if candidate['keyword']!= candidate['keyword_list'].split(',')[0]:
+    #         candidate['distance']*=2
     #先对候选集的distance进行筛选，保留小于threshold的
     candidates.sort(key=lambda x:x['distance'])
     candidates=[candidate for candidate in candidates if candidate['distance']<threshold]
     print("----------------------------------------------------------------------")
     print(f"keywords:{keywords_dict.keys()}")
     print(f"candidates:{candidates}")
+    #此时的候选集中所有元素都至少有一个关键词命中了
+    #筛选后的候选集进行投票，选出被投票最多的一条
+    #投中第一个元素加双倍权重
+    group_scores={}
     if(candidates):
+        for candidate in candidates:
+            group_id=candidate['group_id']
+            keyword = candidate['keyword']
+            keyword_list = candidate['keyword_list'].split(',')
+            # 检查 keyword 是否是 keyword_list 中的第一个元素
+            if keyword == keyword_list[0]:
+                score = 2
+            else:
+                score = 1
+            # 更新 group_scores 字典中的分数
+            if group_id in group_scores:
+                group_scores[group_id] += score
+            else:
+                group_scores[group_id] = score
+    print(group_scores[:4])
+    if group_scores:
+        max_group_id = max(group_scores, key=group_scores.get)
+        max_score = group_scores[max_group_id]
+        if(max_score>=score_threshold):
+            distance,ad_summary,ad_keywords=[candidate['distance'],candidate['summary'],candidate['keyword_list'] for candidate in candidates if candidate['group_id']==max_group_id][0]
+        else:
+            distance=1000
+    # if(candidates):
+    #     # distance, ad_summary, ad_keywords=keyword_match(keywords_dict,candidates)
+    #     distance,ad_summary,ad_keywords=candidates[0]['distance'],candidates[0]['summary'],candidates[0]['keyword_list']
+    # else:
+    #     distance=1000
+    if distance and distance < 1000:
         brands=['腾讯','阿里巴巴','百度','京东','华为','小米','苹果','微软','谷歌','亚马逊']
         brand=random.choice(brands)
         ad_message = f"{message} <sep>品牌{brand}<sep>{ad_summary}"
         # gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
         gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Window size"),
         gr.Slider(minimum=0.01, maximum=0.25, value=0.10, step=0.01, label="Distance threshold"),
+        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Score threshold"),
         gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Weight of keywords from users"),
         gr.Slider(minimum=0, maximum=2, value=0.5, step=0.5, label="Weight of triggered keywords"),
         gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of candidates"),