Spaces:

thefish1
/

space02

Running

App Files Community

thefish1 committed on Jul 11, 2024

Commit

b031362

1 Parent(s): bb0dffa

update 0711 lk

Browse files

Files changed (3) hide show

app.py +37 -10
load_data.py +21 -0
train_2000_modified.json +0 -0

app.py CHANGED Viewed

@@ -1,15 +1,26 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-# from ad_matching import fetch_top_ad
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
 client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def get_keywords(message):
     system_message = """
     #角色
@@ -30,6 +41,20 @@ def get_keywords(message):
         response+=token
     keywords=response.split(' ')
 def respond(
@@ -67,11 +92,13 @@ def respond(
             messages.append({"role": "assistant", "content": val[1]})
     key_words=get_keywords(message)
-    # ad=fetch_top_ad(key_words)
-    # if ad:
-    #     messages.append({"role": "assistant", "content": f"<sep> {ad}"})
-    messages.append({"role": "user", "content": message})
     response = ""
@@ -116,4 +143,4 @@ demo = gr.ChatInterface(
 if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
 from huggingface_hub import InferenceClient
+import json
+import random
+import re
+from load_data import load_data
+# from ad_matching import fetch_top_ad
+# Chat model
 client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+# Load the ad dataset from the local JSON-lines file.
+dataset = load_data(file_path='train_2000.json',num_samples=2000)
+# Build both lists from the same filtered records so that index i in
+# keyword_lists always refers to the same ad as index i in summary_lists
+# (filtering each list separately lets them drift apart when a record
+# lacks one of the two fields).
+_ad_records = [item for item in dataset if 'content' in item and 'summary' in item]
+# Split each 'content' string into its keyword list while building the list;
+# rebinding a loop variable afterwards would leave the list unchanged.
+# NOTE(review): assumes 'content' is a comma-separated string -- confirm
+# against the data file.
+keyword_lists = [
+    [kw.strip() for kw in item['content'].split(',') if kw.strip()]
+    for item in _ad_records
+]
+summary_lists = [item['summary'] for item in _ad_records]
 def get_keywords(message):
     system_message = """
     #角色
         response+=token
     keywords=response.split(' ')
+    return keywords
+def keyword_match(query_keywords, ad_keywords_lists):
+    """Find the ad entry that shares the most keywords with the query.
+
+    Args:
+        query_keywords: iterable of keyword strings extracted from the user
+            message. Surrounding whitespace is ignored, and empty or
+            duplicated keywords are counted at most once.
+        ad_keywords_lists: sequence of per-ad keyword containers. Each entry
+            is tested with ``in``, so an entry may be a list of keywords
+            (exact match) or a plain string (substring match).
+
+    Returns:
+        A ``(max_matches, index)`` tuple: the highest number of query
+        keywords found in a single entry and the position of the first entry
+        reaching that count. ``(0, 0)`` is returned when nothing matches or
+        either input is empty, so callers must check ``max_matches`` before
+        trusting ``index``.
+    """
+    # Normalise once, outside the loop. Empty strings are dropped because
+    # '' is contained in every string and would inflate each score;
+    # dict.fromkeys removes duplicates so a repeated keyword cannot satisfy
+    # a "more than one match" threshold on its own.
+    wanted = [kw for kw in dict.fromkeys(k.strip() for k in query_keywords) if kw]
+    best_count = 0
+    best_index = 0
+    for i, ad_keywords in enumerate(ad_keywords_lists):
+        count = sum(kw in ad_keywords for kw in wanted)
+        # Strict '>' keeps the earliest entry on ties.
+        if count > best_count:
+            best_count = count
+            best_index = i
+    return best_count, best_index
 def respond(
             messages.append({"role": "assistant", "content": val[1]})
     key_words=get_keywords(message)
+    max_matches,index=keyword_match(key_words,keyword_lists)
+    if max_matches>1:
+        ad=summary_lists[index]
+        messages.append({"role": "user", "content": f"{message} <sep> {ad}"})
+    else :
+        messages.append({"role": "user", "content": message})
     response = ""
 if __name__ == "__main__":
+    demo.launch(share=True)

load_data.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import json
+def load_data(file_path='train.json',num_samples=2000):
+    """Load up to ``num_samples`` records from a JSON-lines file.
+
+    Each non-blank line of the file is parsed as one JSON document. Lines
+    that fail to parse are reported on stdout and skipped.
+
+    Args:
+        file_path: path of the UTF-8 encoded JSON-lines file to read.
+        num_samples: maximum number of records to return. ``None`` returns
+            every record; other values follow normal slice semantics.
+
+    Returns:
+        A list of the decoded records, in file order.
+
+    Raises:
+        OSError: if the file cannot be opened.
+    """
+    # Only a non-negative limit lets us stop reading early; None and negative
+    # values need the full list for the final slice to behave as before.
+    stop_early = num_samples is not None and num_samples >= 0
+    data = []
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            if stop_early and len(data) >= num_samples:
+                break
+            # Blank lines (e.g. a trailing newline) are not malformed records.
+            if not line.strip():
+                continue
+            try:
+                data.append(json.loads(line))
+            except json.JSONDecodeError as e:
+                print(f"Error decoding JSON: {e}")
+    return data[:num_samples]
+if __name__ == '__main__':
+    # Dataset split: take the default-sized subset returned by load_data()
+    # and write it back out as one JSON document per line.
+    dataset = load_data()
+    with open('train_2000.json', 'w', encoding='utf-8') as out:
+        for record in dataset:
+            serialized = json.dumps(record, ensure_ascii=False)
+            out.write(serialized + '\n')

train_2000_modified.json ADDED Viewed

The diff for this file is too large to render. See raw diff