import json
import os

# The cache locations should be set before transformers (and matplotlib) are
# imported, otherwise the libraries may already have resolved their defaults.
# Set the Matplotlib cache directory
os.environ['MPLCONFIGDIR'] = '/tmp/matplotlib'
# Set the Hugging Face Transformers cache directory
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
# Make sure the cache directories exist
os.makedirs(os.environ['MPLCONFIGDIR'], exist_ok=True)
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)

import weaviate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

auth_config = weaviate.AuthApiKey(api_key="8wNsHV3Enc2PNVL8Bspadh21qYAfAvnK2ux3")

# Initialize the Weaviate client
database_client = weaviate.Client(
    url="https://3a8sbx3s66by10yxginaa.c0.asia-southeast1.gcp.weaviate.cloud",
    auth_client_secret=auth_config
)
class_name = "Lhnjames123321"

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")


def encode(sentences, model, tokenizer):
    # Compute a vector for each sentence with BERT
    model.eval()  # switch to evaluation mode
    embeddings = []

    with torch.no_grad():  # disable gradient computation
        for sentence in sentences:
            # Tokenize the input sentence
            inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
            # Run the model
            outputs = model(**inputs)
            # Pool the token embeddings into one sentence vector (mean pooling;
            # other aggregation strategies are possible)
            embedding = outputs.last_hidden_state.mean(dim=1).numpy().astype('float32')
            embeddings.append(embedding)

    # Stack the per-sentence vectors into a single 2-D array
    return np.vstack(embeddings)


def insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, avg_embeddings):
    # Batch-insert the objects into Weaviate
    with database_client.batch(batch_size=100) as batch:
        for keyword, summary, avg_embedding in zip(keywords, summaries, avg_embeddings):
            vector = avg_embedding.tolist()
            properties = {
                'keywords': keyword,
                'summary': summary  # stored properties
            }
            print(f'Inserting: {keyword} with summary: {summary}')  # debug info
            batch.add_data_object(
                properties,
                class_name=class_name,
                vector=vector
            )
    print('Insertion completed')

def init_database(database_client, class_name):
    # Read the data file (one JSON object per line)
    dataset = []
    with open('train_2000_modified.json', 'r', encoding='utf-8') as f:
        for line in f:
            dataset.append(json.loads(line))

    keywords = [item['content'] for item in dataset if 'content' in item]
    summaries = [item['summary'] for item in dataset if 'summary' in item]

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    model = AutoModel.from_pretrained("bert-base-chinese")

    keywords_avg_embeddings = []
    # Compute the average embedding of each comma-separated keyword group
    for lst in keywords:
        lst = lst.split(',')
        embeddings = encode(lst, model, tokenizer)
        avg_embedding = embeddings.mean(axis=0)
        keywords_avg_embeddings.append(avg_embedding)

    insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, keywords_avg_embeddings)


def fetch_summary_from_database(query_keywords, class_name):
    # Embed each query keyword and average the vectors
    keyword_embeddings = []
    for keyword in query_keywords:
        keyword_embedding = encode([keyword], model, tokenizer)
        keyword_embeddings.append(keyword_embedding)

    avg_embedding = np.vstack(keyword_embeddings).mean(axis=0)
    response = (
        database_client.query
        .get(class_name, ['keywords', 'summary'])  # fields to return
        .with_near_vector({'vector': avg_embedding.tolist()})  # vector search
        .with_limit(1)  # return only the closest match
        .with_additional(['distance'])  # include the distance
        .do()
    )
    print(response)

    top_result = response['data']['Get'][class_name][0]
    top_distance = top_result['_additional']['distance']
    top_keywords_list = top_result['keywords']
    top_summary = top_result['summary']

    return top_distance, top_keywords_list, top_summary



if __name__ == '__main__':
    # Populate the Weaviate class with keywords, summaries, and their embeddings
    init_database(database_client, class_name)
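
    # Example retrieval (a minimal sketch): '裤子' ("trousers") is a placeholder
    # query keyword; pass the real query keywords as a list of strings.
    # fetch_summary_from_database returns the distance, the stored keyword
    # string, and the summary of the closest match.
    distance, matched_keywords, summary = fetch_summary_from_database(['裤子'], class_name)
    print(f'distance={distance}, keywords={matched_keywords}, summary={summary}')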