import weaviate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import json
import os
# Set the Matplotlib cache directory
os.environ['MPLCONFIGDIR'] = '/tmp/matplotlib'
# Set the Hugging Face Transformers cache directory
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
# Make sure these directories exist
os.makedirs(os.environ['MPLCONFIGDIR'], exist_ok=True)
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)
auth_config = weaviate.AuthApiKey(api_key="8wNsHV3Enc2PNVL8Bspadh21qYAfAvnK2ux3")
# Initialize the Weaviate client
database_client = weaviate.Client(
    url="https://3a8sbx3s66by10yxginaa.c0.asia-southeast1.gcp.weaviate.cloud",
    auth_client_secret=auth_config
)
class_name = "Lhnjames123321"
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")
def encode(sentences, model, tokenizer):
    # Compute one BERT vector per sentence
    model.eval()  # switch to evaluation mode
    embeddings = []
    with torch.no_grad():  # disable gradient tracking
        for sentence in sentences:
            # Tokenize the input sentence
            print(sentence)
            inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
            print(inputs)
            # Run the model
            outputs = model(**inputs)
            # Mean-pool the last hidden state into a single sentence vector
            # (other aggregations, e.g. [CLS] pooling, would also work)
            embedding = outputs.last_hidden_state.mean(dim=1).numpy().astype('float32')
            embeddings.append(embedding)
    # Stack the per-sentence vectors into a 2-D array
    return np.vstack(embeddings)
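# Usage sketch for encode(); the sample sentences are illustrative only, not
# from the dataset. Each row of the returned array is a 768-dimensional
# mean-pooled BERT sentence vector.
def demo_encode():
    sample_sentences = ['牛仔裤', '夏季连衣裙']  # hypothetical sample keywords
    vectors = encode(sample_sentences, model, tokenizer)
    print(vectors.shape)  # expected: (2, 768) for bert-base-chinese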
def insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, avg_embeddings):
    # Batch-insert the data into Weaviate
    with database_client.batch(batch_size=100) as batch:
        for keyword, summary, avg_embedding in zip(keywords, summaries, avg_embeddings):
            vector = avg_embedding.tolist()
            properties = {
                'keywords': keyword,
                'summary': summary  # stored property
            }
            print(f'Inserting: {keyword} with summary: {summary}')  # debug info
            batch.add_data_object(
                properties,
                class_name=class_name,
                vector=vector
            )
    print('Insertion completed')
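# The batch insert above assumes the target class already exists in Weaviate.
# A minimal schema sketch (assumed, not part of the original script): the
# property names match those inserted above, and 'vectorizer' is 'none'
# because vectors are supplied explicitly.
def ensure_class_exists(client, class_name):
    schema = {
        'class': class_name,
        'vectorizer': 'none',
        'properties': [
            {'name': 'keywords', 'dataType': ['text']},
            {'name': 'summary', 'dataType': ['text']},
        ],
    }
    try:
        client.schema.create_class(schema)
    except weaviate.exceptions.UnexpectedStatusCodeException:
        pass  # class already exists; nothing to do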
def init_database(database_client, class_name):
    # Read the data file (one JSON object per line)
    dataset = []
    with open('train_2000_modified.json', 'r', encoding='utf-8') as f:
        for line in f:
            dataset.append(json.loads(line))
    keywords = [item['content'] for item in dataset if 'content' in item]
    summaries = [item['summary'] for item in dataset if 'summary' in item]
    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    model = AutoModel.from_pretrained("bert-base-chinese")
    keywords_avg_embeddings = []
    # Compute the average embedding of each comma-separated keyword group
    for lst in keywords:
        lst = lst.split(',')
        embeddings = encode(lst, model, tokenizer)
        avg_embedding = embeddings.mean(axis=0)
        keywords_avg_embeddings.append(avg_embedding)
    insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, keywords_avg_embeddings)
def fetch_summary_from_database(query_keywords, class_name):
    # Embed each query keyword and average the vectors
    keyword_embeddings = []
    for keyword in query_keywords:
        keyword_embedding = encode([keyword], model, tokenizer)
        keyword_embeddings.append(keyword_embedding)
    avg_embedding = np.mean(keyword_embeddings, axis=0)
    response = (
        database_client.query
        .get(class_name, ['keywords', 'summary'])  # fields to return
        .with_near_vector({'vector': avg_embedding.flatten().tolist()})  # vector search
        .with_limit(1)  # return only the best match
        .with_additional(['distance'])  # also return the distance
        .do()
    )
    print(response)
    top_distance = response['data']['Get'][class_name][0]['_additional']['distance']
    top_keywords_list = response['data']['Get'][class_name][0]['keywords']
    top_summary = response['data']['Get'][class_name][0]['summary']
    return top_distance, top_keywords_list, top_summary
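# Usage sketch for fetch_summary_from_database(); the query keywords are
# illustrative and the class name reuses the global defined above.
def demo_fetch_summary():
    distance, matched_keywords, summary = fetch_summary_from_database(['裤子', '牛仔'], class_name)
    print(f'distance={distance}, keywords={matched_keywords}, summary={summary}')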
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# model = AutoModel.from_pretrained("bert-base-chinese")
# # Compute text vectors and populate the database
# init_database(database_client, class_name)
# query_keywords = '裤子'
# query_vector = encode([query_keywords], model, tokenizer)[0].tolist()
# # Run a similarity query
# response = (
#     database_client.query
#     .get(class_name, ['keywords'])               # fields to return
#     .with_near_vector({'vector': query_vector})  # vector search
#     .with_limit(5)                               # return the top 5 results
#     .with_additional(['distance'])               # also return the distance
#     .do()
# )
# # Parse the response
# results = response.get('data', {}).get('Get', {}).get(class_name, [])
# output = [{'keywords': result['keywords'], 'distance': result['_additional']['distance']} for result in results]
# print(response)
if __name__ == '__main__':
    init_database(database_client, class_name)