import weaviate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import json
import os

# Set the cache directory for Matplotlib
os.environ['MPLCONFIGDIR'] = '/tmp/matplotlib'
# Set the cache directory for Hugging Face Transformers
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
# Make sure these directories exist
os.makedirs(os.environ['MPLCONFIGDIR'], exist_ok=True)
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)

auth_config = weaviate.AuthApiKey(api_key="8wNsHV3Enc2PNVL8Bspadh21qYAfAvnK2ux3")

# Initialize the Weaviate client
database_client = weaviate.Client(
    url="https://3a8sbx3s66by10yxginaa.c0.asia-southeast1.gcp.weaviate.cloud",
    auth_client_secret=auth_config
)

class_name = "Lhnjames123321"

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")


def encode(sentences, model, tokenizer):
    # Compute a vector for each sentence with BERT
    model.eval()  # switch to evaluation mode
    embeddings = []
    with torch.no_grad():  # disable gradient computation
        for sentence in sentences:
            print(sentence)  # debug output
            # Tokenize the input sentence
            inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
            print(inputs)  # debug output
            inputs = {key: value for key, value in inputs.items()}
            # Run the model
            outputs = model(**inputs)
            # Sentence vector: mean over token embeddings (other pooling strategies could also be used)
            embedding = outputs.last_hidden_state.mean(dim=1).numpy().astype('float32')
            embeddings.append(embedding)
    # Stack the per-sentence vectors into a single 2-D array
    return np.vstack(embeddings)


def insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, avg_embeddings):
    # Batch-insert the objects into Weaviate, supplying the precomputed vectors
    with database_client.batch(batch_size=100) as batch:
        for keyword, summary, avg_embedding in zip(keywords, summaries, avg_embeddings):
            vector = avg_embedding.tolist()
            properties = {
                'keywords': keyword,
                'summary': summary  # stored properties
            }
            print(f'Inserting: {keyword} with summary: {summary}')  # debug info
            batch.add_data_object(
                properties,
                class_name=class_name,
                vector=vector
            )
    print('Insertion completed')


def init_database(database_client, class_name):
    # Read the data file: one JSON object per line, each with a 'content' field
    # (comma-separated keywords) and a 'summary' field
    dataset = []
    with open('train_2000_modified.json', 'r', encoding='utf-8') as f:
        for line in f:
            dataset.append(json.loads(line))
    keywords = [item['content'] for item in dataset if 'content' in item]
    summaries = [item['summary'] for item in dataset if 'summary' in item]

    # Compute the average embedding of each keyword group
    # (reusing the module-level model and tokenizer)
    keywords_avg_embeddings = []
    for lst in keywords:
        lst = lst.split(',')
        embeddings = encode(lst, model, tokenizer)
        avg_embedding = embeddings.mean(axis=0)
        keywords_avg_embeddings.append(avg_embedding)

    insert_keywords_to_weaviate(database_client, class_name, keywords, summaries, keywords_avg_embeddings)


def fetch_summary_from_database(query_keywords, class_name):
    # Embed each query keyword and average the embeddings into a single query vector
    keyword_embeddings = []
    for keyword in query_keywords:
        keyword_embedding = encode([keyword], model, tokenizer)
        keyword_embeddings.append(keyword_embedding)
    avg_embedding = np.vstack(keyword_embeddings).mean(axis=0)

    response = (
        database_client.query
        .get(class_name, ['keywords', 'summary'])              # fields to return
        .with_near_vector({'vector': avg_embedding.tolist()})  # vector-based retrieval
        .with_limit(1)                                         # return only the closest object
        .with_additional(['distance'])                         # also return the distance
        .do()
    )
    print(response)

    top_result = response['data']['Get'][class_name][0]
    top_distance = top_result['_additional']['distance']
    top_keywords_list = top_result['keywords']
    top_summary = top_result['summary']
    return top_distance, top_keywords_list, top_summary
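
# --- Optional helper (not part of the original script) ---
# The functions above assume the class `Lhnjames123321` already exists in the cluster.
# A minimal sketch for creating it with the v3 Python client is given below; the property
# names match those used in insert_keywords_to_weaviate, and "vectorizer" is set to "none"
# because vectors are supplied explicitly at insert time.
def create_class_if_missing(database_client, class_name):
    existing = [c['class'] for c in database_client.schema.get().get('classes', [])]
    if class_name in existing:
        return
    database_client.schema.create_class({
        'class': class_name,
        'vectorizer': 'none',  # vectors are provided manually via batch.add_data_object
        'properties': [
            {'name': 'keywords', 'dataType': ['text']},
            {'name': 'summary', 'dataType': ['text']},
        ],
    })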

# Earlier manual test snippet (commented out):
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# model = AutoModel.from_pretrained("bert-base-chinese")
#
# # Compute text vectors and populate the database
# init_database(database_client, class_name)
# query_keywords = '裤子'
# query_vector = encode([query_keywords], model, tokenizer)[0].tolist()
#
# # Run a similarity query
# response = (
#     database_client.query
#     .get(class_name, ['keywords'])                # fields to return
#     .with_near_vector({'vector': query_vector})   # vector-based retrieval
#     .with_limit(5)                                # return the top 5 results
#     .with_additional(['distance'])                # also return the distance
#     .do()
# )
#
# # Parse the response
# results = response.get('data', {}).get('Get', {}).get(class_name, [])
# output = [{'keywords': result['keywords'], 'distance': result['_additional']['distance']} for result in results]
#
# # input = tokenizer(input_str, return_tensors='pt', padding=True, truncation=True, max_length=512)
# # with torch.no_grad():
# #     outputs = model(**input)
# #     embeddings = outputs.last_hidden_state.mean(dim=1)
# # ans = use_weaviate(input_str)
# # print("Query result:", ans)
# print(response)


if __name__ == '__main__':
    init_database(database_client, class_name)
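
# --- Hypothetical usage sketch (not in the original script) ---
# After the database has been populated by init_database, a retrieval call could
# look like the following; the query keywords below are illustrative only:
#
#   distance, keywords_list, summary = fetch_summary_from_database(['裤子'], class_name)
#   print(distance, keywords_list, summary)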