from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool

# Load pre-trained model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load data
with open('data.pickle', 'rb') as file:
    data = pickle.load(file)
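# Assumption (inferred from the usage below, not guaranteed by the pickle itself):
# `data` is a list of dicts, each holding its text under a 'description' key.
assert isinstance(data, list) and all('description' in item for item in data), \
    "expected a list of dicts with a 'description' field"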

# Define a function to compute similarity for a pair of sentences
def compute_similarity(model, source_sentence, target_sentence):
    embedding_1 = model.encode(source_sentence, convert_to_tensor=True)
    embedding_2 = model.encode(target_sentence, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
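# Illustrative usage only (the example sentences are made up, not taken from the data):
# print(compute_similarity(model, "red running shoes", "crimson sneakers"))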

# Define a function to compute similarities between a source sentence and every
# sentence from its own position onward (the remaining pairs are symmetric)
def compute_similarities_for_source(model, source_sentence, data):
    source_index = data.index(source_sentence)
    similarities = [compute_similarity(model,
                                       source_sentence['description'],
                                       data[index]['description'])
                    for index in tqdm(range(source_index, len(data)),
                                      desc=f"Computing similarities for '{source_sentence['description']}'")]
    return similarities

# Define a function to compute similarities for all sentences in the data using a
# pool of worker processes. Note that the model and data are pickled to each worker,
# which can be slow; the batched computation below covers the same ground and is
# what the script actually uses.
def compute_similarities(model, data):
    with Pool() as pool:
        # Bind model and data so each worker receives only the source sentence
        func = partial(compute_similarities_for_source, model, data=data)
        similarities = list(tqdm(pool.imap(func, data), total=len(data),
                                 desc="Computing similarities"))
    return similarities
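# Optional (illustrative only): the pairwise, multiprocessing path above could be
# invoked like this, but it is left unused in favor of the batched version below.
# pairwise_similarities = compute_similarities(model, data)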

# Embed all descriptions in one batch and compute the full cosine-similarity matrix
descriptions = [source_sentence['description'] for source_sentence in data]
embeddings = model.encode(descriptions, convert_to_tensor=True)
matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# Save similarities to a CSV file, one column per description
pd.DataFrame(matrix, columns=descriptions).to_csv('data.csv', index=False)
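
# A minimal follow-up sketch (not part of the original pipeline): use the in-memory
# matrix to print the closest match for the first few descriptions as a sanity check.
# This runs after the CSV is written, so mutating `matrix` here does not affect the file.
np.fill_diagonal(matrix, -1.0)   # ignore self-similarity (always 1.0)
best = matrix.argmax(axis=1)     # index of the most similar other description
for i in range(min(5, len(descriptions))):
    j = best[i]
    print(f"{descriptions[i]!r} ~ {descriptions[j]!r} (cosine similarity {matrix[i, j]:.3f})")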