|
from sentence_transformers import SentenceTransformer, util |
|
import pandas as pd |
|
import numpy as np |
|
import pickle |
|
from tqdm import tqdm |
|
from functools import partial |
|
from multiprocessing import Pool |
|
|
|
|
|
# Load a small general-purpose sentence-embedding model (384-dim MiniLM).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# NOTE(review): pickle.load executes arbitrary code if the file is untrusted —
# confirm 'data.pickle' is produced by a trusted pipeline.
# Presumably a list of dicts, each with a 'description' key (see usage below) —
# verify against whatever writes data.pickle.
with open('data.pickle', 'rb') as file:
    data = pickle.load(file)
|
|
|
|
|
def compute_similarity(model, source_sentence, target_sentence):
    """Return the cosine similarity of two sentences as a plain float.

    Each sentence is embedded with *model* and the pair is compared with
    sentence-transformers' cosine-similarity helper.
    """
    source_vec = model.encode(source_sentence, convert_to_tensor=True)
    target_vec = model.encode(target_sentence, convert_to_tensor=True)
    return util.pytorch_cos_sim(source_vec, target_vec).item()
|
|
|
|
|
def compute_similarities_for_source(model, source_sentence, data):
    """Return cosine similarities of one record against all records at or
    after its own position in *data* (the "upper triangle" row).

    Parameters
    ----------
    model : sentence-embedding model exposing ``encode(..., convert_to_tensor=True)``.
    source_sentence : dict with a ``'description'`` key; must be an element of *data*.
    data : list of such dicts.

    Returns
    -------
    list[float] of length ``len(data) - data.index(source_sentence)``.
    """
    # First occurrence of this record by equality; pairs before it are assumed
    # covered by earlier rows (the matrix is symmetric).
    source_index = data.index(source_sentence)
    # FIX: the source embedding is loop-invariant — the original re-encoded the
    # source sentence once per target, which dominated the runtime. Hoisting it
    # leaves the returned values unchanged.
    source_embedding = model.encode(source_sentence['description'],
                                    convert_to_tensor=True)
    similarities = []
    for index in tqdm(range(source_index, len(data)),
                      desc=f"Computing similarities for '{source_sentence['description']}'"):
        target_embedding = model.encode(data[index]['description'],
                                        convert_to_tensor=True)
        similarities.append(util.pytorch_cos_sim(source_embedding,
                                                 target_embedding).item())
    return similarities
|
|
|
|
|
def compute_similarities(model, data):
    """Fan the per-record similarity computation out over a process pool.

    Returns a list where entry *i* is the similarity row for ``data[i]``
    against ``data[i:]`` (see ``compute_similarities_for_source``).
    """
    with Pool() as pool:
        # BUG FIX: the original bound only `model`, so pool.imap invoked
        # compute_similarities_for_source(model, item) and crashed with a
        # missing required `data` argument. Bind `data` as a keyword too.
        func = partial(compute_similarities_for_source, model, data=data)
        # NOTE(review): `model` (and `data`) are pickled into each worker;
        # confirm the SentenceTransformer in use is picklable and that the
        # per-worker memory cost is acceptable.
        similarities = list(tqdm(pool.imap(func, data), total=len(data), desc="Computing similarities"))
    return similarities
|
|
|
|
|
# Vectorized path: embed every description in one batch and compute the full
# pairwise cosine-similarity matrix in a single call.
# FIX: build the description list once (the original evaluated the same
# comprehension twice, for encoding and for the CSV header).
descriptions = [source_sentence['description'] for source_sentence in data]
embeddings = model.encode(descriptions, convert_to_tensor=True)
# FIX: move the tensor to host memory before .numpy() — Tensor.numpy() raises
# when the embeddings live on a GPU, which encode(convert_to_tensor=True) will
# use when one is available. On CPU tensors .cpu() is a no-op.
matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# One column per description, rows in the same order; no index column.
pd.DataFrame(matrix, columns=descriptions).to_csv('data.csv', index=False)