# Spaces status banner captured with this file: Sleeping
import os
import re

import numpy as np
import pandas as pd
import pyarabic.araby as araby
from datasets import Dataset
from datasets import Features
from datasets import Value
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------------------------------------------------------------------
# Module-level setup: load the datasets and the embedding model once, at
# import time, and build the combined `df` table.
# ---------------------------------------------------------------------------

# Hub access token read from the HF_token environment variable
# (os.getenv returns None when the variable is unset).
Secret_token = os.getenv('HF_token')

# Table from the embed_matn dataset; it is merged onto the matn metadata below
# through its '__index_level_0__' column.
dataset = load_dataset("FDSRashid/embed_matn", token=Secret_token)
books = load_dataset(
    'FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token
)['train'].to_pandas()
df = dataset["train"].to_pandas()

# NOTE(review): `features` is not defined anywhere above this line in the
# visible file, so this call raises NameError as written. `Features` and
# `Value` are imported but unused, so a schema definition was presumably lost;
# restore it (it must match the columns of All_Matns.csv) before this call.
dataset = load_dataset(
    "FDSRashid/hadith_info",
    data_files='All_Matns.csv',
    token=Secret_token,
    features=features,
)
matn_info = dataset['train'].to_pandas()

# Remove two rows by index label. The reason is not recorded in this file;
# presumably they are malformed records -- TODO confirm.
matn_info = matn_info.drop([97550, 307206])

# 'taraf_ID' uses the string 'KeyAbsent' as a missing marker; map it to -1 so
# the whole column can be cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' is split on '_' : first part -> book id, second part ->
# hadith number, both parsed as int.
matn_info['Book ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))

# DataFrame.join(on=...) matches matn_info['Book ID'] against the *index* of
# `books`.
# NOTE(review): `books` keeps whatever index to_pandas() produced, so this
# only lines up if that index equals the book ids -- confirm against Books.csv.
matn_info = matn_info.join(books, on='Book ID')

# Bring over only the columns of `df` that matn_info does not already have,
# matching matn_info's index against df's '__index_level_0__' column.
cols_to_use = df.columns.difference(matn_info.columns)
joined_df = matn_info.merge(df[cols_to_use], left_index=True, right_on='__index_level_0__')
df = joined_df.copy()

# Sentence-embedding model used to encode query text.
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
def find_most_similar_matn(text, n): | |
embed_text = model.encode(araby.strip_diacritics(text)) | |