Spaces:
Running
Running
File size: 2,879 Bytes
3178215 7fa43a2 0627860 7fa43a2 ef81519 91835ca 3178215 804b38c 3178215 44d1519 3178215 44d1519 b20e9c3 3178215 b20e9c3 3178215 b20e9c3 7fa43a2 9b6cd8c 16ada42 3178215 7fa43a2 0627860 16ada42 e145847 0627860 e145847 16ada42 91835ca 16ada42 55f98ec b746a08 16ada42 3178215 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import pyarabic.araby as araby
import pandas as pd
import numpy as np
import re
from datasets import load_dataset
from datasets import Features
from datasets import Value
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import os
import gradio as gr
# Custom stylesheet passed to gr.Blocks: min-height and max-height are both
# 300px, so elements with class `.table-wrap` get a fixed 300px height.
css = (
    "\n"
    ".table-wrap {\n"
    "min-height: 300px;\n"
    "max-height: 300px;\n"
    "}\n"
)
# ---------------------------------------------------------------------------
# Module-level setup: load the embedding table, the hadith metadata and the
# two models once at import time so the request handler only does inference.
# ---------------------------------------------------------------------------

# Hugging Face access token read from the `HF_token` environment variable
# (None if the variable is unset).
Secret_token = os.getenv('HF_token')
# Dataset whose 'train' split carries the 'embed' column used for retrieval below.
dataset = load_dataset("FDSRashid/embed_matn", token = Secret_token)
# Book metadata from Books.csv; merged into matn_info on 'Book_ID' further down.
books = load_dataset('FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token)['train'].to_pandas()
df = dataset["train"].to_pandas()
# Read every column of All_Matns.csv as a string; taraf_ID is converted to int
# only after its 'KeyAbsent' placeholder has been replaced.
features = Features({'matn': Value('string'), 'taraf_ID': Value('string'), 'bookid_hadithid': Value('string')})
# NOTE: `dataset` is rebound here; the embedding split was already captured in `df`.
dataset = load_dataset("FDSRashid/hadith_info", data_files = 'All_Matns.csv',token = Secret_token, features = features)
matn_info = dataset['train'].to_pandas()
# Drop two rows by index label. NOTE(review): the reason is not visible in this
# file (presumably malformed rows) - confirm before changing the source CSV.
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)
# 'KeyAbsent' becomes -1 so the whole column can be cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)
# 'bookid_hadithid' is split on '_': first piece -> Book_ID, second -> Hadith Number.
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
# Attach the book metadata columns (pd.merge with its default join type).
matn_info = pd.merge(matn_info, books, on='Book_ID')
# reset_index() moves each frame's current index into an 'index' column and
# gives both frames a fresh 0..N-1 index for the positional join below.
matn_info = matn_info.reset_index()
df = df.reset_index()
# Take from df only the columns matn_info does not already have, so the join
# produces no duplicated column names.
cols_to_use = df.columns.difference(matn_info.columns)
# Join on the fresh row index. NOTE(review): this assumes row i of matn_info
# describes the same hadith as row i of the embedding frame - confirm.
joined_df = pd.merge(matn_info,df[cols_to_use],left_index=True, right_index=True)
# From here on `df` is the combined metadata + embedding table.
df = joined_df.copy()
# Bi-encoder used to embed the query text.
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
# Cross-encoder used to score (candidate, query) pairs.
model_CE = CrossEncoder('FDSRashid/QulBERT-CE-2.0', automodel_args = {'token':Secret_token}, max_length=512)
# All stored embeddings as one NumPy array, built once for cosine_similarity.
arr = np.array(df['embed'].to_list())
def find_most_similar_matn(text, n):
    """Retrieve the stored matns closest to *text* and keep the confirmed ones.

    The query is stripped of diacritics, embedded with the module-level
    ``model`` and compared by cosine similarity against ``arr``. The ``n``
    highest-scoring rows of ``df`` are then scored by the cross-encoder
    ``model_CE`` as ``(candidate, query)`` pairs, and only rows whose score
    exceeds 0.5 are returned.

    Args:
        text: Query text.
        n: Number of nearest candidates to retrieve before cross-encoder
            filtering. Coerced to ``int``; values <= 0 yield an empty result.

    Returns:
        A DataFrame with the columns ``Book_Name``, ``matn``, ``taraf_ID``,
        ``Book_ID``, ``Hadith Number``, ``Author`` and ``Similarity``. Rows
        are in ascending order of cosine similarity (best match last) and
        there are at most ``n`` of them.
    """
    result_columns = ['Book_Name', 'matn', 'taraf_ID', 'Book_ID',
                      'Hadith Number', 'Author', 'Similarity']

    # UI widgets may deliver the count as a float; a slice bound must be an int.
    top_k = int(n)
    if top_k <= 0:
        # `[-0:]` / `[-(-k):]` would select (almost) the whole corpus, so a
        # non-positive request short-circuits to an empty, well-formed frame.
        empty = df.iloc[:0].copy()
        empty['Similarity'] = np.array([], dtype=float)
        return empty[result_columns]

    prep_text = araby.strip_diacritics(text)
    embed_text = model.encode(prep_text)

    # cosine_similarity returns shape (1, len(arr)); keep the single row.
    scores = cosine_similarity(embed_text.reshape(1, -1), arr)[0]
    # argsort is ascending, so the last `top_k` positions are the best matches.
    indices = np.argsort(scores)[-top_k:]

    # Explicit copy: the new column is written to an independent frame rather
    # than to a selection of the shared module-level `df`.
    matns = df.iloc[indices].copy()
    matns['Similarity'] = scores[indices]

    # Candidates get the same diacritic stripping as the query before pairing.
    matns_prep = [araby.strip_diacritics(matn) for matn in matns['matn']]
    to_compare = [(candidate, prep_text) for candidate in matns_prep]
    is_taraf = np.asarray(model_CE.predict(to_compare))

    matns = matns[is_taraf > .5]
    return matns[result_columns]
# Gradio UI: a free-text query box, a slider for how many hadith to fetch,
# a results table, and a button that runs find_most_similar_matn.
with gr.Blocks(css=css) as demo:
    text_input = gr.Textbox()
    num_hadith = gr.Slider(
        minimum=1,
        maximum=50,
        value=5,
        step=1,
        label='Num Hadith',
        info='Choose the number of Hadith to Return',
    )
    text_output = gr.DataFrame(wrap=True)
    text_button = gr.Button(value="Retrieve")
    # Clicking the button sends (query text, slider value) to the retrieval
    # function and renders the returned DataFrame in the results table.
    text_button.click(
        fn=find_most_similar_matn,
        inputs=[text_input, num_hadith],
        outputs=text_output,
    )

demo.launch()
|