File size: 2,879 Bytes
3178215
 
 
 
 
 
 
 
7fa43a2
0627860
7fa43a2
 
ef81519
91835ca
 
 
 
 
 
 
3178215
 
 
 
 
 
 
804b38c
3178215
 
 
 
 
 
 
44d1519
3178215
44d1519
b20e9c3
 
 
 
3178215
 
b20e9c3
3178215
 
b20e9c3
7fa43a2
9b6cd8c
16ada42
3178215
7fa43a2
0627860
 
16ada42
 
 
e145847
0627860
 
 
 
e145847
16ada42
91835ca
16ada42
55f98ec
b746a08
16ada42
 
 
3178215
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pyarabic.araby as araby
import pandas as pd
import numpy as np
import re
from datasets import load_dataset
from datasets import Features
from datasets import Value
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import os
import gradio as gr
# Custom stylesheet handed to gr.Blocks(css=...) further down: elements with
# the "table-wrap" class get identical min and max heights (300px), i.e. a
# fixed height.
css = """
.table-wrap {
    min-height: 300px;
    max-height: 300px;
}
"""


# Token read from the HF_token environment variable and passed as `token` to
# every dataset/model load below (presumably a Hugging Face access token).
# os.getenv returns None when the variable is unset.
Secret_token = os.getenv('HF_token')

# Frame that supplies the 'embed' column read at the bottom of this section
# (assumed to be the precomputed matn embeddings — TODO confirm), plus the
# book metadata table.
dataset = load_dataset("FDSRashid/embed_matn", token = Secret_token)
books = load_dataset('FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token)['train'].to_pandas()
df = dataset["train"].to_pandas()

# Load the matn table with every column forced to string. Note that `dataset`
# is rebound here; the embedding frame is already held in `df`.
features = Features({'matn': Value('string'), 'taraf_ID': Value('string'), 'bookid_hadithid': Value('string')})
dataset = load_dataset("FDSRashid/hadith_info", data_files = 'All_Matns.csv',token = Secret_token, features = features)
matn_info = dataset['train'].to_pandas()
# Drop two rows by index label.
# NOTE(review): the reason these two rows are excluded is not recorded here — confirm.
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)
# 'KeyAbsent' is mapped to -1 so the column can be cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' is split on '_': the first piece becomes Book_ID and the
# second piece becomes Hadith Number, both as ints.
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
# Attach the book metadata; pd.merge defaults to an inner join on Book_ID.
matn_info = pd.merge(matn_info, books, on='Book_ID')

# reset_index() keeps the previous index as an 'index' column in both frames
# and gives each a fresh index.
matn_info = matn_info.reset_index()
df = df.reset_index()

# Take from df only the columns matn_info does not already have (this also
# excludes the 'index' column both frames now share).
cols_to_use = df.columns.difference(matn_info.columns)

# Join the two frames on their new index values.
# NOTE(review): this assumes row i of matn_info and row i of df describe the
# same matn even after the two drops and the merge above — confirm.
joined_df = pd.merge(matn_info,df[cols_to_use],left_index=True, right_index=True)
df = joined_df.copy()


# Sentence encoder for the query text and cross-encoder used to re-score
# candidate pairs in find_most_similar_matn.
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
model_CE = CrossEncoder('FDSRashid/QulBERT-CE-2.0', automodel_args = {'token':Secret_token}, max_length=512)
# Array built from the per-row 'embed' values; it is passed to
# cosine_similarity as the corpus (assumes every entry is an equal-length
# vector, giving a 2-D array — TODO confirm).
arr = np.array(df['embed'].to_list())

def find_most_similar_matn(text, n):
    """Return the stored matns that match ``text``, out of its ``n`` nearest.

    The query is stripped of diacritics, encoded with ``model`` and compared
    by cosine similarity against the embedding matrix ``arr``. The ``n``
    highest-scoring rows of ``df`` are then scored pairwise against the query
    with ``model_CE``; only rows whose score is above 0.5 are kept.

    Args:
        text: Query text. Diacritics are removed before encoding.
        n: Number of nearest candidates to retrieve. Coerced with ``int`` in
            case the caller passes a float; values below 1 give an empty
            result and values above the corpus size are clamped.

    Returns:
        A DataFrame with the columns ``Book_Name``, ``matn``, ``taraf_ID``,
        ``Book_ID``, ``Hadith Number``, ``Author`` and ``Similarity``. Rows
        are in ascending order of cosine similarity (best match last). The
        frame may contain fewer than ``n`` rows, or none.
    """
    output_columns = ['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']

    prep_text = araby.strip_diacritics(text)
    embed_text = model.encode(prep_text)
    cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)[0]

    # Clamp the request to [0, corpus size] and slice from an explicit start
    # offset: a plain ``[-n:]`` would select every row when n == 0.
    top_n = min(max(int(n), 0), len(cos_sim))
    ranked = np.argsort(cos_sim)
    indices = ranked[len(ranked) - top_n:]

    # Work on an independent copy so adding a column neither touches ``df``
    # nor triggers pandas' SettingWithCopyWarning.
    matns = df.iloc[indices].copy()
    matns['Similarity'] = cos_sim[indices]
    if matns.empty:
        # Nothing to re-score; do not call the cross-encoder with no pairs.
        return matns[output_columns]

    # Candidates get the same diacritic stripping as the query before the
    # cross-encoder compares each (candidate, query) pair.
    matns_prep = [araby.strip_diacritics(matn) for matn in matns['matn']]
    to_compare = [(candidate, prep_text) for candidate in matns_prep]
    is_taraf = model_CE.predict(to_compare)
    matns = matns[is_taraf > .5]
    return matns[output_columns]

# Gradio front end: a query box, a result-count slider, a results table and a
# button that runs find_most_similar_matn on the current inputs.
with gr.Blocks(css=css) as demo:
    text_input = gr.Textbox()
    num_hadith = gr.Slider(
        minimum=1,
        maximum=50,
        value=5,
        step=1,
        label='Num Hadith',
        info='Choose the number of Hadith to Return',
    )
    text_output = gr.DataFrame(wrap=True)
    text_button = gr.Button(value="Retrieve")
    # Clicking the button sends (query text, slider value) to the search
    # function and renders the returned frame in the table.
    text_button.click(
        fn=find_most_similar_matn,
        inputs=[text_input, num_hadith],
        outputs=text_output,
    )
demo.launch()