File size: 2,376 Bytes
3178215
 
 
 
 
 
 
 
7fa43a2
 
 
ef81519
3178215
 
 
 
 
 
 
804b38c
3178215
 
 
 
 
 
 
44d1519
3178215
44d1519
b20e9c3
 
 
 
3178215
 
b20e9c3
3178215
 
b20e9c3
7fa43a2
16ada42
3178215
7fa43a2
 
16ada42
 
 
 
44d1519
16ada42
 
 
 
 
 
 
 
3178215
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pyarabic.araby as araby
import pandas as pd
import numpy as np
import re
from datasets import load_dataset
from datasets import Features
from datasets import Value
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
import gradio as gr

# --- Module-level data preparation -----------------------------------------
# Everything below runs once at import time: it downloads the embedding and
# metadata datasets, joins them into a single frame `df`, loads the sentence
# encoder `model`, and builds the embedding matrix `arr` used for search.

# Hugging Face access token, read from the HF_token environment variable
# (None when the variable is unset).
Secret_token = os.getenv('HF_token')

# Embedding dataset, book metadata (Books.csv), and the embeddings as a frame.
dataset = load_dataset("FDSRashid/embed_matn", token = Secret_token)
books = load_dataset('FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token)['train'].to_pandas()
df = dataset["train"].to_pandas()

# Load All_Matns.csv with every column forced to string; `dataset` is rebound
# here and no longer refers to the embedding dataset loaded above.
features = Features({'matn': Value('string'), 'taraf_ID': Value('string'), 'bookid_hadithid': Value('string')})
dataset = load_dataset("FDSRashid/hadith_info", data_files = 'All_Matns.csv',token = Secret_token, features = features)
matn_info = dataset['train'].to_pandas()
# NOTE(review): rows labelled 97550 and 307206 are removed with no stated
# reason -- presumably malformed records; confirm against the source data.
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)
# Map the 'KeyAbsent' placeholder to -1 so the column can be cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' is split on '_': first part -> Book_ID, second part ->
# Hadith Number (both converted to int).
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
# Attach the book metadata columns to each matn via Book_ID.
matn_info = pd.merge(matn_info, books, on='Book_ID')

# Give both frames a fresh 0..n-1 index (the previous index becomes a column)
# so they can be joined on index below.
matn_info = matn_info.reset_index()
df = df.reset_index()

# Only bring over the columns of `df` that `matn_info` does not already have.
cols_to_use = df.columns.difference(matn_info.columns)

# Join on the fresh indices, i.e. row i of matn_info is paired with row i of df.
# NOTE(review): this assumes the embedding rows are in the same order as
# matn_info after the drops and the books merge above -- confirm.
joined_df = pd.merge(matn_info,df[cols_to_use],left_index=True, right_index=True)
df = joined_df.copy()


# Sentence encoder used for queries, and the 'embed' column collected into one
# NumPy array (presumably one embedding vector per row -- TODO confirm).
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
arr = np.array(df['embed'].to_list())

def find_most_similar_matn(text, n):
    """Return the ``n`` matns most similar to ``text``, best match first.

    The query is passed through ``araby.strip_diacritics``, encoded with the
    module-level ``model`` and scored by cosine similarity against every row
    of the module-level ``arr`` matrix. Row positions in ``arr`` correspond
    to row positions in the module-level ``df``.

    Args:
        text: Query string to search for.
        n: Number of rows to return. Any real number is accepted and
            truncated to an int; values below 1 yield an empty result.

    Returns:
        A new DataFrame with the columns ``Book_Name``, ``matn``,
        ``taraf_ID``, ``Book_ID``, ``Hadith Number``, ``Author`` and
        ``Similarity``, ordered from highest to lowest similarity.
    """
    # A UI slider may deliver a float, which is not a valid slice bound;
    # clamping at zero keeps a non-positive request from selecting rows.
    limit = max(int(n), 0)

    embed_text = model.encode(araby.strip_diacritics(text))
    scores = cosine_similarity(embed_text.reshape(1, -1), arr)[0]

    # argsort is ascending, so reverse it and take the leading `limit`
    # entries. Slicing from the front avoids the `[-0:]` pitfall, which
    # would select every row when `limit` is 0.
    top = np.argsort(scores)[::-1][:limit]

    # Work on a copy so adding a column never touches (or warns about) the
    # shared module-level frame.
    matns = df.iloc[top].copy()
    matns['Similarity'] = scores[top]
    return matns[['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']]

# Gradio front end: a free-text query box and a result-count slider are fed
# to find_most_similar_matn, and the returned DataFrame is shown in a table.
with gr.Blocks() as demo:
    text_input = gr.Textbox()
    num_hadith = gr.Slider(
        minimum=1,
        maximum=50,
        value=5,
        label='Num Hadith',
        info='Choose the number of Hadith to Return',
    )
    text_output = gr.DataFrame()
    text_button = gr.Button("Retrieve")
    # Clicking the button runs the search and fills the output table.
    text_button.click(
        fn=find_most_similar_matn,
        inputs=[text_input, num_hadith],
        outputs=text_output,
    )
demo.launch()