Spaces:
Runtime error
Runtime error
Commit
·
c63dd3d
1
Parent(s):
7542a39
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_dataset
from huggingface_hub import hf_hub_url
from transformers import AutoTokenizer, AutoModel
|
7 |
+
|
8 |
+
# Name of the pretrained checkpoint; the tokenizer and the encoder model are
# both loaded from it so that they stay matched.
model_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModel.from_pretrained(model_checkpoint),
)
|
11 |
+
|
12 |
+
# Resolve the URL of the issues dump stored in the "lewtun/github-issues"
# dataset repository.
data_files = hf_hub_url(
    repo_id="lewtun/github-issues",
    filename="datasets-issues-with-comments.jsonl",
    repo_type="dataset",
)

issues_dataset = load_dataset("json", data_files=data_files, split="train")

# Keep only rows that are not pull requests and have at least one comment.
issues_dataset = issues_dataset.filter(
    lambda x: not x["is_pull_request"] and len(x["comments"]) > 0
)

# Drop every column that is not listed in columns_to_keep.  A plain set
# difference is used instead of a symmetric difference so that a name present
# only in columns_to_keep can never end up in the removal set.
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns) - set(columns_to_keep)

issues_dataset = issues_dataset.remove_columns(sorted(columns_to_remove))

# Switch the output format to pandas so the whole dataset can be pulled out
# as a DataFrame and its "comments" column exploded.
issues_dataset.set_format("pandas")
df = issues_dataset[:]

# One row per entry of "comments"; the other columns are repeated.
comments_df = df.explode("comments", ignore_index=True)

comments_dataset = Dataset.from_pandas(comments_df)

# Record the whitespace-separated word count of every comment ...
comments_dataset = comments_dataset.map(
    lambda x: {"length_comment": len(x["comments"].split())}
)

# ... and keep only the comments longer than 15 words.
comments_dataset = comments_dataset.filter(lambda x: x["length_comment"] > 15)
|
44 |
+
|
45 |
+
def concatenate_text(examples):
    """Join an example's title, body and comment into one "text" field.

    The three fields are joined with the separator ``" \\n "``.  A field whose
    value is ``None`` is treated as an empty string, so a row with a missing
    body no longer raises ``TypeError`` during concatenation.

    Args:
        examples: Mapping with the keys ``"title"``, ``"body"`` and
            ``"comments"``.

    Returns:
        A dict with the single key ``"text"`` holding the joined string.
    """
    fields = (examples["title"], examples["body"], examples["comments"])
    return {"text": " \n ".join(field or "" for field in fields)}
|
53 |
+
|
54 |
+
# Add the combined "text" column produced by concatenate_text to every row.
comments_dataset = comments_dataset.map(concatenate_text)

# Run on the GPU when one is available and fall back to the CPU otherwise;
# a hard-coded "cuda" device makes model.to() fail on CPU-only hosts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
|
58 |
+
|
59 |
+
def cls_pooling(model_output):
    """Return position 0 along the second axis of ``last_hidden_state``.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing.

    Returns:
        ``model_output.last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
|
61 |
+
|
62 |
+
def get_embeddings(text_list):
    """Embed text with the module-level tokenizer and model.

    Args:
        text_list: The text to encode; callers in this file pass either a
            single string or a list of strings.

    Returns:
        The result of ``cls_pooling`` applied to the model output.  The
        inputs are moved to ``device`` before the forward pass.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disable gradient tracking so no autograd graph is built
    # or kept alive for every embedded text.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)
|
69 |
+
|
70 |
+
def _embed_row(row):
    """Return the first embedding vector of one row's "text" as a NumPy array."""
    vectors = get_embeddings(row["text"])
    return {"embeddings": vectors.detach().cpu().numpy()[0]}


# Compute an "embeddings" column for every row, then index that column with
# FAISS so that nearest-neighbour lookups can be run against it.
embeddings_dataset = comments_dataset.map(_embed_row)

embeddings_dataset.add_faiss_index(column="embeddings")
|
75 |
+
|
76 |
+
def search(question):
    """Look up the five nearest indexed comments for *question*.

    The question is embedded with ``get_embeddings`` and matched against the
    "embeddings" index of ``embeddings_dataset``.

    Args:
        question: The query text.

    Returns:
        A string with one section per match.  Each section lists the comment,
        its score, the issue title and the issue URL on separate lines and is
        closed by a line of 50 ``=`` characters.
    """
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=5
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    # NOTE(review): descending order assumes that a larger score means a
    # closer match -- confirm against the metric used by the FAISS index.
    samples_df = samples_df.sort_values("scores", ascending=False)

    separator = "=" * 50
    # Each field goes on its own line; previously the fields were appended
    # back to back, which made the output unreadable.
    sections = [
        f"COMMENT: {row['comments']}\n"
        f"SCORE: {row['scores']}\n"
        f"TITLE: {row['title']}\n"
        f"URL: {row['html_url']}\n"
        f"{separator}\n"
        for _, row in samples_df.iterrows()
    ]
    return "".join(sections)
|
97 |
+
|
98 |
+
# Web UI: one text input for the question, one text output for the matches.
# The "text" shortcuts replace gr.inputs.Textbox() / gr.outputs.Textbox(),
# legacy namespaces that newer Gradio releases no longer provide.
demo = gr.Interface(
    fn=search,
    inputs="text",
    outputs="text",
    title="Datasets issues search engine",
)

if __name__ == '__main__':
    demo.launch(debug=True)
|