EmicoBinsfinder committed · Commit 97632b5
Parent(s): 5293e61
Update app.py

app.py CHANGED
@@ -1,31 +1,42 @@
 import os
 import gradio as gr
-from tridentmodel import classification
 import pandas as pd
 import torch
 import torch.nn as nn
 import transformers
-from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig
-
+from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, AutoModel
+import pandas as pd
+import tensorflow as tf
+import numpy as np
+import math
+import time
+import csv
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+nltk.download('punkt')
+import string
+
+########### DATA CLEANER VARIABLES #############
+all_stopwords = stopwords.words('english')  # Making sure to only use English stopwords
+extra_stopwords = ['ii', 'iii']  # Can add extra stopwords to be removed from dataset/input abstracts
+all_stopwords.extend(extra_stopwords)
+
+########### GET CLAIMED TRAINED MODEL ###########
 auth_token = os.environ.get("AUTH_TOKEN_SECRET")
-
 tokenizer = LlamaTokenizer.from_pretrained("Claimed/capybara", use_auth_token=auth_token)
-
 model = LlamaForCausalLM.from_pretrained(
     "Claimed/capybara", use_auth_token=auth_token,
     load_in_8bit=True,
     device_map="auto") #low_cpu_mem_usage=True)
 
-
-
+########## DEFINING FUNCTIONS ###################
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
 def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
-    """
-    Takes in pre-computed class embeddings and abstract texts, converts abstract text into
-    :param class_embeddings: dataframe of class embeddings
-    :param abstract: a single abstract embedding
-    :param N: N highest matching classes to return, from highest to lowest, default is 5
-    :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes
-    """
     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
     for i in range(len(class_embeddings)):
         class_name = class_embeddings.iloc[i, 0]
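Note on the new mean_pooling: it computes an attention-mask-weighted mean of the encoder's per-token embeddings, but the reduction runs through tf.reduce_sum and tf.clip_by_value while token_embeddings and attention_mask are torch tensors. A minimal torch-only sketch of the same pooling, under the assumption that model_output is the output of a Hugging Face encoder such as the AutoModel imported above (mean_pooling_torch is a hypothetical name, not part of the commit):

    import torch

    def mean_pooling_torch(model_output, attention_mask):
        # First element of the encoder output: per-token hidden states, shape (batch, seq_len, hidden)
        token_embeddings = model_output[0]
        # Broadcast the padding mask across the hidden dimension
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Sum the real (non-padding) token vectors and divide by their count
        summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
        counts = input_mask_expanded.sum(dim=1).clamp(min=1e-9)
        return summed / counts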
@@ -81,10 +92,86 @@ def bot(history):
 
 
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
-
+def clean_data(input, type='Dataframe'):
+    """
+    As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
+    classifications, or an input string, so that embeddings can be calculated for them. Removes:
+        • Entries with missing abstracts/descriptions/classifications/typos
+        • Duplicate entries
+        • Unnecessary punctuation
+        • Stop words (e.g. by, a, an, he, she, it)
+        • URLs
+        • Entries in other languages, so that all entries are in the same language
+    :param input: Either a dataframe or an individual string
+    :param type: Tells function whether input is a dataframe or an individual string
+    :return: (if dataframe) a dataframe containing CPC classification codes and their associated 'cleaned' description
+    :return: (if string) a 'cleaned' version of the input string
+    """
+    if type == 'Dataframe':
+        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
+        for i in range(0, len(input)):
+            row_list = input.loc[i, :].values.flatten().tolist()
+            noNaN_row = [x for x in row_list if str(x) != 'nan']
+            listrow = []
+            if len(noNaN_row) > 0:
+                row = noNaN_row[:-1]
+                row = [x.strip() for x in row]
+                row = (" ").join(row)
+                text_tokens = word_tokenize(row)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+                Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
+                row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
+                removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+                for char in removechars:
+                    row = list(map(lambda x: x.replace(char, ''), row))
+
+                row = ''.join(row)
+                wnum = row.split(' ')
+                wnum = [x.lower() for x in wnum]
+                # remove duplicate words
+                wnum = list(dict.fromkeys(wnum))
+                # removing numbers
+                wonum = []
+                for x in wnum:
+                    xv = list(x)
+                    xv = [i.isnumeric() for i in xv]
+                    if True in xv:
+                        continue
+                    else:
+                        wonum.append(x)
+                row = ' '.join(wonum)
+                l = [noNaN_row[-1], row]
+                cleaneddf.loc[len(cleaneddf)] = l
+        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
+        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)
+        return cleaneddf
+
+    elif type == 'String':
+        text_tokens = word_tokenize(input)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+        Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
+        row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
+        removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+        for char in removechars:
+            row = list(map(lambda x: x.replace(char, ''), row))
+        row = ''.join(row)
+        wnum = row.split(' ')
+        wnum = [x.lower() for x in wnum]
+        # remove duplicate words
+        wnum = list(dict.fromkeys(wnum))
+        # removing numbers
+        wonum = []
+        for x in wnum:
+            xv = list(x)
+            xv = [i.isnumeric() for i in xv]
+            if True in xv:
+                continue
+            else:
+                wonum.append(x)
+        row = ' '.join(wonum)
+        return row
+
 
 def classifier(userin):
-    clean_in =
+    clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
 
     Number = 10
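A worked illustration of the new clean_data on the String path (hypothetical input, output traced by hand): stopword filtering runs before lowercasing, so a capitalised 'A' slips past the lowercase stopword list, and any token containing a digit, such as a CPC code like B29C, is discarded by the numeric filter.

    sample = "A method (CPC: B29C) for recycling 100% of thermoplastic waste"
    print(clean_data(sample, type='String'))
    # -> roughly: "a method cpc recycling % thermoplastic waste"
    # 'for' and 'of' are removed as stopwords; 'B29C' and '100' contain digits and are dropped;
    # '%' survives because it is not in removechars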
@@ -139,6 +226,9 @@ def generateresponse(history):#, task):
     return history
 
 
+
+
+############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
 ).set(
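For context on this final hunk: the theme object built with gr.themes.Base(...).set(...) is what styles the Space's UI. A minimal, hypothetical sketch of how such a theme is typically attached to a Gradio Blocks app (the Space's real components and handlers sit outside this hunk):

    import gradio as gr

    theme = gr.themes.Base(primary_hue="indigo").set(
        # .set() overrides individual style variables of the base theme
        body_background_fill="white",
    )

    with gr.Blocks(theme=theme) as demo:
        chatbot = gr.Chatbot()  # the Space wires bot()/generateresponse() to components like these
        msg = gr.Textbox()

    demo.launch()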