EmicoBinsfinder committed
Commit 97632b5 · Parent: 5293e61

Update app.py

Files changed (1): app.py (+106 -16)

app.py CHANGED
@@ -1,31 +1,42 @@
 import os
 import gradio as gr
-from tridentmodel import classification
 import pandas as pd
 import torch
 import torch.nn as nn
 import transformers
-from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig
-
+from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, AutoModel
+import pandas as pd
+import tensorflow as tf
+import numpy as np
+import math
+import time
+import csv
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+nltk.download('punkt')
+import string
+
+########### DATA CLEANER VARIABLES #############
+all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
+extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
+all_stopwords.extend(extra_stopwords)
+
+########### GET CLAIMED TRAINED MODEL ###########
 auth_token = os.environ.get("AUTH_TOKEN_SECRET")
-
 tokenizer = LlamaTokenizer.from_pretrained("Claimed/capybara", use_auth_token=auth_token)
-
 model = LlamaForCausalLM.from_pretrained(
     "Claimed/capybara", use_auth_token=auth_token,
     load_in_8bit=True,
     device_map="auto") #low_cpu_mem_usage=True)
 
-#model = model.to('cuda')
-
+########## DEFINING FUNCTIONS ###################
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
 def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
-    """
-    Takes in pre-computed class embeddings and abstract texts, converts abstract text into
-    :param class_embeddings: dataframe of class embeddings
-    :param abstract: a single abstract embedding
-    :param N: N highest matching classes to return, from highest to lowest, default is 5
-    :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes
-    """
     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
     for i in range(len(class_embeddings)):
         class_name = class_embeddings.iloc[i, 0]
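
One thing worth flagging in the hunk above: the new mean_pooling builds its mask with PyTorch tensor ops (unsqueeze, expand, float) but reduces with tf.reduce_sum and tf.clip_by_value. If model_output comes from a Hugging Face PyTorch encoder, as the rest of this file suggests, an all-PyTorch version avoids crossing frameworks. The sketch below is the standard sentence-transformers mean-pooling recipe, offered as a hedged alternative rather than anything this commit contains:

import torch

def mean_pooling_torch(model_output, attention_mask):
    # model_output[0] holds the per-token embeddings from the encoder
    token_embeddings = model_output[0]
    # Expand the attention mask so padded tokens contribute nothing to the sum
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Average only over real tokens, clamping the denominator to avoid division by zero
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

(Separately, the new import block adds a second import pandas as pd alongside the existing one; harmless, but one copy could be dropped.)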
@@ -81,10 +92,86 @@ def bot(history):
 
 
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
-class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')
+def clean_data(input, type='Dataframe'):
+    """
+    As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
+    classifications, or an input string, so that embeddings can be calculated for them. Removes:
+      • Entries with missing abstracts/descriptions/classifications/typos
+      • Duplicate entries
+      • Unnecessary punctuation
+      • Stop words (e.g., by, a, an, he, she, it)
+      • URLs
+      • Entries in other languages, so that all entries are in the same language
+    :param input: Either a dataframe or an individual string
+    :param type: Tells the function whether the input is a dataframe or an individual string
+    :return: (if dataframe) a dataframe containing CPC classification codes and their associated 'cleaned' descriptions
+    :return: (if string) a 'cleaned' version of the input string
+    """
+    if type == 'Dataframe':
+        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
+        for i in range(0, len(input)):
+            row_list = input.loc[i, :].values.flatten().tolist()
+            noNaN_row = [x for x in row_list if str(x) != 'nan']
+            listrow = []
+            if len(noNaN_row) > 0:
+                row = noNaN_row[:-1]
+                row = [x.strip() for x in row]
+                row = (" ").join(row)
+                text_tokens = word_tokenize(row) # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+                Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords] # removes stopwords
+                row = (" ").join(Stopword_Filtered_List) # returns abstract to string form
+                removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+                for char in removechars:
+                    row = list(map(lambda x: x.replace(char, ''), row))
+                row = ''.join(row)
+                wnum = row.split(' ')
+                wnum = [x.lower() for x in wnum]
+                # remove duplicate words
+                wnum = list(dict.fromkeys(wnum))
+                # removing numbers
+                wonum = []
+                for x in wnum:
+                    xv = list(x)
+                    xv = [i.isnumeric() for i in xv]
+                    if True in xv:
+                        continue
+                    else:
+                        wonum.append(x)
+                row = ' '.join(wonum)
+                l = [noNaN_row[-1], row]
+                cleaneddf.loc[len(cleaneddf)] = l
+        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
+        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)
+        return cleaneddf
+
+    elif type == 'String':
+        text_tokens = word_tokenize(input) # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+        Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords] # removes stopwords
+        row = (" ").join(Stopword_Filtered_List) # returns abstract to string form
+        removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+        for char in removechars:
+            row = list(map(lambda x: x.replace(char, ''), row))
+        row = ''.join(row)
+        wnum = row.split(' ')
+        wnum = [x.lower() for x in wnum]
+        # remove duplicate words
+        wnum = list(dict.fromkeys(wnum))
+        # removing numbers
+        wonum = []
+        for x in wnum:
+            xv = list(x)
+            xv = [i.isnumeric() for i in xv]
+            if True in xv:
+                continue
+            else:
+                wonum.append(x)
+        row = ' '.join(wonum)
+        return row
+
 
 def classifier(userin):
-    clean_in = classification.clean_data(userin, type='String')
+    clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
 
     Number = 10
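
To make the behaviour of the clean_data path used by classifier concrete, here is an illustrative call to its 'String' branch; the sample abstract is made up, not from the repository:

# Illustrative only: exercising clean_data's 'String' branch.
sample = "a method for classifying 1,024 patent abstracts, using the cpc scheme"
print(clean_data(sample, type='String'))
# Stopwords ('a', 'for', 'the'), punctuation, and number-bearing tokens ('1,024')
# are stripped and duplicate words dropped, yielding roughly:
# "method classifying patent abstracts using cpc scheme"

Note that the stopword filter is case-sensitive: 'The' would survive where 'the' is removed, because stopwords.words('english') is all lowercase and tokens are only lowercased after the stopword pass.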
@@ -139,6 +226,9 @@ def generateresponse(history):#, task):
     return history
 
 
+
+
+############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
 ).set(
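
This final hunk stops mid-expression, before the UI definition itself. For orientation, a gr.themes.Base theme like the one above is normally passed to the Blocks container; the wiring below is a minimal sketch of standard Gradio usage with a hypothetical placeholder component, not code from this commit:

import gradio as gr

theme = gr.themes.Base(
    primary_hue="indigo",
)

with gr.Blocks(theme=theme) as demo:  # theme applied to the whole app
    chatbot = gr.Chatbot()  # hypothetical placeholder component
demo.launch()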
 