EmicoBinsfinder committed
Commit 97632b5 · Parent: 5293e61

Update app.py

Files changed (1): app.py (+106 -16)

app.py CHANGED
@@ -1,31 +1,42 @@
 import os
 import gradio as gr
-from tridentmodel import classification
 import pandas as pd
 import torch
 import torch.nn as nn
 import transformers
-from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig
-
+from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, AutoModel
+import pandas as pd
+import tensorflow as tf
+import numpy as np
+import math
+import time
+import csv
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+nltk.download('punkt')
+import string
+
+########### DATA CLEANER VARIABLES #############
+all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
+extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
+all_stopwords.extend(extra_stopwords)
+
+########### GET CLAIMED TRAINED MODEL ###########
 auth_token = os.environ.get("AUTH_TOKEN_SECRET")
-
 tokenizer = LlamaTokenizer.from_pretrained("Claimed/capybara", use_auth_token=auth_token)
-
 model = LlamaForCausalLM.from_pretrained(
     "Claimed/capybara", use_auth_token=auth_token,
     load_in_8bit=True,
     device_map="auto") #low_cpu_mem_usage=True)
 
-#model = model.to('cuda')
-
+########## DEFINING FUNCTIONS ###################
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
 def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
-    """
-    Takes in pre-computed class embeddings and abstract texts, converts abstract text into
-    :param class_embeddings: dataframe of class embeddings
-    :param abstract: a single abstract embedding
-    :param N: N highest matching classes to return, from highest to lowest, default is 5
-    :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes
-    """
     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
     for i in range(len(class_embeddings)):
         class_name = class_embeddings.iloc[i, 0]
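
One thing worth flagging in the hunk above: the new mean_pooling builds its mask with PyTorch tensor ops (unsqueeze, expand, float) but reduces with tf.reduce_sum and tf.clip_by_value. If model_output comes from a Hugging Face PyTorch encoder, as the rest of this file suggests, an all-PyTorch version avoids crossing frameworks. The sketch below is the standard sentence-transformers mean-pooling recipe, offered as a hedged alternative rather than anything this commit contains:

import torch

def mean_pooling_torch(model_output, attention_mask):
    # model_output[0] holds the per-token embeddings from the encoder
    token_embeddings = model_output[0]
    # Expand the attention mask so padded tokens contribute nothing to the sum
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Average only over real tokens, clamping the denominator to avoid division by zero
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

(Separately, the new import block adds a second import pandas as pd alongside the existing one; harmless, but one copy could be dropped.)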
@@ -81,10 +92,86 @@ def bot(history):
 
 
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
-class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')
+def clean_data(input, type='Dataframe'):
+    """
+    As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
+    classifications, or an input string, so that embeddings can be calculated for them. Removes:
+      • Entries with missing abstracts/descriptions/classifications/typos
+      • Duplicate entries
+      • Unnecessary punctuation
+      • Stop words (e.g., by, a, an, he, she, it)
+      • URLs
+      • Entries in other languages, so that all entries are in the same language
+    :param input: Either a dataframe or an individual string
+    :param type: Tells the function whether the input is a dataframe or an individual string
+    :return: (if dataframe) a dataframe containing CPC classification codes and their associated 'cleaned' descriptions
+    :return: (if string) a 'cleaned' version of the input string
+    """
+    if type == 'Dataframe':
+        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
+        for i in range(0, len(input)):
+            row_list = input.loc[i, :].values.flatten().tolist()
+            noNaN_row = [x for x in row_list if str(x) != 'nan']
+            listrow = []
+            if len(noNaN_row) > 0:
+                row = noNaN_row[:-1]
+                row = [x.strip() for x in row]
+                row = (" ").join(row)
+                text_tokens = word_tokenize(row) # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+                Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords] # removes stopwords
+                row = (" ").join(Stopword_Filtered_List) # returns abstract to string form
+                removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+                for char in removechars:
+                    row = list(map(lambda x: x.replace(char, ''), row))
+                row = ''.join(row)
+                wnum = row.split(' ')
+                wnum = [x.lower() for x in wnum]
+                # remove duplicate words
+                wnum = list(dict.fromkeys(wnum))
+                # removing numbers
+                wonum = []
+                for x in wnum:
+                    xv = list(x)
+                    xv = [i.isnumeric() for i in xv]
+                    if True in xv:
+                        continue
+                    else:
+                        wonum.append(x)
+                row = ' '.join(wonum)
+                l = [noNaN_row[-1], row]
+                cleaneddf.loc[len(cleaneddf)] = l
+        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
+        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)
+        return cleaneddf
+
+    elif type == 'String':
+        text_tokens = word_tokenize(input) # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
+        Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords] # removes stopwords
+        row = (" ").join(Stopword_Filtered_List) # returns abstract to string form
+        removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
+        for char in removechars:
+            row = list(map(lambda x: x.replace(char, ''), row))
+        row = ''.join(row)
+        wnum = row.split(' ')
+        wnum = [x.lower() for x in wnum]
+        # remove duplicate words
+        wnum = list(dict.fromkeys(wnum))
+        # removing numbers
+        wonum = []
+        for x in wnum:
+            xv = list(x)
+            xv = [i.isnumeric() for i in xv]
+            if True in xv:
+                continue
+            else:
+                wonum.append(x)
+        row = ' '.join(wonum)
+        return row
+
 
 def classifier(userin):
-    clean_in = classification.clean_data(userin, type='String')
+    clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
 
     Number = 10
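
To make the behaviour of the clean_data path used by classifier concrete, here is an illustrative call to its 'String' branch; the sample abstract is made up, not from the repository:

# Illustrative only: exercising clean_data's 'String' branch.
sample = "a method for classifying 1,024 patent abstracts, using the cpc scheme"
print(clean_data(sample, type='String'))
# Stopwords ('a', 'for', 'the'), punctuation, and number-bearing tokens ('1,024')
# are stripped and duplicate words dropped, yielding roughly:
# "method classifying patent abstracts using cpc scheme"

Note that the stopword filter is case-sensitive: 'The' would survive where 'the' is removed, because stopwords.words('english') is all lowercase and tokens are only lowercased after the stopword pass.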
@@ -139,6 +226,9 @@ def generateresponse(history):#, task):
     return history
 
 
+
+
+############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
 ).set(
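
This final hunk stops mid-expression, before the UI definition itself. For orientation, a gr.themes.Base theme like the one above is normally passed to the Blocks container; the wiring below is a minimal sketch of standard Gradio usage with a hypothetical placeholder component, not code from this commit:

import gradio as gr

theme = gr.themes.Base(
    primary_hue="indigo",
)

with gr.Blocks(theme=theme) as demo:  # theme applied to the whole app
    chatbot = gr.Chatbot()  # hypothetical placeholder component
demo.launch()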
 