Spaces:

GT4SD
/

PatentToolkit

Runtime error

File size: 13,778 Bytes

6d10c64
a1bc39d
4c61fcd
7de20a0
 
 
97632b5
 
 
 
 
 
 
 
 
 
 
 
 
 
4515305
 
 
97632b5
 
 
 
 
 
d5be6af
 
cab9610
2b64787
 
 
 
97632b5
 
 
 
 
e610ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5293e61
e610ece
1423dfb
 
 
 
 
 
 
 
 
 
a1bc39d
 
 
 
 
 
 
 
 
 
 
 
 
192d9af
9420ba3
97632b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9420ba3
46d20c8
97632b5
1423dfb
4515305
9e8be11
e610ece
9e8be11
e610ece
9e8be11
eb55011
53bd282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e450f6c
53bd282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09e2691
53bd282
7f2ec6d
09e2691
7f2ec6d
09e2691
53bd282
 
 
7f2ec6d
97632b5
7f2ec6d
 
 
 
 
 
 
3eb58cd
 
b1b3f71
 
4515305
3fbe817
b1b3f71
9224ffd
b24494c
4ca8440
5bdbc1f
3eb58cd
 
fd71939
b1b3f71
3eb58cd
04fbf3e
3eb58cd
04fbf3e
 
3c56f3a
 
 
 
21d804e
9907d16
9224ffd
429953d
04fbf3e
46d20c8
5b4db95
3c56f3a
9907d16
5b4db95
 
 
04fbf3e
 
 
 
 
 
 
 
5b4db95
04fbf3e
 
 
 
 
 
 
 
3eb58cd
919b891
 
 
9224ffd
919b891
9224ffd
919b891
 
 
 
 
 
92d4436
8373fbf
b56fd2b
92d4436
c585a5d
429953d
b56fd2b
9224ffd
919b891
46d20c8
9907d16
8cacfad
 
 
 
 
d6b5ec6
9907d16
4ca8440
 
 
f076e4f
 
92d4436
4ca8440
92d4436
4ca8440
 
 
f076e4f
38098e8
46d20c8
99c33b8
e5a45fc
 
 
914a779
e5a45fc
 
7f2ec6d
53bd282
 
 
8e7188b
38098e8
 
 
 
 
212ca5f
38098e8
212ca5f
38098e8
 
 
61c7a4c
 
bef2a73

import os
import gradio as gr
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, AutoModel
import pandas as pd
import tensorflow as tf
import numpy as np
import math
import time
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string

########### Import Classifier Embeddings #########
# Pre-computed class embeddings, read once at import time from a path relative
# to the working directory. broad_scope_class_predictor reads column 0 as the
# class name and column 2 as the stored embedding.
class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')

########### DATA CLEANER VARIABLES #############
# Stopword list consulted by clean_data; membership tests against it are
# case-sensitive.
all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
all_stopwords.extend(extra_stopwords)

########### GET CLAIMED TRAINED MODEL ###########
# The access token is taken from the AUTH_TOKEN_SECRET environment variable;
# os.environ.get yields None when the variable is unset.
auth_token = os.environ.get("AUTH_TOKEN_SECRET")
# Module-level tokenizer/model pair used by generateresponse. Both are loaded
# at import time, so importing this module triggers the model load.
tokenizer = LlamaTokenizer.from_pretrained("Claimed/capybara", use_auth_token=auth_token)
model = LlamaForCausalLM.from_pretrained(
     "Claimed/capybara", use_auth_token=auth_token, 
    load_in_8bit=True,
    device_map="auto") #low_cpu_mem_usage=True)

########## DEFINING FUNCTIONS ###################
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    The computation stays entirely in PyTorch: the inputs are torch tensors
    (the caller tokenizes with ``return_tensors='pt'``), so routing them
    through TensorFlow ops only added a framework round-trip.

    Args:
        model_output: Model forward output; element 0 holds the token
            embeddings as a torch tensor.
        attention_mask: Torch tensor with one entry per token; non-zero
            marks tokens that contribute to the average.

    Returns:
        A torch tensor with the sequence axis (dim 1) reduced away. Rows
        whose mask is all zeros come back as zeros rather than NaN.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9, not 0.
    token_counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    return summed / token_counts
def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
    """Rank classes by cosine similarity to an abstract embedding.

    Args:
        class_embeddings: DataFrame whose column 0 holds the class name and
            whose column 2 holds the stored embedding, which is passed
            through ``convert_saved_embeddings`` before comparison.
        abstract_embedding: Tensor exposing ``.numpy()``; it is converted to
            a torch tensor once before scoring.
        N: Number of highest-scoring classes to return.
        Sensitivity: Accepted for backward compatibility only. The previous
            implementation mapped it to a score threshold and computed a
            flag that was never returned, iterating over an undefined name
            (``greenpredictions``) in the process, which raised NameError on
            every call. It has no effect on the returned ranking.

    Returns:
        DataFrame with columns ``'Class Name'`` and ``'Score'`` containing
        the ``N`` rows with the largest scores.
    """
    # Loop-invariant: convert the query embedding to torch a single time.
    abstract_embedding = torch.from_numpy(abstract_embedding.numpy())
    cos = torch.nn.CosineSimilarity(dim=1)

    rows = []
    for class_name, saved in zip(class_embeddings.iloc[:, 0], class_embeddings.iloc[:, 2]):
        embedding = convert_saved_embeddings(saved)
        score = cos(abstract_embedding, embedding).numpy().tolist()
        rows.append([class_name, score[0]])

    predictions = pd.DataFrame(rows, columns=['Class Name', 'Score'])
    # nlargest needs a numeric column, including when there are no rows.
    predictions['Score'] = predictions['Score'].astype(float)
    return predictions.nlargest(N, ['Score'])

def sentence_embedder(sentences, model_path):
    """Embed ``sentences`` with the Hugging Face model stored at ``model_path``.

    The tokenizer and model are loaded from ``model_path`` on every call (the
    model from TensorFlow weights via ``from_tf=True``). The input is
    tokenized with padding and truncation, run through the model without
    gradient tracking, and reduced with ``mean_pooling``.

    Args:
        sentences: Text passed straight to the tokenizer.
        model_path: Location handed to ``from_pretrained``.

    Returns:
        Whatever ``mean_pooling`` returns for the model output and the
        attention mask.
    """
    embed_tokenizer = AutoTokenizer.from_pretrained(model_path)
    embed_model = AutoModel.from_pretrained(model_path, from_tf=True)

    batch = embed_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        forward_output = embed_model(**batch)

    return mean_pooling(forward_output, batch['attention_mask'])
    
def add_text(history, text):
    """Append ``text`` as a new chat turn that has no reply yet.

    Returns:
        A 2-tuple: a new history list ending in ``(text, None)`` (the input
        list is not mutated), and an empty string.
    """
    pending_turn = (text, None)
    return history + [pending_turn], ""

def add_file(history, file):
    """Append an uploaded file as a new chat turn that has no reply yet.

    The turn's message is the 1-tuple ``(file.name,)``. A new list is
    returned; the input history is not mutated.
    """
    upload_turn = ((file.name,), None)
    return history + [upload_turn]

def bot(history):
    """Fill the reply slot of the latest turn with a fixed placeholder.

    Mutates ``history`` in place (the last turn must support item
    assignment) and returns the same list.
    """
    latest_turn = history[-1]
    latest_turn[1] = "**That's cool!**"
    return history


########## DATA CLEANING AND CLASSIFICATION ##########
# Deletion table for the punctuation/symbol characters stripped from text.
_STRIP_TABLE = str.maketrans('', '', '[]{};(),.:/-#?@£$')


def _clean_text(text):
    """Drop stopwords, listed punctuation, duplicate words and words with digits."""
    tokens = word_tokenize(text)
    # NOTE: the stopword test is case-sensitive and runs before lower-casing,
    # so capitalised stopwords are kept. Left unchanged so cleaned text stays
    # consistent with anything produced by earlier runs of this function.
    kept = [word for word in tokens if word not in all_stopwords]
    stripped = " ".join(kept).translate(_STRIP_TABLE)
    lowered = [word.lower() for word in stripped.split(' ')]
    # dict.fromkeys removes repeated words while keeping first-seen order.
    unique_words = dict.fromkeys(lowered)
    return ' '.join(
        word for word in unique_words
        if not any(ch.isnumeric() for ch in word)
    )


def clean_data(input, type='Dataframe',
               output_path='E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv'):
    """Clean either a whole DataFrame of class descriptions or a single string.

    Cleaning tokenizes the text, removes stopwords (``all_stopwords``),
    strips a fixed set of punctuation characters, lower-cases, removes
    repeated words and drops every word that contains a numeric character.

    Args:
        input: A DataFrame when ``type == 'Dataframe'``; a string when
            ``type == 'String'``. (The name shadows a builtin but is kept
            because it is part of the existing signature.)
        type: ``'Dataframe'`` or ``'String'``.
        output_path: Only used for ``'Dataframe'``. Where the cleaned frame
            is written as CSV; the default is the path that used to be
            hard-coded in the body. Pass ``None`` to skip writing.

    Returns:
        For ``'Dataframe'``: a DataFrame with columns ``'Class'`` and
        ``'Description'``, de-duplicated on ``'Description'``. In each input
        row the last non-NaN cell becomes the class and the preceding
        non-NaN cells, stripped and space-joined, become the description.
        Rows with no non-NaN cells are skipped.
        For ``'String'``: the cleaned string.

    Raises:
        ValueError: If ``type`` is neither ``'Dataframe'`` nor ``'String'``
            (previously this fell through and returned ``None``).
    """
    if type == 'Dataframe':
        cleaned_rows = []
        for values in input.itertuples(index=False, name=None):
            present = [x for x in values if str(x) != 'nan']
            if not present:
                continue
            description = " ".join(x.strip() for x in present[:-1])
            cleaned_rows.append([present[-1], _clean_text(description)])

        cleaneddf = pd.DataFrame(cleaned_rows, columns=['Class', 'Description'])
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        if output_path is not None:
            cleaneddf.to_csv(output_path, index=False)
        return cleaneddf

    if type == 'String':
        return _clean_text(input)

    raise ValueError(f"Unsupported type {type!r}; expected 'Dataframe' or 'String'")


def classifier(userin):
    """Return the ten best-matching classes for the user's text.

    The text is cleaned with ``clean_data`` in ``'String'`` mode, embedded
    with ``sentence_embedder`` using the ``'Model_bert'`` model path, and
    scored against the module-level ``class_embeddings`` by
    ``broad_scope_class_predictor`` (called with ``Sensitivity='High'``).
    """
    top_k = 10
    cleaned_text = clean_data(userin, type='String')
    text_embedding = sentence_embedder(cleaned_text, 'Model_bert')
    return broad_scope_class_predictor(class_embeddings, text_embedding, top_k, Sensitivity='High')
    
def generateresponse(history):
    """Generate a reply for the latest chat turn and store it in ``history``.

    The user message of the last turn is placed into an instruction-style
    prompt, tokenized with the module-level ``tokenizer``, moved to CUDA and
    passed to the module-level ``model``. Each generated sequence is decoded
    and printed; the text after ``'### Response:'`` in the first decoded
    sequence is written, prefixed with ``"Response: "``, into the reply slot
    of the last turn.

    Args:
        history: Chat history; the last element must be an indexable,
            mutable ``[user_message, reply]`` pair.

    Returns:
        The same ``history`` object, mutated in place.
    """
    latest_turn = history[-1]
    user = latest_turn[0]

    PROMPT = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
    ### Instruction:
    {user}
    ### Response:"""

    encoded = tokenizer(PROMPT, return_tensors="pt")
    prompt_ids = encoded["input_ids"].cuda()

    sampling_config = GenerationConfig(
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.15,
    )

    print("Generating...")
    generated = model.generate(
        input_ids=prompt_ids,
        generation_config=sampling_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )

    decoded_sequences = []
    for sequence in generated.sequences:
        text = tokenizer.decode(sequence)
        decoded_sequences.append(text)
        print(text)

    # The decoded text still contains the prompt; keep what follows the marker.
    reply_text = decoded_sequences[0].split('### Response:')[1]

    latest_turn[1] = f"Response: {reply_text}"
    print(history)
    return history

############# GRADIO APP ###############
# Base theme with an indigo primary hue and small prose text.
theme = gr.themes.Base(
    primary_hue="indigo",
).set(
    prose_text_size='*text_sm'
)

# Page layout: intro text, three tool tabs, a free-form chatbot, closing text.
with gr.Blocks(title='Claimed', theme=theme) as demo:

    gr.Markdown("""
    # CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS

    The patenting process can be incredibly time-consuming and expensive. We're on a mission to change that.
    
    Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.

    Please note that this is for research purposes and shouldn't be used commercially. 

    None of the outputs of this model, taken in part or in its entirety, constitutes legal advice. If you are seeking protection for you intellectual property, consult a registered patent/trademark attorney.
    
    """)

    # NOTE: the components in this tab are display-only for now; no event
    # handler is attached to the dropdown or the textboxes.
    with gr.Tab("Claim Drafter"):
        gr.Markdown(""" 
        Use this tool to expand your idea into the technical language of a patent claim. You can specify the type of claim you want using the dropdown menu.
        """)
        gr.Dropdown(["Apparatus Claim", "Method of Use Claim", "Method Claim", ], label='Choose Claim Type Here')

        with gr.Row(scale=1, min_width=600):
            text1 = gr.Textbox(label="Input",
                              placeholder='Type in your idea here!')
            text2 = gr.Textbox(label="Output")

    # NOTE: as above, nothing in this tab is wired to a handler yet.
    with gr.Tab("Description Generator"):
        gr.Markdown(""" 
        Use this tool to expand your patent claim into a description. You can also use this tool to generate abstracts and give you ideas about the benefit of an invention by changing the settings in the dropdown menu.
         """)
        gr.Dropdown(["Generate Description", "Generate Abstract", "Benefits of the invention"], label='Choose Generation Type Here')
        with gr.Row(scale=1, min_width=600):

            text1 = gr.Textbox(label="Input",
                              placeholder='Type in your idea here!')
            text2 = gr.Textbox(label="Output")

    # Tabs below are disabled placeholders kept from the original layout.
    # with gr.Tab("Knowledge Graph"):
    #     gr.Markdown(""" 
    #     Use this tool to generate a knowledge graph of your invention. This will help highlight the links between features. 
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                           placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")

    # with gr.Tab("Prosecution Ideator"):
    #     gr.Markdown(""" 
    #     Use this tool to generate ideas for how to overcome objections to novelty and inventive step. Outputs are in the problem-solution format. 
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                           placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")

    # with gr.Tab("Claimed Infill"):
    #     gr.Markdown(""" 
    #     Below is our 

    #     Example input: A device to help the visually impaired using proprioception.

    #     Output: 
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                           placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")


    with gr.Tab("CPC Search Tool"):
        gr.Markdown("""
        Use this tool to classify your invention according to the Cooperative Patent Classification system. 

        Click on the link to initiate either an Espacenet or Google Patents classification search using the generated classifications. You can specify which you would like using the dropdown menu. 
        """)

        # NOTE: this dropdown is not passed to any handler.
        gr.Dropdown(["Google Patent Search", "Espacenet Patent Search"], label='Choose Search Type Here')
        with gr.Row(scale=1, min_width=600):
            userin = gr.Textbox(label="Input",
                              placeholder='Type in your Claim/Description/Abstract Here')
            output = gr.Textbox(label="Output")
        with gr.Row():
            classify_btn = gr.Button("Classify")
            # Clicking "Classify" sends the input text to classifier and
            # shows its result in the output box.
            classify_btn.click(fn=classifier, inputs=[userin] , outputs=output)


    gr.Markdown(""" 

    # THE CHATBOT

    Do you want a bit more freedom over the outputs you generate? No worries, you can use a chatbot version of our model below. You can ask it anything. 

    If you're concerned about a particular output, hit the flag button and we will use that information to improve the model.


    """)


    chatbot = gr.Chatbot([], elem_id="Claimed Assistant").style(height=500)
    with gr.Row():
        with gr.Column(scale=0.85):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and submit",
            ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            btn = gr.Button("Submit")

        # Two-step chain: add_text records the user turn and clears the box,
        # then generateresponse fills in the model's reply.
        txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
            generateresponse, chatbot, chatbot)
        # The Submit button previously had no handler, so clicking it did
        # nothing; it now runs the same chain as pressing Enter.
        btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
            generateresponse, chatbot, chatbot)

    gr.Markdown("""
    # HAVE AN IDEA? GET IT CLAIMED 

    In the future, we are looking to expand our model's capabilities further to assist in a range of IP related tasks.

    If you are interested in using a more powerful model that we have trained, or if you have any suggestions of features you would like to see us add, please get in touch!

    As far as data is concerned, you have nothing to worry about! We don't store any of your inputs to use for further training, we're not OpenAI.  
    
    """)



# Start the Gradio server for the app defined above.
demo.launch()