# Spaces: Runtime error (page-status text captured with this file; not part of the program)
import os | |
import gradio as gr | |
import pandas as pd | |
import torch | |
import torch.nn as nn | |
import transformers | |
from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, AutoModel | |
import pandas as pd | |
import tensorflow as tf | |
import numpy as np | |
import math | |
import time | |
import csv | |
import nltk | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
import string | |
########### Import Classifier Embeddings #########
# Table of pre-computed class embeddings, read once at import time.
class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')

########### DATA CLEANER VARIABLES #############
# Extra tokens to strip from datasets/input abstracts on top of NLTK's list.
extra_stopwords = ['ii', 'iii']
# English stop words only, extended in place with the extras above.
all_stopwords = stopwords.words('english')
all_stopwords += extra_stopwords

########### GET CLAIMED TRAINED MODEL ###########
# The access token is read from the environment (None when the variable is unset).
auth_token = os.getenv("AUTH_TOKEN_SECRET")
tokenizer = LlamaTokenizer.from_pretrained(
    "Claimed/capybara",
    use_auth_token=auth_token,
)
# Loaded with 8-bit weights and automatic device placement.
# (A `low_cpu_mem_usage=True` option was left commented out by the author.)
model = LlamaForCausalLM.from_pretrained(
    "Claimed/capybara",
    use_auth_token=auth_token,
    load_in_8bit=True,
    device_map="auto",
)
########## DEFINING FUNCTIONS ################### | |
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    The computation stays entirely in PyTorch. The previous version passed
    torch tensors through TensorFlow ops, which forced an implicit framework
    conversion and returned a TensorFlow tensor.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings.
        attention_mask: Mask tensor with one entry per token; it is expanded
            to the shape of the token embeddings before use.

    Returns:
        A torch tensor holding, for each input row, the mask-weighted mean of
        its token embeddings. Callers that need an array can still call
        ``.numpy()`` on the result.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9 rather than 0.
    counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    return summed / counts
def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
    """Rank the stored classes by cosine similarity to an abstract embedding.

    Args:
        class_embeddings: DataFrame of pre-computed class embeddings. Column 0
            is read as the class name and column 2 as the saved embedding,
            which is passed through ``convert_saved_embeddings``.
        abstract_embedding: Embedding of the query text. A torch tensor is
            used directly; any other object is converted through its
            ``.numpy()`` method (or ``numpy.asarray`` if it has none).
        N: Number of best-scoring rows to return.
        Sensitivity: Accepted for backward compatibility only. The previous
            version mapped it to a threshold for a "GreenLikelihood" flag that
            was never returned, and computed that flag from an undefined name
            (``greenpredictions``), so every call raised ``NameError``. That
            dead code is removed; this argument no longer affects the result.

    Returns:
        DataFrame with columns ``'Class Name'`` and ``'Score'`` containing the
        ``N`` rows with the highest score.
    """
    # Convert the query once, not on every loop iteration.
    if isinstance(abstract_embedding, torch.Tensor):
        query = abstract_embedding.detach().cpu()
    else:
        to_numpy = getattr(abstract_embedding, 'numpy', None)
        array = to_numpy() if callable(to_numpy) else np.asarray(abstract_embedding)
        query = torch.from_numpy(array)

    cos = torch.nn.CosineSimilarity(dim=1)
    rows = []
    names = class_embeddings.iloc[:, 0]
    saved_embeddings = class_embeddings.iloc[:, 2]
    for class_name, saved in zip(names, saved_embeddings):
        # NOTE(review): convert_saved_embeddings is not defined in the visible
        # part of this file -- confirm it is provided elsewhere and returns a
        # tensor that CosineSimilarity(dim=1) can compare with the query.
        embedding = convert_saved_embeddings(saved)
        score = cos(query, embedding).numpy().tolist()
        rows.append([class_name, score[0]])

    # Build the frame in one go; the explicit float dtype keeps nlargest
    # usable even when there are no rows.
    predictions = pd.DataFrame(rows, columns=['Class Name', 'Score'])
    predictions = predictions.astype({'Score': float})
    return predictions.nlargest(N, ['Score'])
# Tokenizer/model pairs already loaded by sentence_embedder, keyed by model path.
_EMBEDDER_CACHE = {}


def _load_embedder(model_path):
    """Return the (tokenizer, model) pair for ``model_path``, loading it on first use."""
    if model_path not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[model_path] = (
            AutoTokenizer.from_pretrained(model_path),
            # from_tf=True: the checkpoint is loaded from TensorFlow weights.
            AutoModel.from_pretrained(model_path, from_tf=True),
        )
    return _EMBEDDER_CACHE[model_path]


def sentence_embedder(sentences, model_path):
    """Embed text with the sentence-embedding model stored at ``model_path``.

    The tokenizer and model are loaded once per path and reused on later
    calls; previously both were re-read from disk on every call. Local names
    no longer shadow the module-level Llama ``tokenizer`` and ``model``.

    Args:
        sentences: Text (or list of texts) handed to the tokenizer.
        model_path: Path or identifier accepted by ``from_pretrained``.

    Returns:
        The mean-pooled embeddings produced by ``mean_pooling``. (The original
        comment describes a (1, 384) result for a single input text -- TODO
        confirm against the model in use.)
    """
    embed_tokenizer, embed_model = _load_embedder(model_path)
    encoded_input = embed_tokenizer(
        sentences, padding=True, truncation=True, return_tensors='pt'
    )
    # Inference only: no gradients are needed for the token embeddings.
    with torch.no_grad():
        model_output = embed_model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])
def add_text(history, text):
    """Append the user's message as a new, unanswered chat turn.

    Returns the extended history (the input list is not modified) together
    with an empty string.
    """
    pending_turn = (text, None)
    updated_history = history + [pending_turn]
    return updated_history, ""
def add_file(history, file):
    """Append an uploaded file as a new, unanswered chat turn.

    The turn's first element is a one-item tuple holding ``file.name``. A new
    list is returned; the input history is left untouched.
    """
    file_turn = ((file.name,), None)
    return history + [file_turn]
def bot(history):
    """Fill in the reply slot of the latest chat turn with a fixed placeholder.

    The history is modified in place and the same object is returned.
    """
    latest_turn = history[-1]
    latest_turn[1] = "**That's cool!**"
    return history
########## DATA CLEANING ##########
# Default CSV destination used by clean_data in 'Dataframe' mode.
# NOTE(review): this is an absolute path on a developer machine; pass
# output_path explicitly (or None to skip writing) when running elsewhere.
_DEFAULT_CLEANED_CSV = 'E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv'

# Characters deleted from the text after stop-word filtering.
_REMOVED_CHARS = str.maketrans('', '', '[]{};(),.:/-#?@£$')


def _clean_text(text):
    """Apply the shared cleaning pipeline to one piece of text.

    Steps, in the same order as the original code so that the output string
    is unchanged: tokenize, drop stop words (matched before lowercasing, so
    the match is case-sensitive), delete the characters in ``_REMOVED_CHARS``,
    split on single spaces, lowercase, drop repeated words keeping the first
    occurrence, and drop every word containing a numeric character.
    """
    tokens = word_tokenize(text)
    kept = [word for word in tokens if word not in all_stopwords]
    stripped = ' '.join(kept).translate(_REMOVED_CHARS)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_words = dict.fromkeys(word.lower() for word in stripped.split(' '))
    return ' '.join(
        word for word in unique_words
        if not any(ch.isnumeric() for ch in word)
    )


def clean_data(input, type='Dataframe', output_path=_DEFAULT_CLEANED_CSV):
    """Clean class descriptions (DataFrame) or a single abstract (string).

    Args:
        input: In ``'Dataframe'`` mode, a DataFrame in which, for each row,
            the last non-NaN cell is the class label and the preceding
            non-NaN cells are string fragments of its description. In
            ``'String'`` mode, the text to clean.
        type: ``'Dataframe'`` or ``'String'``.
        output_path: ``'Dataframe'`` mode only. Where the cleaned table is
            written as CSV; ``None`` skips the write. The default is the
            path that used to be hard-coded.

    Returns:
        ``'Dataframe'`` mode: DataFrame with columns ``'Class'`` and
        ``'Description'``, de-duplicated on ``'Description'``.
        ``'String'`` mode: the cleaned string.
        Any other ``type``: ``None`` (unchanged from the original behaviour).
    """
    if type == 'Dataframe':
        rows = []
        # Positional access, so frames without a 0..n-1 index also work.
        for position in range(len(input)):
            values = input.iloc[position].values.flatten().tolist()
            present = [x for x in values if str(x) != 'nan']
            if not present:
                continue
            *description_parts, class_label = present
            description = ' '.join(part.strip() for part in description_parts)
            rows.append([class_label, _clean_text(description)])
        cleaneddf = pd.DataFrame(rows, columns=['Class', 'Description'])
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        if output_path is not None:
            cleaneddf.to_csv(output_path, index=False)
        return cleaneddf
    if type == 'String':
        return _clean_text(input)
    return None
def classifier(userin):
    """Return the ten best-matching classes for the text typed by the user.

    The text is cleaned in 'String' mode, embedded with the model stored at
    'Model_bert', and scored against the module-level class embeddings.
    """
    cleaned_text = clean_data(userin, type='String')
    text_embedding = sentence_embedder(cleaned_text, 'Model_bert')
    top_n = 10
    return broad_scope_class_predictor(
        class_embeddings, text_embedding, top_n, Sensitivity='High'
    )
def generateresponse(history):
    """Generate the model's reply to the latest chat turn and store it in place.

    The last entry of ``history`` supplies the user message. It is wrapped in
    the instruction prompt, passed to the module-level Llama ``model`` and
    ``tokenizer``, and the reply is written into that entry's second slot.

    Changes from the previous version:
      * Inputs are moved to ``model.device`` instead of an unconditional
        ``.cuda()``, matching wherever ``device_map="auto"`` placed the model.
      * The reply is decoded from the newly generated tokens only. Splitting
        the full text on '### Response:' returned the wrong segment whenever
        the user's own message contained that marker, and left special tokens
        in the reply.

    Args:
        history: List of chat turns; each turn is a mutable two-item sequence
            ``[user_message, bot_reply]``.

    Returns:
        The same ``history`` object with the last turn's reply filled in.
    """
    user = history[-1][0]
    PROMPT = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{user}
### Response:"""
    inputs = tokenizer(
        PROMPT,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].to(model.device)
    generation_config = GenerationConfig(
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    print("Generating...")
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    for sequence in generation_output.sequences:
        print(tokenizer.decode(sequence))
    # Each returned sequence starts with the prompt tokens; keep only what
    # the model added after them.
    prompt_length = input_ids.shape[1]
    generated_tokens = generation_output.sequences[0][prompt_length:]
    outputs = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    response = f"Response: {outputs}"
    history[-1][1] = response
    print(history)
    return history
############# GRADIO APP ###############
# Base theme with an indigo primary hue; prose text size is then overridden.
theme = gr.themes.Base(primary_hue="indigo")
theme = theme.set(prose_text_size='*text_sm')
# Page layout: an intro, three tool tabs, a chatbot section and a closing note.
# Markdown text is kept at column 0 so the string contents are unchanged.
with gr.Blocks(title='Claimed', theme=theme) as demo:
    gr.Markdown("""
# CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS
The patenting process can be incredibly time-consuming and expensive. We're on a mission to change that.
Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.
Please note that this is for research purposes and shouldn't be used commercially.
None of the outputs of this model, taken in part or in its entirety, constitutes legal advice. If you are seeking protection for you intellectual property, consult a registered patent/trademark attorney.
""")

    # No callback is attached to the components of this tab in this file.
    with gr.Tab("Claim Drafter"):
        gr.Markdown("""
Use this tool to expand your idea into the technical language of a patent claim. You can specify the type of claim you want using the dropdown menu.
""")
        gr.Dropdown(["Apparatus Claim", "Method of Use Claim", "Method Claim", ], label='Choose Claim Type Here')
        with gr.Row(scale=1, min_width=600):
            text1 = gr.Textbox(label="Input",
                               placeholder='Type in your idea here!')
            text2 = gr.Textbox(label="Output")

    # No callback is attached to the components of this tab in this file.
    with gr.Tab("Description Generator"):
        gr.Markdown("""
Use this tool to expand your patent claim into a description. You can also use this tool to generate abstracts and give you ideas about the benefit of an invention by changing the settings in the dropdown menu.
""")
        gr.Dropdown(["Generate Description", "Generate Abstract", "Benefits of the invention"], label='Choose Generation Type Here')
        with gr.Row(scale=1, min_width=600):
            text1 = gr.Textbox(label="Input",
                               placeholder='Type in your idea here!')
            text2 = gr.Textbox(label="Output")

    # with gr.Tab("Knowledge Graph"):
    #     gr.Markdown("""
    #     Use this tool to generate a knowledge graph of your invention. This will help highlight the links between features.
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                            placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")
    # with gr.Tab("Prosecution Ideator"):
    #     gr.Markdown("""
    #     Use this tool to generate ideas for how to overcome objections to novelty and inventive step. Outputs are in the problem-solution format.
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                            placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")
    # with gr.Tab("Claimed Infill"):
    #     gr.Markdown("""
    #     Below is our
    #     Example input: A device to help the visually impaired using proprioception.
    #     Output:
    #     """)
    #     with gr.Row(scale=1, min_width=600):
    #         text1 = gr.Textbox(label="Input",
    #                            placeholder='Type in your idea here!')
    #         text2 = gr.Textbox(label="Output")

    with gr.Tab("CPC Search Tool"):
        gr.Markdown("""
Use this tool to classify your invention according to the Cooperative Patent Classification system.
Click on the link to initiate either an Espacenet or Google Patents classification search using the generated classifications. You can specify which you would like using the dropdown menu.
""")
        gr.Dropdown(["Google Patent Search", "Espacenet Patent Search"], label='Choose Search Type Here')
        with gr.Row(scale=1, min_width=600):
            userin = gr.Textbox(label="Input",
                                placeholder='Type in your Claim/Description/Abstract Here')
            output = gr.Textbox(label="Output")
        with gr.Row():
            classify_btn = gr.Button("Classify")
        # The classifier result is shown in the "Output" textbox.
        classify_btn.click(fn=classifier, inputs=[userin], outputs=output)

    gr.Markdown("""
# THE CHATBOT
Do you want a bit more freedom over the outputs you generate? No worries, you can use a chatbot version of our model below. You can ask it anything.
If you're concerned about a particular output, hit the flag button and we will use that information to improve the model.
""")
    chatbot = gr.Chatbot([], elem_id="Claimed Assistant").style(height=500)
    with gr.Row():
        with gr.Column(scale=0.85):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and submit",
            ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            btn = gr.Button("Submit")

    # Pressing Enter in the textbox appends the message, then generates a reply.
    txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
        generateresponse, chatbot, chatbot)
    # The Submit button previously had no handler and did nothing when
    # clicked; it now runs the same two steps as pressing Enter.
    btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
        generateresponse, chatbot, chatbot)

    gr.Markdown("""
# HAVE AN IDEA? GET IT CLAIMED
In the future, we are looking to expand our model's capabilities further to assist in a range of IP related tasks.
If you are interested in using a more powerful model that we have trained, or if you have any suggestions of features you would like to see us add, please get in touch!
As far as data is concerned, you have nothing to worry about! We don't store any of your inputs to use for further training, we're not OpenAI.
""")

demo.launch()