from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
import spacy
import pytextrank
from nlp_entities import *
import torch
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
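# The checkpoint name suggests a DistilBERT masked-LM fine-tuned on Cvent 2019-2022 data;
# output_hidden_states=True is needed so the pooling helpers below can read the last hidden layer.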
# Streamlit inputs: tags, text to classify, and number of key phrases to extract
tags = st.text_input("Input tags separated by commas")
text = st.text_input("Input text to classify")
topkp = st.slider("Number of key phrases to extract from text", 10, 30, 20)
#Methods for tag processing
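# Mean-pool the last hidden layer over non-padding tokens: padding positions are zeroed out
# via the attention mask, then each row's token embeddings are summed and divided by its
# real token count, giving one vector per input sequence.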
def pool_embeddings(out, tok):
    embeddings = out["hidden_states"][-1]
    attention_mask = tok['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask
    return mean_pooled
import pandas as pd
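# Helper to pull a lowercased transcript out of a JSON file (not called elsewhere in this
# script); the ['results'].values[1][0] indexing presumably matches the output format of the
# speech-to-text service used upstream.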
def get_transcript(file):
    data = pd.read_json(file)
    transcript = data['results'].values[1][0]['transcript']
    transcript = transcript.lower()
    return transcript
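# Tokenize a list of tag strings into a single padded batch (max_length=64); the raw strings
# are carried along under 'KPS'.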
def concat_tokens_tags(sentences):
    tokens = {'input_ids': [], 'attention_mask': [], 'KPS': []}
    for sentence in sentences:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
        tokens['KPS'].append(sentence)
    # reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    return tokens
# Process tags
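# Embed each comma-separated tag once and cache its mean-pooled vector in token_dict
# (tag -> embedding); these vectors are what the extracted key phrases are compared against.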
if tags:
    tags = [x.lower().strip() for x in tags.split(",")]
    tags_tokens = concat_tokens_tags(tags)
    tags_tokens.pop("KPS")
    with torch.no_grad():
        outputs_tags = model(**tags_tokens)
    pools_tags = pool_embeddings(outputs_tags, tags_tokens).detach().numpy()
    token_dict = {}
    for tag, embedding in zip(tags, pools_tags):
        token_dict[tag] = embedding
# Methods for processing the input text, extracting key phrases, and computing similarity to the tags
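# Same batching as concat_tokens_tags, except the input is a {phrase: {'weight': ...}} mapping
# and each phrase's weight is kept under 'KPS' for the weighted scoring below.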
def concat_tokens(sentences):
    tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}
    for sentence, values in sentences.items():
        weight = values['weight']
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
        tokens['KPS'][sentence] = weight
    # reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    return tokens
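# Score one extracted key phrase (row idx of the batch) against every tag embedding in kp_dict:
# cosine similarity scaled by the phrase's weight. The exclude_* flags optionally skip tags that
# contain the phrase itself or any of its words.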
def calculate_weighted_embed_dist(out, tokens, weight, text, kp_dict, idx, exclude_text=False, exclude_words=False):
    sim_dict = {}
    pools = pool_embeddings_count(out, tokens, idx).detach().numpy()
    for key in kp_dict.keys():
        if exclude_text and text in key:
            continue
        if exclude_words and True in [x in key for x in text.split(" ")]:
            continue
        sim_dict[key] = cosine_similarity(
            pools,
            [kp_dict[key]]
        )[0][0] * weight
    return sim_dict
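# Same mean pooling as pool_embeddings, but restricted to a single row (idx) of the batch.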
def pool_embeddings_count(out, tok, idx):
    embeddings = out["hidden_states"][-1][idx:idx+1, :, :]
    attention_mask = tok['attention_mask'][idx]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask
    return mean_pooled
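# Extract candidate key phrases from the text via return_ners_and_kp (from nlp_entities,
# presumably spaCy + PyTextRank), keep only the top_kp highest-weighted ones, and tokenize them.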
def extract_tokens(text, top_kp=30):
    kps = return_ners_and_kp([text], ret_ne=True)['KP']
    # only keep the top_kp highest-weighted key phrases
    kps = sorted(kps.items(), key=lambda x: x[1]['weight'], reverse=True)[:top_kp]
    kps = {x: y for x, y in kps}
    return concat_tokens(kps)
# Process the input text and score it against each tag
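# For every extracted key phrase, compute its weighted similarity to each tag and sum the
# scores across key phrases, then show the tags ranked from most to least similar.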
if text and tags:
    text = text.lower()
    t1_tokens = extract_tokens(text, topkp)
    t1_kps = t1_tokens.pop("KPS")
    with torch.no_grad():
        outputs = model(**t1_tokens)
    tag_distance = None
    for i, kp in enumerate(t1_kps):
        if tag_distance is None:
            tag_distance = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i,
                                                         exclude_text=False, exclude_words=False)
        else:
            curr = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i,
                                                 exclude_text=False, exclude_words=False)
            tag_distance = {x: tag_distance[x] + curr[x] for x in tag_distance.keys()}
    tag_distance = sorted(tag_distance.items(), key=lambda x: x[1], reverse=True)
    tag_distance = {x: y for x, y in tag_distance}
    st.json(tag_distance)