---
library_name: transformers
tags: []
---
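
A minimal usage example follows. `trust_remote_code=True` is required, since the recurrent architecture is implemented in custom modeling code that ships with the checkpoint rather than in the `transformers` library itself.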
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code=True is required to load the custom modeling code
model = AutoModelForCausalLM.from_pretrained("tomg-group-umd/step-00047360-recurrence_full_512_0", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/step-00047360-recurrence_full_512_0")

device = torch.device("cuda:0")

# encode the prompt, dropping the last token of the encoding
input_ids = tokenizer.encode("The capital of Westphalia is", return_tensors="pt", add_special_tokens=True).to(device)[:, :-1]
model.eval()
model.to(device)

# plain forward pass in full precision
model(input_ids)
# or, more efficiently, run the forward pass under bfloat16 autocast
amp_settings = {"device_type": "cuda", "enabled": True, "dtype": torch.bfloat16}
if not amp_settings["enabled"]:
    # without mixed precision, enable the math attention kernel,
    # which supports full precision
    torch.backends.cuda.enable_math_sdp(True)

with torch.autocast(**amp_settings), torch.no_grad():
    model(input_ids=input_ids)

###### Caching:
# first step:
past_key_values = None
outputs = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
past_key_values = outputs.past_key_values
# next step: feed the cache back in
outputs = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
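
# A minimal sketch of incremental greedy decoding with the cache. Assumption:
# the custom cache follows the usual transformers convention of passing only
# the newest token once a cache exists.
generated = input_ids
past_key_values = None
with torch.autocast(**amp_settings), torch.no_grad():
    for _ in range(20):
        # full prompt on the first step, then one new token at a time
        step_input = generated if past_key_values is None else generated[:, -1:]
        outputs = model(input_ids=step_input, use_cache=True, past_key_values=past_key_values)
        past_key_values = outputs.past_key_values
        # greedy pick of the next token from the last position's logits
        next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=-1)
print(tokenizer.decode(generated[0]))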

######## Generate!
with torch.autocast(**amp_settings), torch.no_grad():
    # num_steps sets the number of recurrent steps used at inference time
    output_ids = model.generate(input_ids, max_new_tokens=20, use_cache=True, num_steps=32)

print(tokenizer.decode(output_ids[0]))

# generation works with or without the cache
with torch.autocast(**amp_settings), torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=20, use_cache=False, num_steps=32)

print(tokenizer.decode(output_ids[0]))

# Both are supposed to print:
# <|begin_text|>The capital of Westphalia is the city of Münster. The city is located in the north of the state and is
```
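
Note that `num_steps` is an argument of this checkpoint's custom modeling code, not of the standard `generate()` API; it sets how many recurrent steps the model unrolls per forward pass, so higher values spend more compute at inference time.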