| | from typing import Dict, List, Any |
| | from transformers import ( |
| | AutoModelForCausalLM, |
| | AutoTokenizer) |
| | import torch |
| |
|
# Module-level warm-up load of the published checkpoint.
# NOTE(review): this `model` global is not referenced by EndpointHandler
# below (the handler loads its own copy from `path` in __init__), so
# importing this module loads the weights twice — confirm whether this is
# intentional (e.g. image-build caching) or dead code that can be removed.
model = AutoModelForCausalLM.from_pretrained(
    "sjster/test_medium",
    trust_remote_code=True,
    quantization_config=None,
    torch_dtype=torch.float,
    device_map="auto",
)
| |
|
class EndpointHandler():
    """Inference-endpoint wrapper around a causal-LM chat model.

    Loads the model and tokenizer from ``path`` and serves single-turn
    chat completions via ``__call__``.
    """

    def __init__(self, path=""):
        # device_map="auto" lets accelerate place the weights across the
        # available devices; inputs are later moved to self.model.device
        # instead of a hard-coded GPU (see __call__).
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            trust_remote_code=True,
            quantization_config=None,
            torch_dtype=torch.float,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        # Left padding is required for correct generation with decoder-only
        # models; reuse EOS as the pad token since the checkpoint sets none.
        self.tokenizer.padding_side = "left"
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_eos_token = True

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`): the user prompt to complete.
        Return:
            A :obj:`list` with one dict: ``[{"outputs": <generated text>}]``.
        Raises:
            ValueError: if ``data`` carries no usable "inputs" string.
        """
        # Fix: the original fell back to the whole dict when "inputs" was
        # missing and then crashed with an opaque TypeError on string
        # concatenation; fail explicitly instead.
        inputs = data.pop("inputs", None)
        if not isinstance(inputs, str):
            raise ValueError('payload must contain an "inputs" string')

        messages = [
            {
                "role": "user",
                "content": inputs,
            },
        ]
        encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
        encoded_length = len(encodeds[0])

        # Fix: move inputs to the model's own device rather than a
        # hard-coded 'cuda' — the model is loaded with device_map="auto",
        # and the original crashed on CPU-only hosts.
        model_inputs = encodeds.to(self.model.device)

        # Dropped output_scores / output_attentions / output_hidden_states:
        # they were requested but never consumed, wasting memory per step.
        # Dropped early_stopping too — it has no effect without beam search.
        result = self.model.generate(
            model_inputs,
            do_sample=False,
            return_dict_in_generate=True,
            max_new_tokens=400,
        )

        # Strip the prompt tokens so only the newly generated text remains.
        generated = result.sequences[:, encoded_length:]
        decoded = self.tokenizer.batch_decode(generated)
        return [{"outputs": decoded[0]}]
| |
|