Morgan Funtowicz committed
Commit · a8540ed
1 Parent(s): 90c13c1

misc(config): add proper way to detect if cpu may support bfloat16

handler.py +16 -2
handler.py CHANGED
@@ -11,9 +11,23 @@ from torch.backends.mkldnn import VERBOSE_ON_CREATION, VERBOSE_OFF
 from sentence_transformers import SentenceTransformer
 
 # Not used for now
-ENABLE_QUANTIZATION = bool(os.environ.get("HFENDPOINT_ENABLE_QUANTIZATION", "0"))
 SUPPORTED_AMP_DTYPES = {torch.float32, torch.bfloat16}
 
+
+def has_bf16_support() -> bool:
+    """
+    Helper to detect whether the hardware supports bfloat16.
+
+    Note:
+        Intel libraries, such as oneDNN, provide emulation for bfloat16 even if the underlying hardware does not support it.
+        This means a CPU with AVX512 will work, though not with the same performance one could expect from a CPU with AVX512_BF16.
+        Also, AMX_BF16 is implicitly assumed true when AVX512_BF16 is true (that's the case on Intel Sapphire Rapids).
+
+    :return: True if the hardware supports (or can emulate) bfloat16, False otherwise
+    """
+    return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()
+
+
 def get_usage(tokens: Union[Sized, Sequence[Sized]], is_batched: bool) -> Usage:
     """
     Compute the number of processed tokens and return as Usage object matching OpenAI
@@ -39,7 +53,7 @@ class SentenceTransformerHandler(Handler):
         self._allocate_model()
 
     def _allocate_model(self):
-        dtype = torch.bfloat16 if
+        dtype = torch.bfloat16 if has_bf16_support() else torch.float32
         model = SentenceTransformer(self._config.model_id, device="cpu", model_kwargs={"torch_dtype": dtype})
 
         if platform.machine() == "x86_64":
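
A note on the detection itself: has_bf16_support() relies on two private torch.cpu helpers, and it deliberately returns True on plain-AVX512 machines because oneDNN can emulate bfloat16 there. To sanity-check what a given host actually advertises, one can read the kernel's CPU feature flags directly. The following is a minimal Linux-only sketch, not part of the commit; the flag names avx512f, avx512_bf16 and amx_bf16 follow the Linux kernel's x86 feature naming:

# Rough cross-check of has_bf16_support() against /proc/cpuinfo (Linux only).
# Not part of the commit; flag names follow the kernel's x86 feature naming.
from pathlib import Path

def cpuinfo_flags() -> set:
    """Return the CPU feature flags advertised by the Linux kernel."""
    for line in Path("/proc/cpuinfo").read_text().splitlines():
        if line.startswith("flags"):
            return set(line.split(":", 1)[1].split())
    return set()

flags = cpuinfo_flags()
print("native bf16 (AVX512_BF16):", "avx512_bf16" in flags)
print("AMX bf16 (AMX_BF16):", "amx_bf16" in flags)
print("emulation path (AVX512F):", "avx512f" in flags)  # oneDNN can emulate bf16 here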
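
End to end, the change means _allocate_model() now upgrades the model to bfloat16 automatically on capable CPUs and falls back to float32 elsewhere. Below is a standalone sketch of the same selection, assuming sentence-transformers is installed and a PyTorch build recent enough to expose the private torch.cpu helpers; the model id is a placeholder, not the endpoint's configured model:

import torch
from sentence_transformers import SentenceTransformer

# Same check as the commit; these torch.cpu helpers are private and may move.
def has_bf16_support() -> bool:
    return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()

dtype = torch.bfloat16 if has_bf16_support() else torch.float32
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",  # placeholder model id
    device="cpu",
    model_kwargs={"torch_dtype": dtype},  # forwarded to from_pretrained
)
print(dtype, model.encode("hello world").shape)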