Morgan Funtowicz committed
Commit · a8540ed
1 Parent(s): 90c13c1

misc(config): add proper way to detect if cpu may support bfloat16

handler.py +16 -2
handler.py CHANGED
@@ -11,9 +11,23 @@ from torch.backends.mkldnn import VERBOSE_ON_CREATION, VERBOSE_OFF
 from sentence_transformers import SentenceTransformer
 
 # Not used for now
-ENABLE_QUANTIZATION = bool(os.environ.get("HFENDPOINT_ENABLE_QUANTIZATION", "0"))
 SUPPORTED_AMP_DTYPES = {torch.float32, torch.bfloat16}
 
+
+def has_bf16_support() -> bool:
+    """
+    Helper to detect whether the hardware supports bfloat16.
+
+    Note:
+        Intel libraries, such as oneDNN, provide emulation for bfloat16 even if the underlying hardware does not support it.
+        This means a CPU with AVX512 will work, though not with the same performance one could expect from a CPU with AVX512_BF16.
+        Also, AMX_BF16 is implicitly assumed true when AVX512_BF16 is true (that's the case on Intel Sapphire Rapids).
+
+    :return: True if the hardware supports (or can emulate) bfloat16, False otherwise
+    """
+    return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()
+
+
 def get_usage(tokens: Union[Sized, Sequence[Sized]], is_batched: bool) -> Usage:
     """
     Compute the number of processed tokens and return as Usage object matching OpenAI
@@ -39,7 +53,7 @@ class SentenceTransformerHandler(Handler):
         self._allocate_model()
 
     def _allocate_model(self):
-        dtype = torch.bfloat16 if
+        dtype = torch.bfloat16 if has_bf16_support() else torch.float32
         model = SentenceTransformer(self._config.model_id, device="cpu", model_kwargs={"torch_dtype": dtype})
 
         if platform.machine() == "x86_64":
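
A note on the detection itself: has_bf16_support() relies on two private torch.cpu helpers, and it deliberately returns True on plain-AVX512 machines because oneDNN can emulate bfloat16 there. To sanity-check what a given host actually advertises, one can read the kernel's CPU feature flags directly. The following is a minimal Linux-only sketch, not part of the commit; the flag names avx512f, avx512_bf16 and amx_bf16 follow the Linux kernel's x86 feature naming:

# Rough cross-check of has_bf16_support() against /proc/cpuinfo (Linux only).
# Not part of the commit; flag names follow the kernel's x86 feature naming.
from pathlib import Path

def cpuinfo_flags() -> set:
    """Return the CPU feature flags advertised by the Linux kernel."""
    for line in Path("/proc/cpuinfo").read_text().splitlines():
        if line.startswith("flags"):
            return set(line.split(":", 1)[1].split())
    return set()

flags = cpuinfo_flags()
print("native bf16 (AVX512_BF16):", "avx512_bf16" in flags)
print("AMX bf16 (AMX_BF16):", "amx_bf16" in flags)
print("emulation path (AVX512F):", "avx512f" in flags)  # oneDNN can emulate bf16 here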
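
End to end, the change means _allocate_model() now upgrades the model to bfloat16 automatically on capable CPUs and falls back to float32 elsewhere. Below is a standalone sketch of the same selection, assuming sentence-transformers is installed and a PyTorch build recent enough to expose the private torch.cpu helpers; the model id is a placeholder, not the endpoint's configured model:

import torch
from sentence_transformers import SentenceTransformer

# Same check as the commit; these torch.cpu helpers are private and may move.
def has_bf16_support() -> bool:
    return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()

dtype = torch.bfloat16 if has_bf16_support() else torch.float32
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",  # placeholder model id
    device="cpu",
    model_kwargs={"torch_dtype": dtype},  # forwarded to from_pretrained
)
print(dtype, model.encode("hello world").shape)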