Spaces:
Runtime error
Runtime error
Commit
·
50f1efd
1
Parent(s):
06a685c
perf(model): enable 8-bit quantization and explicit CUDA device targeting
Browse files
Enable 8-bit model loading to reduce memory usage and specify the CUDA device type
for automatic mixed precision operations to improve GPU performance.
- app.py +1 -1
- llava/model/qlinear_te.py +2 -2
app.py
CHANGED
|
@@ -13,7 +13,7 @@ import copy
|
|
| 13 |
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
|
| 14 |
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
|
| 15 |
|
| 16 |
-
model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
|
| 17 |
model_single_copy = copy.deepcopy(model_single)
|
| 18 |
|
| 19 |
# Move the model to GPU
|
|
|
|
| 13 |
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
|
| 14 |
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
|
| 15 |
|
| 16 |
+
model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, load_8bit=True)
|
| 17 |
model_single_copy = copy.deepcopy(model_single)
|
| 18 |
|
| 19 |
# Move the model to GPU
|
llava/model/qlinear_te.py
CHANGED
|
@@ -98,7 +98,7 @@ class QLinearTE(nn.Linear):
|
|
| 98 |
|
| 99 |
class QuantLinearTE(Function):
|
| 100 |
@staticmethod
|
| 101 |
-
@amp.custom_fwd(cast_inputs=torch.bfloat16)
|
| 102 |
def forward(ctx, input, weight, bias, args, layer_name):
|
| 103 |
|
| 104 |
time_bench = os.getenv("TIME_BENCH")
|
|
@@ -149,7 +149,7 @@ class QuantLinearTE(Function):
|
|
| 149 |
return fc_output
|
| 150 |
|
| 151 |
@staticmethod
|
| 152 |
-
@amp.custom_bwd
|
| 153 |
def backward(ctx, grad_output):
|
| 154 |
Qinput_t, Iscale, Qweight_t, Wscale, bias, args, layer_name = ctx.saved_tensors
|
| 155 |
|
|
|
|
| 98 |
|
| 99 |
class QuantLinearTE(Function):
|
| 100 |
@staticmethod
|
| 101 |
+
@amp.custom_fwd(cast_inputs=torch.bfloat16, device_type='cuda')
|
| 102 |
def forward(ctx, input, weight, bias, args, layer_name):
|
| 103 |
|
| 104 |
time_bench = os.getenv("TIME_BENCH")
|
|
|
|
| 149 |
return fc_output
|
| 150 |
|
| 151 |
@staticmethod
|
| 152 |
+
@amp.custom_bwd(device_type='cuda')
|
| 153 |
def backward(ctx, grad_output):
|
| 154 |
Qinput_t, Iscale, Qweight_t, Wscale, bias, args, layer_name = ctx.saved_tensors
|
| 155 |
|