david-thrower committed on
Commit
1ceaf3e
·
verified ·
1 Parent(s): 1aaff9f

Update app.py

Browse files

Try ONNX Runtime with 8-bit quantization.

Files changed (1) hide show
  1. app.py +20 -6
app.py CHANGED
@@ -1,13 +1,27 @@
1
- import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
 
5
- MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
6
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
7
 
8
  print("Loading tokenizer & model…")
 
 
 
 
 
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
10
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
 
 
 
11
 
12
  # -------------------------------------------------
13
  # Optional tool(s)
 
1
+ # import gradio as gr
2
+ # import torch
3
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
4
 
5
+ # MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
6
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
+ # print("Loading tokenizer & model…")
9
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
10
+ # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
11
+
12
+ #########
13
 
14
  print("Loading tokenizer & model…")
15
+ import gradio as gr
16
+ from transformers import AutoTokenizer
17
+ from optimum.onnxruntime import ORTModelForCausalLM
18
+
19
+ MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
21
+ model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
22
+
23
+ #########
24
+
25
 
26
  # -------------------------------------------------
27
  # Optional tool(s)