|
|
import gradio as gr |
|
|
import csv |
|
|
import os |
|
|
import numpy as np |
|
|
|
|
|
def load_gpu_data():
    """Read known GPU models and their throughput ratings from gpus.csv.

    The CSV (expected next to this file) must have ``gpu_model`` and
    ``sparce_tflops`` columns; underscores in model names are shown as
    spaces. Any failure (missing file, bad row) falls back to a single
    ``{"Custom": 0}`` entry so the UI still works.

    Returns:
        dict mapping GPU display name -> TFLOPs (float).
    """
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
    try:
        with open(csv_path, 'r') as handle:
            return {
                row['gpu_model'].replace('_', ' '): float(row['sparce_tflops'])
                for row in csv.DictReader(handle)
            }
    except Exception as e:
        print(f"Error loading GPU data: {e}")
        return {"Custom": 0}
|
|
|
|
|
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """Estimate wall-clock training time in hours.

    Uses the standard compute approximation:
        total FLOPs       = 6 * params * tokens
        sustained FLOPs/s = tflops_per_gpu * num_gpus * 1e12 * (MFU / 100)
        hours             = total FLOPs / sustained FLOPs/s / 3600

    Args:
        model_size_billions: Model size in billions of parameters.
        tflops_per_gpu: BF16 TFLOPs per GPU (effective, non-sparsity).
        num_gpus: Number of GPUs used.
        tokens_millions: Number of training tokens, in millions.
        mfu_percentage: Model FLOPs Utilization, as a percentage.

    Returns:
        Training time in hours.
    """
    params = model_size_billions * 1e9
    tokens = tokens_millions * 1e6
    total_flops = 6 * params * tokens
    # Sustained throughput after discounting by hardware utilization.
    sustained_flops_per_s = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)
    seconds = total_flops / sustained_flops_per_s
    return seconds / 3600
|
|
|
|
|
def format_output(hours):
    """Render a duration given in hours as a human-friendly string.

    Scales up to days past 24 hours and to (30-day) months past 30 days,
    keeping the smaller units in parentheses for reference.
    """
    if hours < 24:
        return f"{hours:.2f} hours"
    days = hours / 24
    if days < 30:
        return f"{days:.2f} days ({hours:.1f} hours)"
    months = days / 30
    return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
|
|
|
|
|
def slider_to_model_size(value):
    """Map a 0-100 slider position onto a log scale from 0.1B to 1000B.

    The slider moves linearly in log10 space, so equal slider steps
    correspond to equal multiplicative steps in model size.
    """
    lo, hi = np.log10(0.1), np.log10(1000)
    return 10 ** (lo + (hi - lo) * value / 100)
|
|
|
|
|
def model_size_to_slider(size_billions):
    """Inverse of slider_to_model_size: model size (billions) -> 0-100 position."""
    lo, hi = np.log10(0.1), np.log10(1000)
    return 100 * (np.log10(size_billions) - lo) / (hi - lo)
|
|
|
|
|
def format_model_size(size_billions):
    """Pretty-print a parameter count given in billions with an M/B/T suffix."""
    if size_billions < 1:
        return f"{size_billions * 1000:.0f}M"
    if size_billions < 1000:
        return f"{size_billions:.1f}B"
    return f"{size_billions / 1000:.1f}T"
|
|
|
|
|
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Recompute the training-time estimate and return a markdown breakdown.

    Normalizes the UI inputs (model size -> billions, tokens -> millions,
    GPU selection -> TFLOPs), runs calculate_training_time, and formats
    a markdown report showing each step of the calculation.
    """
    # Normalize model size to billions of parameters ("T" means trillions).
    model_size_billions = model_size_value if model_size_unit == "B" else model_size_value * 1000

    # Normalize token count to millions; unknown units are treated as "T".
    token_multipliers = {"M": 1, "B": 1000}
    tokens_millions = tokens_value * token_multipliers.get(tokens_unit, 1000000)

    # Resolve per-GPU throughput either from the CSV catalog or the slider.
    if use_gpu_model and gpu_model != "Custom":
        tflops_per_gpu = load_gpu_data().get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Intermediate quantities shown in the breakdown.
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""

    return breakdown
|
|
|
|
|
|
|
|
# Build the dropdown choices once at startup. "Custom" is always the first
# entry; filter it out of the CSV data so the fallback {"Custom": 0} does not
# produce a duplicate "Custom" choice (bug in the original concatenation).
gpu_data = load_gpu_data()
gpu_choices = ["Custom"] + [name for name in gpu_data if name != "Custom"]


with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )

            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )

            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )

            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )

        with gr.Column():
            output = gr.Markdown(label="Results")

    def toggle_gpu_input(use_gpu, selected_gpu):
        """Show the dropdown when using the catalog; show the slider otherwise.

        BUGFIX: the original read ``gpu_model.value``, which in Gradio is the
        component's *initial* value, not the live selection — so the slider's
        visibility went stale once the user changed the dropdown. The current
        selection is now passed in as an event input instead.
        """
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=not use_gpu or selected_gpu == "Custom"),
        )

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    def check_custom_selected(gpu_model_value):
        """Reveal the custom-TFLOPs slider when 'Custom' is picked in the dropdown."""
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Recompute the estimate whenever any input changes.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]

    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Populate the results panel on first page load.
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )


if __name__ == "__main__":
    demo.launch()