|
|
import gradio as gr |
|
|
import csv |
|
|
import os |
|
|
import numpy as np |
|
|
|
|
|
def load_gpu_data():
    """Read known GPU models and their throughput ratings from gpus.csv.

    The CSV (expected next to this file) must have ``gpu_model`` and
    ``sparce_tflops`` columns; underscores in model names are shown as
    spaces. Any failure (missing file, bad row) falls back to a single
    ``{"Custom": 0}`` entry so the UI still works.

    Returns:
        dict mapping GPU display name -> TFLOPs (float).
    """
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
    try:
        with open(csv_path, 'r') as handle:
            return {
                row['gpu_model'].replace('_', ' '): float(row['sparce_tflops'])
                for row in csv.DictReader(handle)
            }
    except Exception as e:
        print(f"Error loading GPU data: {e}")
        return {"Custom": 0}
|
|
|
|
|
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """Estimate wall-clock training time in hours.

    Uses the standard compute approximation:
        total FLOPs       = 6 * params * tokens
        sustained FLOPs/s = tflops_per_gpu * num_gpus * 1e12 * (MFU / 100)
        hours             = total FLOPs / sustained FLOPs/s / 3600

    Args:
        model_size_billions: Model size in billions of parameters.
        tflops_per_gpu: BF16 TFLOPs per GPU (effective, non-sparsity).
        num_gpus: Number of GPUs used.
        tokens_millions: Number of training tokens, in millions.
        mfu_percentage: Model FLOPs Utilization, as a percentage.

    Returns:
        Training time in hours.
    """
    params = model_size_billions * 1e9
    tokens = tokens_millions * 1e6
    total_flops = 6 * params * tokens
    # Sustained throughput after discounting by hardware utilization.
    sustained_flops_per_s = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)
    seconds = total_flops / sustained_flops_per_s
    return seconds / 3600
|
|
|
|
|
def format_output(hours):
    """Render a duration given in hours as a human-friendly string.

    Scales up to days past 24 hours and to (30-day) months past 30 days,
    keeping the smaller units in parentheses for reference.
    """
    if hours < 24:
        return f"{hours:.2f} hours"
    days = hours / 24
    if days < 30:
        return f"{days:.2f} days ({hours:.1f} hours)"
    months = days / 30
    return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
|
|
|
|
|
def slider_to_model_size(value):
    """Map a 0-100 slider position onto a log scale from 0.1B to 1000B.

    The slider moves linearly in log10 space, so equal slider steps
    correspond to equal multiplicative steps in model size.
    """
    lo, hi = np.log10(0.1), np.log10(1000)
    return 10 ** (lo + (hi - lo) * value / 100)
|
|
|
|
|
def model_size_to_slider(size_billions):
    """Inverse of slider_to_model_size: model size (billions) -> 0-100 position."""
    lo, hi = np.log10(0.1), np.log10(1000)
    return 100 * (np.log10(size_billions) - lo) / (hi - lo)
|
|
|
|
|
def format_model_size(size_billions):
    """Pretty-print a parameter count given in billions with an M/B/T suffix."""
    if size_billions < 1:
        return f"{size_billions * 1000:.0f}M"
    if size_billions < 1000:
        return f"{size_billions:.1f}B"
    return f"{size_billions / 1000:.1f}T"
|
|
|
|
|
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Recompute the training-time estimate and return a markdown breakdown.

    Normalizes the UI inputs (model size -> billions, tokens -> millions,
    GPU selection -> TFLOPs), runs calculate_training_time, and formats
    a markdown report showing each step of the calculation.
    """
    # Normalize model size to billions of parameters ("T" means trillions).
    model_size_billions = model_size_value if model_size_unit == "B" else model_size_value * 1000

    # Normalize token count to millions; unknown units are treated as "T".
    token_multipliers = {"M": 1, "B": 1000}
    tokens_millions = tokens_value * token_multipliers.get(tokens_unit, 1000000)

    # Resolve per-GPU throughput either from the CSV catalog or the slider.
    if use_gpu_model and gpu_model != "Custom":
        tflops_per_gpu = load_gpu_data().get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Intermediate quantities shown in the breakdown.
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""

    return breakdown
|
|
|
|
|
|
|
|
# Build the dropdown choices once at startup. "Custom" is always the first
# entry; filter it out of the CSV data so the fallback {"Custom": 0} does not
# produce a duplicate "Custom" choice (bug in the original concatenation).
gpu_data = load_gpu_data()
gpu_choices = ["Custom"] + [name for name in gpu_data if name != "Custom"]


with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )

            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )

            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )

            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )

        with gr.Column():
            output = gr.Markdown(label="Results")

    def toggle_gpu_input(use_gpu, selected_gpu):
        """Show the dropdown when using the catalog; show the slider otherwise.

        BUGFIX: the original read ``gpu_model.value``, which in Gradio is the
        component's *initial* value, not the live selection — so the slider's
        visibility went stale once the user changed the dropdown. The current
        selection is now passed in as an event input instead.
        """
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=not use_gpu or selected_gpu == "Custom"),
        )

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    def check_custom_selected(gpu_model_value):
        """Reveal the custom-TFLOPs slider when 'Custom' is picked in the dropdown."""
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Recompute the estimate whenever any input changes.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]

    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Populate the results panel on first page load.
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )


if __name__ == "__main__":
    demo.launch()