| | import os |
| |
|
| |
|
| | |
| | from builtin_architecture import make_model, make_model_custom |
| | from dataset import fromDataset, get_dataloader, TextCorpusDataset |
| | import torch |
| | from tqdm import tqdm, trange |
| | from logger import init_logger, flush |
| | import torchvision |
| | from trainingmanager import TrainingManager |
| | import torch.nn as nn |
| |
|
| |
|
| | def train_model( |
| | experiment_directory, |
| | trainset, |
| | testset, |
| | epochs, |
| | additional_epochs, |
| | model_params=None, |
| | schedule=False, |
| | **kwargs, |
| | ): |
| | os.system(f"caffeinate -is -w {os.getpid()} &") |
| |
|
| | if model_params is None: |
| | model_params = {} |
| |
|
| | device = "mps" if torch.backends.mps.is_available() else "cpu" |
| |
|
| | dataloader = get_dataloader(trainset) |
| |
|
| | testloader = get_dataloader(testset) |
| | if model_params == {}: |
| | net = make_model() |
| | else: |
| | net = make_model_custom(**model_params) |
| | net.to(device) |
| |
|
| | trainer = TrainingManager( |
| | net=net, |
| | dir=experiment_directory, |
| | dataloader=dataloader, |
| | device=device, |
| | trainstep_checkin_interval=100, |
| | epochs=epochs, |
| | val_dataloader=testloader, |
| | ) |
| |
|
| | |
| |
|
| | for batch, attn_mask in dataloader: |
| | init_logger( |
| | net, |
| | dir=os.path.join(experiment_directory, "tensorboard"), |
| | ) |
| | break |
| | if schedule: |
| | trainer.train_curriculum(**kwargs) |
| | else: |
| | trainer.train() |
| |
|
| | if additional_epochs > 0: |
| | print(f"Running additional {additional_epochs} epochs") |
| | additional_trainer = TrainingManager( |
| | net=net, |
| | dir=experiment_directory, |
| | dataloader=dataloader, |
| | device=device, |
| | trainstep_checkin_interval=100, |
| | epochs=epochs + additional_epochs, |
| | val_dataloader=testloader, |
| | ) |
| | additional_trainer.train() |
| | |
| | flush() |
| |
|
| | os.system("bash safe_cleanup.sh") |
| |
|
| |
|
| | def run_experiment(experiment_directory, epochs, additional_epochs, trainset, testset, del_runs, **kwargs): |
| | train_model(experiment_directory, trainset, testset, epochs, additional_epochs, schedule=True, **kwargs) |
| | if del_runs: |
| | os.system(f"rm -r {experiment_directory}/ckpt/*.pt") |
| |
|
| |
|
| | if __name__ == "__main__": |
| | del_runs = False |
| | if del_runs: |
| | del_runs = ( |
| | del_runs and input("Confirm that this will delete checkpoints: ") == "y" |
| | ) |
| | if not del_runs: |
| | print("Exiting") |
| | exit() |
| |
|
| | parent_directory = "runs/code-decoder-v31-mega-licensed-1" |
| |
|
| | Curriculum = TrainingManager.get_curriculum_enum() |
| |
|
| | experiments = [ |
| | ( |
| | "curriculum-noloss", |
| | {"curriculum_type": Curriculum.CURRICULUM, "loss_based": False}, |
| | ), |
| | ( |
| | "curriculum-loss", |
| | {"curriculum_type": Curriculum.CURRICULUM, "loss_based": True}, |
| | ), |
| | ("noop", {"curriculum_type": Curriculum.NOOP, "loss_based": False}), |
| | |
| | ( |
| | "anticurriculum", |
| | {"curriculum_type": Curriculum.ANTICURRICULUM, "loss_based": False}, |
| | ), |
| | ( |
| | "anticurriculum-loss", |
| | {"curriculum_type": Curriculum.ANTICURRICULUM, "loss_based": True}, |
| | ), |
| | |
| | ("hybrid", {"curriculum_type": Curriculum.HYBRID, "loss_based": False}), |
| | ("hybrid-loss", {"curriculum_type": Curriculum.HYBRID, "loss_based": True}), |
| |
|
| | ("sequential", {"curriculum_type": Curriculum.SEQUENTIAL, "loss_based": False}), |
| | ( |
| | "sequential-loss", |
| | {"curriculum_type": Curriculum.SEQUENTIAL, "loss_based": True}, |
| | ), |
| | ] |
| |
|
| | EPOCHS = 10 |
| | ADDITIONAL_EPOCHS = 20 |
| | trainset, testset = fromDataset( |
| | TextCorpusDataset( |
| | root_dir=os.path.expanduser( |
| | "~/torch_datasets/github-python/mega_licensed_corpus" |
| | ), |
| | vocab_size=33819, |
| | IS_CODE=True, |
| | IS_CUSTOM=True, |
| | max_length=256, |
| | sliding_window=False, |
| | stride=10, |
| | get_rarity_score=True, |
| | get_entropy_score=False |
| | ) |
| | ) |
| | |
| |
|
| | for experiment_name, params in experiments: |
| | experiment_directory = os.path.join(parent_directory, experiment_name) |
| | |
| | print(f"Running experiment: {experiment_name}") |
| | print(f"Params: {params}") |
| | |
| | |
| |
|
| | run_experiment( |
| | experiment_directory, |
| | EPOCHS, |
| | ADDITIONAL_EPOCHS, |
| | trainset, |
| | testset, |
| | del_runs, |
| | **params, |
| | ) |
| |
|
| | |
| | import gc |
| | gc.collect() |
| | for obj in gc.get_objects(): |
| | try: |
| | if torch.is_tensor(obj): |
| | del obj |
| | except: |
| | pass |
| | if torch.backends.mps.is_available(): |
| | torch._C._mps_emptyCache() |
| | torch.mps.empty_cache() |
| |
|
| |
|