Spaces:

rahul7star
/

Train-Lora

Running

App Files Files Community

rahul7star committed on Nov 9

Commit

dba7fbf

verified ·

1 Parent(s): 17cde10

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -55

app.py CHANGED Viewed

@@ -3,26 +3,24 @@
 Universal Dynamic LoRA Trainer (Accelerate + PEFT + Gradio)
 - Gemma LLM default
 - Auto LoRA target modules
-- CSV/Parquet datasets
-- Dropdowns for short/long prompt columns, batch size, num_workers
-- Live logs (tokenization, forward/backward, step loss)
-- Live progress bar
 """
-import os, torch, gradio as gr, pandas as pd, numpy as np
 from pathlib import Path
-from tqdm.auto import tqdm
-from huggingface_hub import create_repo, upload_folder, hf_hub_download
 from torch.utils.data import Dataset, DataLoader
 from peft import LoraConfig, get_peft_model
 from accelerate import Accelerator
 import torch.nn as nn
-# Optional LLM support
 try:
     from transformers import AutoTokenizer, AutoModelForCausalLM
     TRANSFORMERS_AVAILABLE = True
-except Exception:
     TRANSFORMERS_AVAILABLE = False
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -55,8 +53,9 @@ class MediaTextDataset(Dataset):
         self.text_columns = text_columns or ["short_prompt", "long_prompt"]
         print(f"[DEBUG] Loaded dataset: {file_path}, columns: {list(self.df.columns)}")
-        print(f"[DEBUG] Sample rows:\n{self.df.head(3)}")
     def __len__(self):
         return len(self.df)
@@ -66,31 +65,30 @@ class MediaTextDataset(Dataset):
         text_data = {col: rec[col] if col in rec else "" for col in self.text_columns}
         return {"text": text_data}
-# ---------------- Dynamic pipeline loader ----------------
 def load_pipeline_auto(base_model, dtype=torch.float16):
-    low = base_model.lower()
-    if "gemma" in low:
         if not TRANSFORMERS_AVAILABLE:
-            raise RuntimeError("Transformers not installed for LLM support.")
         print(f"[INFO] Using Gemma LLM for {base_model}")
         tokenizer = AutoTokenizer.from_pretrained(base_model)
         model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=dtype)
         return {"model": model, "tokenizer": tokenizer}
     else:
-        raise NotImplementedError("Only Gemma LLM is implemented for LoRA training in this version.")
-def find_target_modules(model, model_name=None):
     candidates = ["q_proj", "k_proj", "v_proj", "out_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
     names = [n for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
     targets = [n.split(".")[-1] for n in names if n.split(".")[-1] in candidates]
     if not targets:
         targets = [n.split(".")[-1] for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
-        print(f"[WARNING] No standard attention modules found in {model_name}, using all Linear layers for LoRA")
     else:
         print(f"[INFO] LoRA target modules detected: {targets}")
     return targets
-# ---------------- Training (generator) ----------------
 def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
                       epochs=1, lr=1e-4, r=8, alpha=16, batch_size=1, num_workers=0,
                       max_train_records=None):
@@ -98,17 +96,13 @@ def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
     pipe = load_pipeline_auto(base_model)
     model_obj = pipe["model"]
     tokenizer = pipe["tokenizer"]
-    target_modules = find_target_modules(model_obj, base_model)
     lcfg = LoraConfig(r=r, lora_alpha=alpha, target_modules=target_modules, lora_dropout=0.0)
     lora_module = get_peft_model(model_obj, lcfg)
     dataset = MediaTextDataset(dataset_src, csv_name, text_columns=text_cols, max_records=max_train_records)
     loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
-    # Prepare with accelerator
-    lora_module, opt, loader = accelerator.prepare(
-        lora_module, torch.optim.AdamW(lora_module.parameters(), lr=lr), loader
-    )
     total_steps = epochs * len(loader)
     step_counter = 0
@@ -117,7 +111,7 @@ def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
     yield "[DEBUG] Starting training loop...\n", 0.0
     for ep in range(epochs):
-        yield f"[DEBUG] Epoch {ep+1}/{epochs}\n", step_counter/total_steps
         for i, batch in enumerate(loader):
             ex = batch[0]
             texts = ex["text"]
@@ -125,7 +119,7 @@ def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
             # Tokenization
             tokens = tokenizer([texts.get("short_prompt",""), texts.get("long_prompt","")],
                                padding=True, truncation=True, return_tensors="pt").to(DEVICE)
-            logs.append(f"[DEBUG] Step {step_counter}, tokens input_ids shape: {tokens['input_ids'].shape}")
             # Forward pass
             outputs = lora_module(**tokens)
@@ -138,17 +132,17 @@ def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
             opt.zero_grad()
             step_counter += 1
-            # Yield last 10 logs + progress
-            yield "\n".join(logs[-10:]), step_counter/total_steps
     Path(output_dir).mkdir(exist_ok=True)
     lora_module.save_pretrained(output_dir)
     yield f"[INFO] LoRA saved to {output_dir}\n", 1.0
-# ---------------- Upload ----------------
 def upload_adapter(local, repo_id):
-    token=os.environ.get("HF_TOKEN")
-    if not token: raise RuntimeError("HF_TOKEN missing")
     create_repo(repo_id, exist_ok=True)
     upload_folder(local, repo_id=repo_id, repo_type="model", token=token)
     return f"https://huggingface.co/{repo_id}"
@@ -159,43 +153,46 @@ def run_ui():
         gr.Markdown("# 🌐 Universal Dynamic LoRA Trainer (Gemma LLM)")
         with gr.Row():
-            base_model=gr.Textbox(label="Base model", value="google/gemma-3-4b-it")
-            dataset=gr.Textbox(label="Dataset folder or HF repo", value="rahul7star/prompt-enhancer-dataset-01")
-            csvname=gr.Textbox(label="CSV/Parquet file", value="train.csv")
-            short_col=gr.Textbox(label="Short prompt column", value="short_prompt")
-            long_col=gr.Textbox(label="Long prompt column", value="long_prompt")
-            out=gr.Textbox(label="Output dir", value="./adapter_out")
-            repo=gr.Textbox(label="Upload HF repo (optional)", value="rahul7star/gemma-3-270m-ccebc0")
         with gr.Row():
             batch_size = gr.Number(value=1, label="Batch size")
             num_workers = gr.Number(value=0, label="DataLoader num_workers")
-            r=gr.Slider(1,64,value=8,label="LoRA rank")
-            a=gr.Slider(1,64,value=16,label="LoRA alpha")
-            ep=gr.Number(value=1,label="Epochs")
-            lr=gr.Number(value=1e-4,label="Learning rate")
             max_records = gr.Number(value=1000, label="Max training records")
-            btn=gr.Button("🚀 Start Training")
-        logs=gr.Textbox(label="Logs", lines=20)
-        progress = gr.Progress()
-        def launch(bm,ds,csv,sc,lc,out_dir,batch,num_w,r_,a_,ep_,lr_,max_rec,repo_):
-            # Stream logs from generator
             for log_text, prog in train_lora_stream(
                 bm, ds, csv, [sc, lc], out_dir,
                 int(ep_), float(lr_), int(r_), int(a_),
                 int(batch), int(num_w), max_train_records=int(max_rec)
             ):
-                yield log_text, prog
-            # Upload if repo provided
             if repo_:
                 link = upload_adapter(out_dir, repo_)
-                yield f"[INFO] Uploaded to {link}", 1.0
-        btn.click(launch,
-                  [base_model,dataset,csvname,short_col,long_col,out,batch_size,num_workers,r,a,ep,lr,max_records,repo],
-                  [logs, progress])
     return demo
-if __name__=="__main__":
     run_ui().launch(server_name="0.0.0.0", server_port=7860, share=True)

 Universal Dynamic LoRA Trainer (Accelerate + PEFT + Gradio)
 - Gemma LLM default
 - Auto LoRA target modules
+- CSV/Parquet support
+- Live logs and progress
 """
+import os, torch, gradio as gr, pandas as pd
 from pathlib import Path
 from torch.utils.data import Dataset, DataLoader
+from tqdm.auto import tqdm
 from peft import LoraConfig, get_peft_model
 from accelerate import Accelerator
 import torch.nn as nn
+from huggingface_hub import create_repo, upload_folder, hf_hub_download
+# Transformers support
 try:
     from transformers import AutoTokenizer, AutoModelForCausalLM
     TRANSFORMERS_AVAILABLE = True
+except:
     TRANSFORMERS_AVAILABLE = False
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
         self.text_columns = text_columns or ["short_prompt", "long_prompt"]
+        # Debug prints
         print(f"[DEBUG] Loaded dataset: {file_path}, columns: {list(self.df.columns)}")
+        print(f"[DEBUG] Sample row:\n{self.df.head(3)}")
     def __len__(self):
         return len(self.df)
         text_data = {col: rec[col] if col in rec else "" for col in self.text_columns}
         return {"text": text_data}
+# ---------------- Model Loader ----------------
 def load_pipeline_auto(base_model, dtype=torch.float16):
+    if "gemma" in base_model.lower():
         if not TRANSFORMERS_AVAILABLE:
+            raise RuntimeError("Transformers not installed")
         print(f"[INFO] Using Gemma LLM for {base_model}")
         tokenizer = AutoTokenizer.from_pretrained(base_model)
         model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=dtype)
         return {"model": model, "tokenizer": tokenizer}
     else:
+        raise NotImplementedError("Only Gemma LLM supported currently")
+def find_target_modules(model):
     candidates = ["q_proj", "k_proj", "v_proj", "out_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
     names = [n for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
     targets = [n.split(".")[-1] for n in names if n.split(".")[-1] in candidates]
     if not targets:
         targets = [n.split(".")[-1] for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
+        print("[WARNING] No standard attention modules found, using all Linear layers")
     else:
         print(f"[INFO] LoRA target modules detected: {targets}")
     return targets
+# ---------------- Training generator ----------------
 def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
                       epochs=1, lr=1e-4, r=8, alpha=16, batch_size=1, num_workers=0,
                       max_train_records=None):
     pipe = load_pipeline_auto(base_model)
     model_obj = pipe["model"]
     tokenizer = pipe["tokenizer"]
+    target_modules = find_target_modules(model_obj)
     lcfg = LoraConfig(r=r, lora_alpha=alpha, target_modules=target_modules, lora_dropout=0.0)
     lora_module = get_peft_model(model_obj, lcfg)
     dataset = MediaTextDataset(dataset_src, csv_name, text_columns=text_cols, max_records=max_train_records)
     loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
+    lora_module, opt, loader = accelerator.prepare(lora_module, torch.optim.AdamW(lora_module.parameters(), lr=lr), loader)
     total_steps = epochs * len(loader)
     step_counter = 0
     yield "[DEBUG] Starting training loop...\n", 0.0
     for ep in range(epochs):
+        yield f"[DEBUG] Epoch {ep+1}/{epochs}\n", step_counter / total_steps
         for i, batch in enumerate(loader):
             ex = batch[0]
             texts = ex["text"]
             # Tokenization
             tokens = tokenizer([texts.get("short_prompt",""), texts.get("long_prompt","")],
                                padding=True, truncation=True, return_tensors="pt").to(DEVICE)
+            logs.append(f"[DEBUG] Step {step_counter}, input_ids shape: {tokens['input_ids'].shape}")
             # Forward pass
             outputs = lora_module(**tokens)
             opt.zero_grad()
             step_counter += 1
+            yield "\n".join(logs[-10:]), step_counter / total_steps
     Path(output_dir).mkdir(exist_ok=True)
     lora_module.save_pretrained(output_dir)
     yield f"[INFO] LoRA saved to {output_dir}\n", 1.0
+# ---------------- HF Upload ----------------
 def upload_adapter(local, repo_id):
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        raise RuntimeError("HF_TOKEN missing")
     create_repo(repo_id, exist_ok=True)
     upload_folder(local, repo_id=repo_id, repo_type="model", token=token)
     return f"https://huggingface.co/{repo_id}"
         gr.Markdown("# 🌐 Universal Dynamic LoRA Trainer (Gemma LLM)")
         with gr.Row():
+            base_model = gr.Textbox(label="Base model", value="google/gemma-3-4b-it")
+            dataset = gr.Textbox(label="Dataset folder or HF repo", value="rahul7star/prompt-enhancer-dataset-01")
+            csvname = gr.Textbox(label="CSV/Parquet file", value="train.csv")
+            short_col = gr.Textbox(label="Short prompt column", value="short_prompt")
+            long_col = gr.Textbox(label="Long prompt column", value="long_prompt")
+            out = gr.Textbox(label="Output dir", value="./adapter_out")
+            repo = gr.Textbox(label="Upload HF repo (optional)", value="")
         with gr.Row():
             batch_size = gr.Number(value=1, label="Batch size")
             num_workers = gr.Number(value=0, label="DataLoader num_workers")
+            r = gr.Slider(1, 64, value=8, label="LoRA rank")
+            a = gr.Slider(1, 64, value=16, label="LoRA alpha")
+            ep = gr.Number(value=1, label="Epochs")
+            lr = gr.Number(value=1e-4, label="Learning rate")
             max_records = gr.Number(value=1000, label="Max training records")
+            btn = gr.Button("🚀 Start Training")
+        logs_box = gr.Textbox(label="Logs", lines=20)
+        progress_bar = gr.Progress()
+        def launch(bm, ds, csv, sc, lc, out_dir, batch, num_w, r_, a_, ep_, lr_, max_rec, repo_):
             for log_text, prog in train_lora_stream(
                 bm, ds, csv, [sc, lc], out_dir,
                 int(ep_), float(lr_), int(r_), int(a_),
                 int(batch), int(num_w), max_train_records=int(max_rec)
             ):
+                progress_bar.progress = prog
+                yield log_text
             if repo_:
                 link = upload_adapter(out_dir, repo_)
+                yield f"[INFO] Uploaded to {link}"
+        btn.click(
+            launch,
+            [base_model, dataset, csvname, short_col, long_col, out, batch_size, num_workers, r, a, ep, lr, max_records, repo],
+            logs_box
+        )
     return demo
+if __name__ == "__main__":
     run_ui().launch(server_name="0.0.0.0", server_port=7860, share=True)