Spaces:

rahul7star
/

Train-Lora

Running

App Files Files Community

rahul7star committed on Nov 9

Commit

609c7e3

verified ·

1 Parent(s): c9e5f78

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -99

app.py CHANGED Viewed

@@ -1,113 +1,71 @@
-import os, torch, gradio as gr, pandas as pd, numpy as np
-from pathlib import Path
-from tqdm.auto import tqdm
-from huggingface_hub import create_repo, upload_folder, hf_hub_download
-from torch.utils.data import Dataset, DataLoader
-import torch.nn as nn
-# ============================================================
-# 🧠 Intelligent dataset loader
-# ============================================================
-from datasets import load_dataset, DatasetDict
-def load_dataset_intelligent(source: str, subset: str = None):
     """
-    🔍 Intelligent dataset loader for CSV, Parquet, or HF Hub.
-    Detects:
-      - Local CSV/parquet file
-      - Local folder containing CSVs
-      - Hugging Face Hub dataset repo
-    Returns dict of {split: DataFrame}
     """
-    def try_load_local_csv(path):
-        if os.path.exists(path) and path.endswith((".csv", ".parquet")):
-            print(f"📄 Loading local file: {path}")
-            return pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)
-        return None
-    def try_load_local_folder(path):
-        if os.path.isdir(path):
-            csv_files = [f for f in os.listdir(path) if f.endswith((".csv", ".parquet"))]
-            if csv_files:
-                print(f"📁 Found folder with {len(csv_files)} data files in: {path}")
-                dataframes = {}
-                for file in csv_files:
-                    split_name = "train" if "train" in file else os.path.splitext(file)[0]
-                    fpath = os.path.join(path, file)
-                    df = pd.read_parquet(fpath) if fpath.endswith(".parquet") else pd.read_csv(fpath)
-                    dataframes[split_name] = df
-                return dataframes
-        return None
-    # 1️⃣ Local file
-    df = try_load_local_csv(source)
-    if df is not None:
-        return {"train": df}
-    # 2️⃣ Folder with CSVs
-    dfs = try_load_local_folder(source)
-    if dfs is not None:
-        return dfs
-    # 3️⃣ Hugging Face Hub
-    print(f"🌐 Attempting to load from Hugging Face Hub: {source}")
-    try:
-        ds = load_dataset(source, subset or None)
-        if isinstance(ds, DatasetDict):
-            print(f"✅ Loaded HF dataset with splits: {list(ds.keys())}")
-            return {split: ds[split].to_pandas() for split in ds.keys()}
-        else:
-            print("✅ Loaded single-split HF dataset")
-            return {"train": ds.to_pandas()}
-    except Exception as e:
-        raise FileNotFoundError(f"❌ Could not load dataset: {source}\nError: {str(e)}")
-# ============================================================
-# 📁 Diffusion Dataset (uses intelligent loader)
-# ============================================================
-class MediaTextDataset(Dataset):
-    def __init__(self, source, csv_name="dataset.csv", max_frames=5):
-        self.source = source
-        self.max_frames = max_frames
-        self.data_splits = load_dataset_intelligent(source)
-        # Auto-pick train split
-        self.df = self.data_splits.get("train") or list(self.data_splits.values())[0]
-        self.root = Path(source) if os.path.isdir(source) else None
-        import torchvision.transforms as T
-        self.img_tf = T.Compose([
-            T.ToPILImage(), T.Resize((512,512)),
-            T.ToTensor(), T.Normalize([0.5]*3, [0.5]*3)
-        ])
-        self.video_tf = T.Compose([
-            T.ToPILImage(), T.Resize((128,256)),
-            T.ToTensor(), T.Normalize([0.5]*3, [0.5]*3)
-        ])
-    def __len__(self): return len(self.df)
-    def __getitem__(self, i):
-        import torchvision
-        rec = self.df.iloc[i]
-        fname = rec.get("file_name") or rec.get("image") or rec.get("path")
-        text = rec.get("text") or rec.get("caption") or rec.get("prompt")
-        p = Path(self.root / fname) if self.root else Path(fname)
-        if not p.exists():
-            raise FileNotFoundError(f"Missing file: {p}")
-        if p.suffix.lower() in {".jpg",".jpeg",".png",".webp"}:
-            img = torchvision.io.read_image(str(p))
-            if isinstance(img, torch.Tensor): img = img.permute(1,2,0).numpy()
-            return {"type": "image", "image": self.img_tf(img), "caption": text}
-        elif p.suffix.lower() in {".mp4",".mov",".avi",".mkv"}:
-            vid,_,_ = torchvision.io.read_video(str(p))
-            total = len(vid)
-            if total == 0:
-                return {"type":"video","frames":torch.zeros((self.max_frames,3,128,256))}
-            idxs = np.linspace(0,total-1,self.max_frames).round().astype(int)
-            frames = torch.stack([self.video_tf(vid[j].numpy()) for j in idxs])
-            return {"type":"video","frames":frames,"caption":text}
-        else:
-            raise RuntimeError(f"Unsupported media: {p}")

+import os
+import pandas as pd
+from datasets import load_dataset
+import gradio as gr
def load_data(source_path):
    """
    Load a dataset into a pandas DataFrame from a local CSV file or a
    Hugging Face dataset path.

    The source type is detected automatically:
      1. If ``source_path`` names an existing regular file, it is read with
         ``pd.read_csv``.
      2. Otherwise, if it contains a ``/``, it is passed to ``load_dataset``
         and the ``train`` split is converted to a DataFrame.
      3. Anything else is rejected as an invalid path.

    Args:
        source_path: Local CSV path or dataset repo id such as
            ``username/dataset-name``. Surrounding whitespace is ignored and
            ``None`` is treated as an empty string.

    Returns:
        The loaded DataFrame. This function does not raise: on any failure
        the error is printed and an empty DataFrame is returned, so callers
        can simply check ``df.empty``.
    """
    try:
        # Textbox input often carries stray whitespace or a trailing newline.
        source = "" if source_path is None else str(source_path).strip()
        if not source:
            raise ValueError("No dataset path provided.")

        # --- Case 1: Local CSV file ---
        # isfile (not exists) so that a directory is never handed to read_csv.
        if os.path.isfile(source):
            print(f"📂 Loading local dataset from: {source}")
            df = pd.read_csv(source)
            print(f"✅ Loaded {len(df)} rows from local CSV.")
            return df

        # --- Case 2: Hugging Face dataset ---
        if "/" in source:
            print(f"🌐 Loading Hugging Face dataset: {source}")
            dataset = load_dataset(source, split="train")
            df = dataset.to_pandas()
            print(f"✅ Loaded {len(df)} rows from Hugging Face dataset.")
            return df

        raise FileNotFoundError("Invalid path: not a local file or HF dataset.")
    except Exception as e:
        # Deliberate best-effort boundary: the UI shows a "no data" message
        # instead of crashing, and the cause is reported on stdout.
        print(f"❌ Error loading data: {e}")
        return pd.DataFrame()
def summarize_dataset(df):
    """
    Return a brief Markdown summary of the dataset for display in Gradio.

    Args:
        df: DataFrame to describe. ``None`` is treated like an empty frame.

    Returns:
        A tuple ``(info, preview)`` of strings. ``info`` reports the row and
        column counts plus the column names; ``preview`` is a table of the
        rows returned by ``df.head()``. For an empty (or missing) frame the
        result is ``("❌ No data loaded.", "")``.
    """
    if df is None or df.empty:
        return "❌ No data loaded.", ""

    head = df.head()
    try:
        preview = head.to_markdown(index=False)
    except ImportError:
        # to_markdown depends on the optional `tabulate` package; without it,
        # fall back to a plain-text table inside a code fence so the UI
        # callback still returns something readable.
        preview = f"```\n{head.to_string(index=False)}\n```"

    # Column labels are not guaranteed to be strings, so stringify before joining.
    column_names = ", ".join(map(str, df.columns))
    info = f"✅ Loaded {len(df)} rows and {len(df.columns)} columns.\n\n**Columns:** {column_names}"
    return info, preview
def gradio_ui(default_source="rahul7star/prompt-enhancer-dataset-01"):
    """
    Build the Gradio Blocks app for the dataset loader.

    The layout is a textbox for the dataset path, a load button, and two
    Markdown panes that receive the status line and the table preview
    returned by ``summarize_dataset``.

    Args:
        default_source: Value pre-filled in the dataset path textbox.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """

    def handle_load(path):
        # load_data returns an empty DataFrame on failure, which
        # summarize_dataset turns into a "no data" message.
        return summarize_dataset(load_data(path))

    with gr.Blocks(title="Prompt Enhancer Data Loader") as demo:
        gr.Markdown("## 🧠 Intelligent Dataset Loader")
        gr.Markdown("Automatically loads from a local CSV file **or** a Hugging Face dataset repo.")

        with gr.Row():
            dataset_path = gr.Textbox(
                label="Enter dataset path (local or HF repo)",
                value=default_source,
                placeholder="e.g., /path/to/local.csv or username/dataset-name",
            )

        load_btn = gr.Button("🚀 Load Dataset")
        output_info = gr.Markdown()
        output_preview = gr.Markdown()

        load_btn.click(
            handle_load,
            inputs=[dataset_path],
            outputs=[output_info, output_preview],
        )

    return demo
if __name__ == "__main__":
    # Defaults match the previously hard-coded values (all interfaces,
    # port 7860); both can be overridden through the environment without
    # editing the file.
    gradio_ui().launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
    )