Spaces:

black-forest-labs
/

FLUX.2-dev

Running on Zero

App Files Files Community

multimodalart HF Staff committed 13 days ago

Commit

989c44e

verified ·

1 Parent(s): 06529b5

Add prompt upsampling

Browse files

Files changed (1) hide show

app.py +150 -25

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ from optimization import optimize_pipeline_
 import requests
 from PIL import Image
 import json
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -20,6 +22,34 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
 def remote_text_encoder(prompts):
     from gradio_client import Client
@@ -29,8 +59,8 @@ def remote_text_encoder(prompts):
         api_name="/encode_text"
     )
     prompt_embeds = torch.load(result[0])
     return prompt_embeds
 # Load model
@@ -48,56 +78,136 @@ pipe = Flux2Pipeline.from_pretrained(
     transformer=dit,
     torch_dtype=torch.bfloat16
 )
-pipe.to("cuda")
 pipe.transformer.set_attention_backend("_flash_3_hub")
 optimize_pipeline_(
     pipe,
     image=[Image.new("RGB", (1024, 1024))],
-    prompt_embeds = remote_text_encoder("prompt").to("cuda"),
     guidance_scale=2.5,
     width=1024,
     height=1024,
     num_inference_steps=1
 )
-def get_duration(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, progress=gr.Progress(track_tqdm=True)):
-    num_images = 0 if input_images is None else len(input_images)
-    step_duration = 1 + 0.7 * num_images
-    return max(65, num_inference_steps * step_duration + 10)
 @spaces.GPU(duration=get_duration)
-def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, progress=gr.Progress(track_tqdm=True)):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
-    # Get prompt embeddings from remote text encoder
-    progress(0.1, desc="Encoding prompt...")
-    prompt_embeds = remote_text_encoder(prompt).to("cuda")
     # Prepare image list (convert None or empty gallery to None)
     image_list = None
     if input_images is not None and len(input_images) > 0:
         image_list = []
         for item in input_images:
             image_list.append(item[0])
-    # Generate image
-    progress(0.3, desc="Generating image...")
-    generator = torch.Generator(device=device).manual_seed(seed)
-    image = pipe(
-        prompt_embeds=prompt_embeds,
-        image=image_list,
-        width=width,
-        height=height,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        generator=generator,
-    ).images[0]
     return image, seed
@@ -118,6 +228,9 @@ css="""
     margin: 0 auto;
     max-width: 620px;
 }
 """
 with gr.Blocks() as demo:
@@ -152,6 +265,12 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
         with gr.Accordion("Advanced Settings", open=False):
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,
@@ -180,6 +299,12 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
                     value=1024,
                 )
             with gr.Row():
                 num_inference_steps = gr.Slider(
@@ -219,7 +344,7 @@ FLUX.2 [dev] is a 32B model rectified flow capable of generating, editing and co
     gr.on(
         triggers=[run_button.click, prompt.submit],
         fn=infer,
-        inputs=[prompt, input_images, seed, randomize_seed, width, height, num_inference_steps, guidance_scale],
         outputs=[result, seed]
     )

 import requests
 from PIL import Image
 import json
+import base64
+from huggingface_hub import InferenceClient
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
+# Setup VLM Client
+hf_client = InferenceClient(
+    api_key=os.environ.get("HF_TOKEN"),
+)
+VLM_MODEL = "baidu/ERNIE-4.5-VL-424B-A47B-Base-PT"
+SYSTEM_PROMPT_TEXT_ONLY = """You are an expert prompt engineer for FLUX.2 by Black Forest Labs. Rewrite user prompts to be more descriptive while strictly preserving their core subject and intent.
+Guidelines:
+1. Structure: Keep structured inputs structured (enhance within fields). Convert natural language to detailed paragraphs.
+2. Details: Add concrete visual specifics - form, scale, textures, materials, lighting (quality, direction, color), shadows, spatial relationships, and environmental context.
+3. Text in Images: Put ALL text in quotation marks, matching the prompt's language. Always provide explicit quoted text for objects that would contain text in reality (signs, labels, screens, etc.) - without it, the model generates gibberish.
+Output only the revised prompt and nothing else."""
+SYSTEM_PROMPT_WITH_IMAGES = """You are FLUX.2 by Black Forest Labs, an image-editing expert. You convert editing requests into one concise instruction (50-80 words, ~30 for brief requests).
+Rules:
+- Single instruction only, no commentary
+- Use clear, analytical language (avoid "whimsical," "cascading," etc.)
+- Specify what changes AND what stays the same (face, lighting, composition)
+- Reference actual image elements
+- Turn negatives into positives ("don't change X" → "keep X")
+- Make abstractions concrete ("futuristic" → "glowing cyan neon, metallic panels")
+- Keep content PG-13
+Output only the final instruction in plain text and nothing else."""
 def remote_text_encoder(prompts):
     from gradio_client import Client
         api_name="/encode_text"
     )
+    # Load returns a tensor, usually on CPU by default
     prompt_embeds = torch.load(result[0])
     return prompt_embeds
 # Load model
     transformer=dit,
     torch_dtype=torch.bfloat16
 )
+pipe.to(device)
 pipe.transformer.set_attention_backend("_flash_3_hub")
+# Optimization runs once at startup
 optimize_pipeline_(
     pipe,
     image=[Image.new("RGB", (1024, 1024))],
+    prompt_embeds = remote_text_encoder("prompt").to(device),
     guidance_scale=2.5,
     width=1024,
     height=1024,
     num_inference_steps=1
 )
+def image_to_data_uri(img):
+    buffered = io.BytesIO()
+    img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return f"data:image/png;base64,{img_str}"
+def upsample_prompt_logic(prompt, image_list):
+    try:
+        if image_list and len(image_list) > 0:
+            # Image + Text Editing Mode
+            system_content = SYSTEM_PROMPT_WITH_IMAGES
+            # Construct user message with text and images
+            user_content = [{"type": "text", "text": prompt}]
+            for img in image_list:
+                data_uri = image_to_data_uri(img)
+                user_content.append({
+                    "type": "image_url",
+                    "image_url": {"url": data_uri}
+                })
+            messages = [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content}
+            ]
+        else:
+            # Text Only Mode
+            system_content = SYSTEM_PROMPT_TEXT_ONLY
+            messages = [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": prompt}
+            ]
+        completion = hf_client.chat.completions.create(
+            model=VLM_MODEL,
+            messages=messages,
+            max_tokens=1024
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(f"Upsampling failed: {e}")
+        return prompt
+# Updated duration function to match generate_image arguments (including progress)
+def get_duration(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, force_dimensions, progress=gr.Progress(track_tqdm=True)):
+    num_images = 0 if image_list is None else len(image_list)
+    step_duration = 1 + 0.8 * num_images
+    return max(65, num_inference_steps * step_duration + 10)
 @spaces.GPU(duration=get_duration)
+def generate_image(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, force_dimensions, progress=gr.Progress(track_tqdm=True)):
+    # Move embeddings to GPU only when inside the GPU decorated function
+    prompt_embeds = prompt_embeds.to(device)
+    generator = torch.Generator(device=device).manual_seed(seed)
+    pipe_kwargs = {
+        "prompt_embeds": prompt_embeds,
+        "image": image_list,
+        "num_inference_steps": num_inference_steps,
+        "guidance_scale": guidance_scale,
+        "generator": generator,
+    }
+    if image_list is None or force_dimensions:
+        pipe_kwargs["width"] = width
+        pipe_kwargs["height"] = height
+    # Progress bar for the actual generation steps
+    if progress:
+        progress(0, desc="Starting generation...")
+    image = pipe(**pipe_kwargs).images[0]
+    return image
+def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, force_dimensions=False, prompt_upsampling=False, progress=gr.Progress(track_tqdm=True)):
+    """Handle one generation request end to end.
+
+    Steps: pick the seed, collect the input images, optionally upsample the
+    prompt, fetch prompt embeddings from the remote text encoder, then call
+    ``generate_image``.
+
+    Returns:
+        ``(image, seed)`` - the generated image and the seed that was used
+        (a fresh random one when ``randomize_seed`` is set).
+    """
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     # Prepare image list (convert None or empty gallery to None)
     image_list = None
     if input_images is not None and len(input_images) > 0:
         image_list = []
         for item in input_images:
+            # NOTE(review): presumably each gallery item is an (image, caption) pair - confirm
             image_list.append(item[0])
+    # 1. Upsampling (Network bound - No GPU needed)
+    final_prompt = prompt
+    if prompt_upsampling:
+        progress(0.05, desc="Upsampling prompt...")
+        final_prompt = upsample_prompt_logic(prompt, image_list)
+        print(f"Original Prompt: {prompt}")
+        print(f"Upsampled Prompt: {final_prompt}")
+    # 2. Text Encoding (Network bound - No GPU needed)
+    progress(0.1, desc="Encoding prompt...")
+    # This returns CPU tensors
+    prompt_embeds = remote_text_encoder(final_prompt)
+    # 3. Image Generation (GPU bound)
+    progress(0.3, desc="Waiting for GPU...")
+    # Arguments are passed positionally, in the parameter order of generate_image
+    image = generate_image(
+        prompt_embeds,
+        image_list,
+        width,
+        height,
+        num_inference_steps,
+        guidance_scale,
+        seed,
+        force_dimensions,
+        progress
+    )
     return image, seed
     margin: 0 auto;
     max-width: 620px;
 }
+.gallery-container img{
+    object-fit: contain;
+}
 """
 with gr.Blocks() as demo:
         with gr.Accordion("Advanced Settings", open=False):
+            prompt_upsampling = gr.Checkbox(
+                label="Prompt Upsampling",
+                value=True,
+                info="Automatically enhance the prompt using a VLM"
+            )
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,
                     value=1024,
                 )
+            force_dimensions = gr.Checkbox(
+                label="Force width/height when image input",
+                value=False,
+                info="When unchecked, width/height settings are ignored if input images are provided"
+            )
             with gr.Row():
                 num_inference_steps = gr.Slider(
     gr.on(
         triggers=[run_button.click, prompt.submit],
         fn=infer,
+        inputs=[prompt, input_images, seed, randomize_seed, width, height, num_inference_steps, guidance_scale, force_dimensions, prompt_upsampling],
         outputs=[result, seed]
     )