import os

import gradio as gr

from llava.eval.model_vqa_demo import eval_model


def inference(my_prompt, my_image):
    # Run VLC inference: eval_model takes the image path first, then the prompt.
    outputs = eval_model(my_image, my_prompt)
    return outputs


with gr.Blocks(
    css=".custom-title { text-align: center; font-size: 2rem; font-weight: bold; margin-bottom: 20px; }"
) as demo:
    with gr.Row():
        # Centered page title, styled by the .custom-title CSS rule above.
        gr.Markdown("<div class='custom-title'>VLC Demo</div>")
    gr.Markdown(
        "This is the official demo of the latest version of "
        "'Vision-Language-Camera: Aligning User Intent and Camera Expertise "
        "through Vision-Language Models'."
    )
    with gr.Row():
        with gr.Column(scale=1):
            prompt = gr.Textbox(
                label="Your Prompt",
                placeholder="Type your description or question...",
                lines=3,
            )
            image = gr.Image(label="Input Image", type="filepath")
            run_btn = gr.Button("Run Inference", variant="primary")

            # Bundled example: an image plus a camera-tuning question,
            # resolved relative to this script's directory.
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/examples/img1.jpg",
                        "How to tune the camera parameters to make this image clearer?",
                    ],
                ],
                inputs=[image, prompt],
            )
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="VLC Response", lines=3)

    run_btn.click(fn=inference, inputs=[prompt, image], outputs=[output_text])

demo.launch()