import os

import gradio as gr
from llava.eval.model_vqa_demo import eval_model
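

# Assumption: eval_model from llava.eval.model_vqa_demo takes an image path and
# a prompt string, in that order, and returns the model's text response; adjust
# the call below if your local module's signature differs.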
def inference(my_prompt, my_image):
    outputs = eval_model(my_image, my_prompt)
    return outputs
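

# Two-column UI: prompt and image inputs on the left (scale=1), the model's
# response on the right (scale=2).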
with gr.Blocks(css=".custom-title { text-align: center; font-size: 2rem; font-weight: bold; margin-bottom: 20px; }") as demo:
    with gr.Row():
        gr.Markdown("<div class='custom-title'>VLC Demo</div>")
        gr.Markdown(
            "This is the official demo of the latest version of "
            "'Vision-Language-Camera: Aligning User Intent and Camera Expertise "
            "through Vision-Language Models'."
        )
    with gr.Row():
        with gr.Column(scale=1):
            prompt = gr.Textbox(label="Your Prompt", placeholder="Type your description or question...", lines=3)
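            # type='filepath' hands the callback a local file path rather than a
            # numpy array, which is what eval_model is assumed to consume.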
            image = gr.Image(label="Input Image", type='filepath')
            run_btn = gr.Button("Run Inference", variant="primary")
            cur_dir = os.path.dirname(os.path.abspath(__file__))
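            # Bundled example; clicking it populates the image and prompt inputs.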
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/examples/img1.jpg",
                        "How to tune the camera parameters to make this image clearer?",
                    ],
                ],
                inputs=[image, prompt],
            )
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="VLC Response", lines=3)

    # The click handler must be wired up inside the Blocks context; the input
    # order [prompt, image] matches inference(my_prompt, my_image).
    run_btn.click(fn=inference, inputs=[prompt, image], outputs=[output_text])

demo.launch()
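
# A sketch of common launch options (all standard gr.Blocks.launch kwargs),
# e.g. to serve on all interfaces or create a temporary public share link:
# demo.launch(server_name="0.0.0.0", server_port=7860, share=True)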