import gradio as gr
from llava.eval.model_vqa_demo import eval_model
import os

def inference(my_prompt, my_image):
    # Run the VLC model on a single (prompt, image) pair; eval_model is the
    # repo's evaluation entry point and is called here as (image_path, prompt_text).
    outputs = eval_model(my_image, my_prompt)
    return outputs

with gr.Blocks(css=".custom-title { text-align: center; font-size: 2rem; font-weight: bold; margin-bottom: 20px; }") as demo:
    with gr.Row():
        # Title styled by the .custom-title rule defined in the Blocks CSS above.
        gr.Markdown("<div class='custom-title'>VLC Demo</div>")
    gr.Markdown(
        "This is the official demo of the latest version of "
        "'Vision-Language-Camera: Aligning User Intent and Camera Expertise "
        "through Vision-Language Models'."
    )
    with gr.Row():
        with gr.Column(scale=1):
            prompt = gr.Textbox(label="Your Prompt", placeholder="Type your description or question...", lines=3)
            image = gr.Image(label="Input Image", type='filepath')
            run_btn = gr.Button("Run Inference", variant="primary")
            # Resolve the bundled example relative to this script so the demo
            # works regardless of the current working directory.
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/examples/img1.jpg",
                        "How to tune the camera parameters to make this image clearer?",
                    ],
                ],
                inputs=[image, prompt],
            )
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="VLC Response", lines=3)

    run_btn.click(fn=inference, inputs=[prompt, image], outputs=[output_text])

demo.launch()
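
# Note: launch() serves the demo locally by default; Gradio's standard
# share=True option, e.g. demo.launch(share=True), exposes it via a public URL.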