Spaces:

Segizu
/

Computer_Vision

Sleeping

App Files Files Community

Segizu committed on Mar 14

Commit

b518740

1 Parent(s): 8ac08dc

yolov11

Browse files

Files changed (2) hide show

app.py +158 -80
requirements.txt +1 -5

app.py CHANGED Viewed

@@ -1,87 +1,165 @@
-import cv2
 import gradio as gr
 from ultralytics import YOLO
-from PIL import Image
 import tempfile
-# Cargamos el modelo YOLOv8 (puedes usar yolov8n.pt, yolov8s.pt, etc.)
-model = YOLO("yolov8n.pt")
-def process_video(video_path):
     """
-    Procesa un video, detecta personas, bicicletas y motos con YOLOv8,
-    y dibuja los recuadros y etiquetas en cada frame. Devuelve un .mp4 anotado.
     """
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        return None
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    # Creamos un archivo temporal para guardar el resultado
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    output_path = tmp_file.name
-    tmp_file.close()
-    # Usamos un códec compatible con navegadores (H.264 / avc1)
-    fourcc = cv2.VideoWriter_fourcc(*'avc1')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-    # Clases que nos interesan
-    valid_classes = ["person", "bicycle", "motorcycle"]
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        # Convertir BGR -> RGB para predecir con YOLO
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        # Hacemos la inferencia con un umbral de confianza del 0.5
-        results = model.predict(frame_rgb, conf=0.5)
-        # results es una lista; tomamos la primera predicción
-        boxes = results[0].boxes
-        # Dibujamos cada bounding box
-        for box in boxes:
-            # box.cls, box.conf y box.xyxy son tensores, así que convertimos a Python float/int
-            cls_id = int(box.cls[0].item())     # Índice de la clase
-            conf   = float(box.conf[0].item())  # Confianza
-            x1, y1, x2, y2 = box.xyxy[0]        # Coordenadas [xmin, ymin, xmax, ymax]
-            class_name = model.names[cls_id]
-            if class_name in valid_classes:
-                # Dibujamos el rectángulo
-                cv2.rectangle(frame,
-                              (int(x1), int(y1)),
-                              (int(x2), int(y2)),
-                              (0, 255, 0), 2)
-                text = f"{class_name} {conf:.2f}"
-                cv2.putText(frame, text,
-                            (int(x1), int(y1) - 10),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.5,
-                            (0, 255, 0), 2)
-        # Guardamos el frame anotado en el video de salida
-        out.write(frame)
-    cap.release()
-    out.release()
-    return output_path
-# Interfaz de Gradio
-iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Video(label="Sube tu video"),
-    outputs=gr.Video(label="Video procesado"),
-    title="Detección de Objetos con YOLOv8",
-    description="Sube un video y se detectan personas, bicicletas y motos con YOLOv8. "
-                "Los objetos se enmarcan y etiquetan en el video resultante."
-)
-if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+from PIL import Image, ImageDraw, ImageFont
 from ultralytics import YOLO
+import spaces
+import cv2
+import numpy as np
 import tempfile
+def _placeholder_image(message, width=640, height=480):
+    """Return a white width x height RGB PIL image with `message` centred in black text."""
+    canvas = Image.new("RGB", (width, height), color="white")
+    draw = ImageDraw.Draw(canvas)
+    font = ImageFont.load_default(size=40)
+    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
+    text_x = (width - (right - left)) / 2
+    text_y = (height - (bottom - top)) / 2
+    draw.text((text_x, text_y), message, fill="black", font=font)
+    return canvas
+def _new_mp4_writer(fps, frame_size):
+    """Create a temporary .mp4 file and an mp4v cv2.VideoWriter on it; return (path, writer)."""
+    # Only the path is needed, so close the Python-side handle before OpenCV opens the file.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
+        path = handle.name
+    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, frame_size)
+    return path, writer
+@spaces.GPU
+def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
+    """
+    Run a YOLO model on an image or a video and return the annotated result.
+    Args:
+        input_type: "Image" or "Video"; any other value yields (None, None).
+        image: source passed to model.predict in image mode, or None.
+        video: source passed to cv2.VideoCapture in video mode, or None.
+        model_id: value passed to YOLO(...) to load the model.
+        conf_threshold: forwarded to predict as `conf`.
+        iou_threshold: forwarded to predict as `iou`.
+        max_detection: forwarded to predict as `max_det`.
+    Returns:
+        A pair (annotated_image, annotated_video_path); the slot that does not
+        match input_type is None. A missing input produces a placeholder image
+        (or a one-frame placeholder video) carrying an explanatory message.
+        A video from which no frame could be read yields (None, None).
+    """
+    predict_options = dict(
+        conf=conf_threshold,
+        iou=iou_threshold,
+        imgsz=640,
+        max_det=max_detection,
+        show_labels=True,
+        show_conf=True,
+    )
+    if input_type == "Image":
+        if image is None:
+            return _placeholder_image("No image provided"), None
+        results = YOLO(model_id).predict(source=image, **predict_options)
+        if not results:
+            # Nothing to plot; avoid referencing an image that was never produced.
+            return None, None
+        # The plotted array is treated as BGR, so the channel axis is reversed for PIL.
+        return Image.fromarray(results[-1].plot()[..., ::-1]), None
+    if input_type == "Video":
+        if video is None:
+            placeholder = _placeholder_image("No video provided")
+            placeholder_path, writer = _new_mp4_writer(1, placeholder.size)
+            try:
+                writer.write(cv2.cvtColor(np.array(placeholder), cv2.COLOR_RGB2BGR))
+            finally:
+                writer.release()
+            return None, placeholder_path
+        model = YOLO(model_id)
+        cap = cv2.VideoCapture(video)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        if not fps > 0:
+            # Fall back to 25 fps when the container reports no usable frame rate.
+            fps = 25
+        output_path, writer = None, None
+        try:
+            while True:
+                ok, frame = cap.read()
+                if not ok:
+                    break
+                pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                results = model.predict(source=pil_frame, **predict_options)
+                # Keep the clip length intact: use the raw frame if nothing was returned.
+                annotated_bgr = results[-1].plot() if results else frame
+                if writer is None:
+                    # Open the output lazily so an unreadable video creates no file.
+                    frame_height, frame_width = annotated_bgr.shape[:2]
+                    output_path, writer = _new_mp4_writer(fps, (frame_width, frame_height))
+                # Stream each frame straight to disk instead of buffering the whole clip.
+                writer.write(annotated_bgr)
+        finally:
+            cap.release()
+            if writer is not None:
+                writer.release()
+        return None, output_path
+    return None, None
+def update_visibility(input_type):
     """
+    Choose which input/output widgets are visible for the selected input_type.
+    Returns four gr.update objects in the order: image input, video input,
+    image output, video output. "Image" shows the image pair and hides the
+    video pair; any other value does the opposite.
     """
+    show_image = input_type == "Image"
+    show_video = not show_image
+    # Same order as the outputs list wired to input_type.change.
+    return (
+        gr.update(visible=show_image),
+        gr.update(visible=show_video),
+        gr.update(visible=show_image),
+        gr.update(visible=show_video),
+    )
+def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
+    """
+    Image-only wrapper around yolo_inference, intended as a gr.Examples callback
+    (no gr.Examples call is visible in this file -- confirm before relying on it).
+    Runs yolo_inference with input_type fixed to "Image" and no video, drops the
+    video slot of the result, and returns a radio update that selects "Image"
+    together with the annotated image.
+    """
+    request = {
+        "input_type": "Image",
+        "image": image,
+        "video": None,
+        "model_id": model_id,
+        "conf_threshold": conf_threshold,
+        "iou_threshold": iou_threshold,
+        "max_detection": max_detection,
+    }
+    outputs = yolo_inference(**request)
+    annotated_image = outputs[0]
+    radio_update = gr.update(value="Image")
+    return radio_update, annotated_image
+# Build the Gradio UI: inputs and settings in the left column, results in the right.
+with gr.Blocks() as app:
+    gr.Markdown("# Yolo11: Object Detection, Instance Segmentation, Pose/Keypoints, Oriented Detection, Classification")
+    gr.Markdown("Upload image(s) or video(s) for inference using the latest Ultralytics YOLO11 models.")
+    with gr.Row():
+        with gr.Column():
+            image = gr.Image(type="pil", label="Image", visible=True)
+            video = gr.Video(label="Video", visible=False)
+            input_type = gr.Radio(
+                choices=["Image", "Video"],
+                value="Image",
+                label="Input Type",
+            )
+            # The click handler below lists model_id among its inputs, so the
+            # component must exist; without it this module raises NameError.
+            model_id = gr.Dropdown(
+                label="Model",
+                choices=[
+                    "yolo11n.pt",
+                    "yolo11s.pt",
+                    "yolo11m.pt",
+                    "yolo11l.pt",
+                    "yolo11x.pt",
+                    "yolo11n-seg.pt",
+                    "yolo11n-pose.pt",
+                    "yolo11n-obb.pt",
+                    "yolo11n-cls.pt",
+                ],
+                value="yolo11n.pt",
+            )
+            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
+            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
+            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
+            infer_button = gr.Button("Detect Objects")
+        with gr.Column():
+            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
+            output_video = gr.Video(label="Annotated Video", visible=False)
+    # Toggle input/output visibility
+    input_type.change(
+        fn=update_visibility,
+        inputs=input_type,
+        outputs=[image, video, output_image, output_video],
+    )
+    # Main inference for button click
+    infer_button.click(
+        fn=yolo_inference,
+        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
+        outputs=[output_image, output_video],
+    )
+# Start the app only when run as a script.
+if __name__ == '__main__':
+    app.launch()

requirements.txt CHANGED Viewed

@@ -1,9 +1,5 @@
-gradio
-opencv-python
-transformers
 torch
-tensorflow
 torchvision
-timm
 ultralytics
 Pillow

+spaces
 torch
 torchvision
 ultralytics
 Pillow