Spaces:

lakshayt
/

MemeGradio

Runtime error

App Files Files Community

lakshayt committed on Oct 10, 2023

Commit

188ba58

1 Parent(s): 369f754

Create app.py

Browse files

Files changed (1) hide show

app.py +113 -0

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import os
+import gradio as gr
+import torch
+import PIL
+from open_flamingo import create_model_and_transforms
+model, image_processor, tokenizer = create_model_and_transforms(
+    clip_vision_encoder_path="ViT-L-14",
+    clip_vision_encoder_pretrained="openai",
+    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
+    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
+    cross_attn_every_n_layers=1,
+    cache_dir="PATH/TO/CACHE/DIR"  # Defaults to ~/.cache
+)
+# grab model checkpoint from huggingface hub
+from huggingface_hub import hf_hub_download
+import torch
+checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
+model.load_state_dict(torch.load(checkpoint_path), strict=False)
+from PIL import Image
+import requests
+import torch
+"""
+Step 1: Load images
+"""
+demo_image_one = Image.open(
+    requests.get(
+        "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
+    ).raw
+)
+demo_image_two = Image.open(
+    requests.get(
+        "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
+        stream=True
+    ).raw
+)
+query_image = Image.open(
+    requests.get(
+        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg",
+        stream=True
+    ).raw
+)
+"""
+Step 2: Preprocessing images
+Details: For OpenFlamingo, we expect the image to be a torch tensor of shape
+ batch_size x num_media x num_frames x channels x height x width.
+ In this case batch_size = 1, num_media = 3, num_frames = 1,
+ channels = 3, height = 224, width = 224.
+Step 3: Preprocessing text
+Details: In the text we expect an <image> special token to indicate where an image is.
+ We also expect an <|endofchunk|> special token to indicate the end of the text
+ portion associated with an image.
+tokenizer.padding_side = "left" # For generation padding tokens should be on the left
+lang_x = tokenizer(
+    ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"],
+    return_tensors="pt",
+)
+"""
+"""
+Step 4: Generate text
+"""
+#print("Generated text: ", tokenizer.decode(generated_text[0]))
+def predict_caption(image, prompt):
+    assert isinstance(prompt, str)
+    vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
+    vision_x = torch.cat(vision_x, dim=0)
+    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
+    tokenizer.padding_side = "left" # For generation padding tokens should be on the left
+    lang_x = tokenizer(
+        ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"],
+        return_tensors="pt",
+    )
+    tokenizer.padding_side = "left" # For generation padding tokens should be on the left
+    lang_x = tokenizer(
+        ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"],
+        return_tensors="pt",
+    )
+    caption = tokenizer.decode(generated_text[0])
+    return caption
+iface = gr.Interface(fn=predict_caption,
+        inputs=[gr.Image(type="pil"), gr.Textbox(value=DEFAULT_PROMPT, label="Prompt")],
+        examples=examples,
+        outputs="text")
+iface.launch()