Upload InternVL2 implementation

Files changed:
- Dockerfile +2 -0
- app_internvl2.py +31 -6
- requirements.txt +2 -1
Dockerfile
CHANGED

@@ -60,6 +60,8 @@ RUN pip3 install --no-cache-dir --upgrade pip && \
     pip3 install --no-cache-dir transformers==4.37.2 safetensors==0.4.1 huggingface_hub==0.19.4 && \
     # Install timm for vision models
     pip3 install --no-cache-dir timm==0.9.11 && \
+    # Install nest-asyncio for handling nested event loops
+    pip3 install --no-cache-dir nest-asyncio==1.5.8 && \
     # Install lmdeploy and its dependencies first
     pip3 install --no-cache-dir "accelerate==0.30.0" && \
     pip3 install --no-cache-dir "lmdeploy==0.5.3" && \
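For context, a minimal sketch of what the new nest-asyncio pin is for (the snippet and the name nested_call are illustrative, not part of the commit): lmdeploy drives an asyncio event loop internally, and when it is invoked from code that is already running inside a loop (for example a Gradio request handler), re-entering the loop normally raises "RuntimeError: This event loop is already running". nest_asyncio.apply() patches the loop so such nested calls can complete.

import asyncio
import nest_asyncio

nest_asyncio.apply()  # make run_until_complete() re-entrant

async def nested_call():
    loop = asyncio.get_running_loop()
    # Without nest_asyncio.apply(), asking the already-running loop to drive
    # another coroutine to completion raises RuntimeError.
    return loop.run_until_complete(asyncio.sleep(0, result="ok"))

print(asyncio.run(nested_call()))  # prints "ok"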
app_internvl2.py
CHANGED

@@ -8,6 +8,11 @@ import warnings
 import stat
 import subprocess
 import sys
+import asyncio
+import nest_asyncio
+
+# Apply nest_asyncio to allow nested event loops
+nest_asyncio.apply()

 # Set environment variables
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

@@ -143,11 +148,14 @@ def load_internvl2_model():
         # Configure for AWQ quantized model
         backend_config = TurbomindEngineConfig(model_format='awq')

-        # Create pipeline
+        # Create pipeline with non-streaming mode to avoid asyncio conflicts
         internvl2_pipeline = pipeline(
             MODEL_ID,
-            backend_config=backend_config,
-            log_level='INFO'
+            backend_config=backend_config,
+            log_level='INFO',
+            model_name_or_path=None,
+            backend_name="turbomind",
+            stream=False  # Important: disable streaming to avoid asyncio issues
         )

         print("InternVL2 model loaded successfully!")

@@ -189,11 +197,28 @@ def analyze_image(image, prompt):
             # If somehow it's already a PIL Image
             image_pil = image.convert('RGB')

-        # Run inference with the model
-        response = internvl2_pipeline((prompt, image_pil))
+        # Run inference with the model, handling event loop manually
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # If we're in a running event loop (like Gradio's),
+            # we need to use run_in_executor for blocking operations
+            print("Using threaded execution for model inference")
+            # Define a function that will run in a separate thread
+            def run_inference():
+                return internvl2_pipeline((prompt, image_pil))
+
+            # Run the inference in a thread pool executor
+            response = loop.run_in_executor(None, run_inference)
+            # Wait for the result
+            if hasattr(response, "result"):
+                response = response.result()
+        else:
+            # Standard synchronous execution
+            print("Using standard execution for model inference")
+            response = internvl2_pipeline((prompt, image_pil))

         # Get the response text
-        result = response.text
+        result = response.text if hasattr(response, "text") else str(response)

         elapsed_time = time.time() - start_time
         return result
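One note on the new inference path in analyze_image (the sketch below is an alternative, not part of the commit; run_blocking_inference and pipeline_fn are illustrative names): loop.run_in_executor(None, run_inference) returns an asyncio.Future, and calling .result() on it before the work has finished raises InvalidStateError rather than waiting. If the intent is simply to keep the blocking lmdeploy call off the event loop thread and wait for its result, a plain concurrent.futures thread pool does that directly:

from concurrent.futures import ThreadPoolExecutor

def run_blocking_inference(pipeline_fn, prompt, image_pil):
    """Run a blocking pipeline call in a worker thread and wait for it."""
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(pipeline_fn, (prompt, image_pil))
        return future.result()  # blocks until inference completes

# Possible usage inside analyze_image, reusing the objects it already has:
# response = run_blocking_inference(internvl2_pipeline, prompt, image_pil)
# result = response.text if hasattr(response, "text") else str(response)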
requirements.txt
CHANGED

@@ -16,4 +16,5 @@ packaging==23.2
 pyyaml==6.0.1
 tqdm==4.66.1
 typing-extensions==4.10.0
-timm==0.9.11
+timm==0.9.11
+nest-asyncio==1.5.8