Spaces:

Emb3rw/Vietnamese-Handwriting-OCR

Running

App Files Community

Nguyễn Hoàng Ân committed on Mar 9

Commit

e2ead91

verified ·

1 Parent(s): ff229b5

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -11

app.py CHANGED Viewed

@@ -13,15 +13,17 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 def build_transform(input_size):
-    transform = T.Compose([
         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
         T.ToTensor(),
         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
     ])
-    return transform
 def load_image(image, input_size=448):
     transform = build_transform(input_size=input_size)
     pixel_values = transform(image).unsqueeze(0)  # Thêm batch dimension
     return pixel_values
@@ -36,22 +38,44 @@ model = AutoModel.from_pretrained(
 tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-1B-v3_5", trust_remote_code=True, use_fast=False)
-def process_image(image):
-    pixel_values = load_image(image).to(device)
-    generation_config = dict(max_new_tokens=1024, do_sample=False, num_beams=3, repetition_penalty=2.5)
-    question = "<image>\nTrích xuất toàn bộ thông tin trong ảnh và trả về dạng text."
-    response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
     return response
 iface = gr.Interface(
     fn=process_image,
-    inputs=gr.Image(type="pil"),
     outputs="text",
-    title="Vietnamese Hand Writing ORC",
-    description="Extract all the information from the image and return it in text form."
 )
 if __name__ == "__main__":
-    iface.launch()

 IMAGENET_STD = (0.229, 0.224, 0.225)
 def build_transform(input_size):
+    return T.Compose([
         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
         T.ToTensor(),
         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
     ])
 def load_image(image, input_size=448):
+    if image is None:
+        raise ValueError("Vui lòng tải lên một hình ảnh hợp lệ.")
     transform = build_transform(input_size=input_size)
     pixel_values = transform(image).unsqueeze(0)  # Thêm batch dimension
     return pixel_values
 tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-1B-v3_5", trust_remote_code=True, use_fast=False)
+def process_image(image, user_request):
+    try:
+        pixel_values = load_image(image).to(device)
+    except ValueError as e:
+        return str(e)
+    generation_config = {
+        "max_new_tokens": 256,  # Giảm số lượng token để tăng tốc
+        "do_sample": False,
+        "num_beams": 3,
+        "repetition_penalty": 2.0
+    }
+    # Nếu người dùng không nhập yêu cầu, dùng mặc định
+    if not user_request.strip():
+        user_request = "Trích xuất toàn bộ thông tin trong ảnh và trả về dạng text."
+    question = f"<image>\n{user_request}"
+    with torch.inference_mode():
+        try:
+            response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+        except Exception as e:
+            return f"Lỗi khi xử lý ảnh: {e}"
     return response
 iface = gr.Interface(
     fn=process_image,
+    inputs=[
+        gr.Image(type="pil"),
+        gr.Textbox(lines=2, placeholder="Nhập yêu cầu của bạn, ví dụ: 'Nhận dạng chữ viết tay và trả về dạng text'"),
+    ],
     outputs="text",
+    title="Vietnamese Handwriting OCR",
+    description="Tải ảnh lên và nhập yêu cầu của bạn để trích xuất thông tin từ ảnh.",
+    theme="dark"  # Chuyển sang giao diện tối
 )
 if __name__ == "__main__":
+    iface.launch()