import os
import random
import re
from difflib import SequenceMatcher

import cv2
import nltk
import numpy as np
import torch
import torchvision.transforms as T
from nltk.metrics.distance import edit_distance
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
|
| |
|
| |
|
# Look up the NLTK 'words' corpus and 'punkt' tokenizer data at import time,
# downloading a package only when its lookup raises LookupError.
for _resource_path, _package_name in (
    ('corpora/words.zip', 'words'),
    ('tokenizers/punkt', 'punkt'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)
|
| |
|
| | from nltk.corpus import words
|
| |
|
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``. Defaults to 42.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # Disable cuDNN benchmarking and request deterministic kernels so that
    # repeated runs with the same seed stay comparable.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
|
| |
|
def build_transform(input_size=448):
    """Build the torchvision preprocessing pipeline for a single image.

    The pipeline converts the input to RGB, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, turns it into a tensor
    and normalizes each channel.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        A ``T.Compose`` wrapping the four steps above.
    """
    # NOTE(review): these look like the usual ImageNet channel statistics --
    # confirm against the model this pipeline feeds.
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    target_size = (input_size, input_size)

    steps = [
        T.Lambda(lambda img: img.convert('RGB')),
        T.Resize(target_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=channel_mean, std=channel_std),
    ]
    return T.Compose(steps)
|
| |
|
def get_roi(image_path_or_obj, *roi):
    """Crop a region of interest, given as fractions, out of an image.

    Args:
        image_path_or_obj: Either a filesystem path (``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``) that is opened with
            PIL, or an already-loaded image object exposing ``convert``,
            ``size`` and ``crop``.
        *roi: Four numbers ``x_start, y_start, x_end, y_end``. The x values
            are multiplied by the image width and the y values by the image
            height, then truncated to integer pixel coordinates.

    Returns:
        The RGB image cropped to the computed pixel box.

    Raises:
        IndexError: If fewer than four ROI values are supplied.
    """
    if isinstance(image_path_or_obj, (str, os.PathLike)):
        # Image.open is lazy; convert() forces the pixel data to load, after
        # which the context manager can safely release the file handle.
        with Image.open(image_path_or_obj) as opened:
            image = opened.convert('RGB')
    else:
        image = image_path_or_obj.convert('RGB')

    width, height = image.size

    # Scale the fractional ROI to pixel coordinates (left, upper, right, lower).
    pixel_box = (
        int(width * roi[0]),
        int(height * roi[1]),
        int(width * roi[2]),
        int(height * roi[3]),
    )
    return image.crop(pixel_box)
|
| |
|
def clean_text(text):
    """Reduce ``text`` to its ASCII letters and digits, lowercased.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` -- whitespace,
    punctuation and non-ASCII characters included -- is removed.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    # No whitespace survives the substitution, so strip() is only a safeguard.
    return alnum_only.strip().lower()
|
| |
|
def are_strings_similar(str1, str2, max_distance=3, max_length_diff=2):
    """Decide whether two strings are close enough to count as the same.

    Args:
        str1: First string.
        str2: Second string.
        max_distance: Largest edit distance still considered similar.
        max_length_diff: Largest allowed difference in length; anything
            beyond it is rejected without computing the edit distance.

    Returns:
        True when the strings are equal, or when their lengths differ by at
        most ``max_length_diff`` and their edit distance is at most
        ``max_distance``; False otherwise.
    """
    # Short-circuiting keeps the edit-distance computation as the last resort.
    return str1 == str2 or (
        abs(len(str1) - len(str2)) <= max_length_diff
        and edit_distance(str1, str2) <= max_distance
    )
|
| |
|
def blur_image(image, strength):
    """Return a Gaussian-blurred copy of ``image`` as a PIL image.

    Args:
        image: Image convertible with ``np.array`` (presumably a PIL image;
            verify against callers).
        strength: Blur amount; it is multiplied by 50 and truncated to give
            the kernel side length.

    Returns:
        A new PIL image built from the blurred array.
    """
    pixels = np.array(image)
    # `| 1` forces the kernel side to be odd (GaussianBlur requires odd
    # sizes) and max() clamps negative results up to 1.
    kernel_side = max(1, int(strength * 50) | 1)
    blurred = cv2.GaussianBlur(pixels, (kernel_side, kernel_side), 0)
    return Image.fromarray(blurred)
|
| |
|
def is_blank(text, limit=15):
    """Report whether ``text`` is shorter than ``limit`` characters.

    Args:
        text: Value whose ``len`` is measured.
        limit: Exclusive length threshold. Defaults to 15.

    Returns:
        True when ``len(text)`` is strictly below ``limit``.
    """
    text_length = len(text)
    return text_length < limit
|
| |
|
def string_similarity(a, b):
    """Return the case-insensitive ``SequenceMatcher`` ratio of two strings.

    Both inputs are lowercased before comparison. The result is a float
    between 0.0 (nothing in common) and 1.0 (identical).
    """
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
|
| |
|
def find_similar_substring(text, keyword, threshold=0.9):
    """Check whether ``keyword`` occurs in ``text`` exactly or approximately.

    The comparison is case-insensitive. An exact substring hit returns
    immediately. Otherwise every run of consecutive words in ``text`` with
    as many words as ``keyword`` is scored against it with
    ``SequenceMatcher``.

    Args:
        text: Text to search.
        keyword: Word or phrase to look for.
        threshold: Minimum similarity ratio that counts as a match.

    Returns:
        True on an exact or sufficiently similar match, False otherwise.
    """
    haystack = text.lower()
    needle = keyword.lower()

    # Fast path: a literal substring hit needs no fuzzy scoring.
    if needle in haystack:
        return True

    tokens = haystack.split()
    window = len(needle.split())

    # Slide a window of `window` words across the text; the range is empty
    # when the text has fewer words than the keyword.
    candidates = (
        ' '.join(tokens[start:start + window])
        for start in range(len(tokens) - window + 1)
    )
    # Both sides are already lowercased, so they are scored directly.
    return any(
        SequenceMatcher(None, candidate, needle).ratio() >= threshold
        for candidate in candidates
    )
|
| |
|
def destroy_text_roi(image, *roi_params):
    """Obliterate a region of an image with heavy blur plus random noise.

    Args:
        image: Image convertible with ``np.array`` into an ``H x W`` or
            ``H x W x C`` array (presumably an 8-bit PIL image; the noise
            added below is ``uint8``).
        *roi_params: Four numbers ``x1, y1, x2, y2``. The x values are
            multiplied by the image width and the y values by the image
            height, then truncated to integer pixel coordinates.

    Returns:
        A new PIL image in which the region has been blurred and noised.
        The array is a copy made by ``np.array``, so the input is not
        written to. When the region is empty the image is returned
        unchanged.

    Raises:
        IndexError: If fewer than four ROI values are supplied.
    """
    image_np = np.array(image)

    # Only the first two axes are needed; grayscale arrays have no channel
    # axis, so unpacking three values would fail for them.
    h, w = image_np.shape[:2]
    x1 = int(roi_params[0] * w)
    y1 = int(roi_params[1] * h)
    x2 = int(roi_params[2] * w)
    y2 = int(roi_params[3] * h)

    roi = image_np[y1:y2, x1:x2]
    if roi.size == 0:
        # Nothing to destroy, and GaussianBlur rejects empty input.
        return Image.fromarray(image_np)

    blurred_roi = cv2.GaussianBlur(roi, (75, 75), 0)
    # Noise in [0, 50) shaped exactly like the ROI so that any channel count
    # (grayscale, RGB, RGBA) lines up; for RGB this is the same draw as before.
    noise = np.random.randint(0, 50, blurred_roi.shape, dtype=np.uint8)
    # cv2.add saturates at 255 instead of wrapping around.
    noisy_blurred_roi = cv2.add(blurred_roi, noise)
    image_np[y1:y2, x1:x2] = noisy_blurred_roi
    return Image.fromarray(image_np)
|
| |
|
def is_english(text):
    """Check that ``text`` consists only of the permitted characters.

    Permitted: ASCII letters, whitespace, the punctuation
    ``. , ! ? - ; : " ' ( )``, the Devanagari digits U+0966-U+096F and the
    Devanagari letter U+0930. The empty string is accepted.

    Returns:
        True when every character of ``text`` is permitted, else False.
    """
    # NOTE(review): despite the function name, ASCII digits 0-9 are rejected
    # while Devanagari digits and U+0930 are accepted -- confirm this
    # character class is intended.
    permitted = r'^[a-zA-Z०-९\u0930\s\.,!?\-;:"\'()]*$'
    return re.match(permitted, text) is not None
|
| |
|
# Lazily built cache of the NLTK word list; None until first needed.
_ENGLISH_VOCABULARY = None


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, building it only once."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = frozenset(words.words())
    return _ENGLISH_VOCABULARY


def is_valid_english(text):
    """Check that every word in ``text`` appears in the NLTK word list.

    Non-alphanumeric characters are treated as separators and the remaining
    characters are lowercased before lookup.

    Args:
        text: Text to validate.

    Returns:
        True when every token is in the vocabulary, or when ``text``
        contains no tokens at all; False otherwise.
    """
    # NOTE(review): tokens are lowercased before lookup, so any capitalised
    # entries in the corpus can never match -- confirm that is intended.
    tokens = ''.join(c.lower() if c.isalnum() else ' ' for c in text).split()
    if not tokens:
        return True

    # Building the set from the corpus is expensive, so it is cached at
    # module level instead of being rebuilt on every call.
    vocabulary = _english_vocabulary()
    return all(token in vocabulary for token in tokens)
|
| |
|