yusenthebot
Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
aa3fdef
| # -*- coding: utf-8 -*- | |
| """ | |
| OCR Tools - Advanced text extraction with multi-language support | |
| Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian | |
| """ | |
| import io | |
| import re | |
| from typing import Any, Dict, List, Optional | |
| import numpy as np | |
| from PIL import Image | |
| import pytesseract | |
| from deep_translator import GoogleTranslator | |
| # Try to import optional dependencies | |
| try: | |
| import cv2 | |
| HAS_CV2 = True | |
| except ImportError: | |
| HAS_CV2 = False | |
| try: | |
| from langdetect import detect | |
| HAS_LANGDETECT = True | |
| except ImportError: | |
| HAS_LANGDETECT = False | |
| try: | |
| from paddleocr import PaddleOCR | |
| HAS_PADDLEOCR = True | |
| _paddle_ocr = None | |
| except ImportError: | |
| HAS_PADDLEOCR = False | |
| _paddle_ocr = None | |
# Map langdetect-style language codes to the codes deep_translator's
# GoogleTranslator expects (it wants the zh-CN / zh-TW capitalization).
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}
# Map caller-supplied language hints (ISO codes and English names, lowercase)
# to Tesseract traineddata pack names; used to build the `lang=` argument
# passed to pytesseract.
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}
def _get_paddle_ocr():
    """Lazily initialize and cache the module-wide PaddleOCR instance.

    Returns:
        The cached PaddleOCR instance, or None when paddleocr is not
        installed or initialization failed.
    """
    global _paddle_ocr
    if HAS_PADDLEOCR and _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except TypeError:
            # PaddleOCR 3.x removed the show_log kwarg (while keeping
            # use_textline_orientation); retry without it so newer installs
            # don't permanently lose the Paddle path to a swallowed TypeError.
            try:
                _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch')
            except Exception as e:
                print(f"[OCR] PaddleOCR init failed: {e}")
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr
def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Remove pinyin romanization lines and keep only Chinese characters.

    Lines consisting purely of Latin letters (including pinyin tone-marked
    vowels) and whitespace are dropped; from the remaining lines, only the
    runs of Chinese characters are kept, joined per line.
    """
    chinese_run = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+')
    pure_pinyin = re.compile(
        r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b'
        r'\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4'
        r'\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$'
    )
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        # Skip blanks and lines that are nothing but romanization.
        if not stripped or pure_pinyin.match(stripped):
            continue
        runs = chinese_run.findall(raw_line)
        if runs:
            kept.append(''.join(runs))
    return '\n'.join(kept)
def detect_language_from_text(text: str) -> str:
    """Detect the language of *text*, with CJK scripts checked by Unicode range.

    Han/Hiragana/Katakana/Hangul characters are matched directly (statistical
    detectors are unreliable on short CJK strings); otherwise langdetect is
    used when available, falling back to 'en'.

    Returns:
        A lowercase language code such as 'zh-cn', 'ja', 'ko', or 'en'.
    """
    # Han ideographs (CJK Unified + Extension A) -> treat as Simplified Chinese.
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'
    # Hiragana or Katakana -> Japanese.
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'
    # Hangul syllables -> Korean.
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'
    if HAS_LANGDETECT:
        try:
            return detect(text)
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; langdetect raises on short/ambiguous input.
        except Exception:
            pass
    return 'en'
def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Preprocess an image array to improve OCR accuracy.

    Returns the input unchanged when OpenCV is unavailable; an unknown
    method name yields the plain grayscale image.
    """
    if not HAS_CV2:
        return img_array
    # Collapse RGB to a single channel; pass grayscale input through.
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if img_array.ndim == 3 else img_array

    def _otsu(src):
        # Otsu binarization: threshold chosen automatically from the histogram.
        _, out = cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return out

    if method == 'simple':
        return _otsu(gray)
    if method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    if method == 'clahe':
        # Local contrast enhancement before binarizing.
        return _otsu(cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray))
    if method == 'denoised':
        # Morphological opening knocks out small speckle noise.
        opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8), iterations=1)
        return _otsu(opened)
    if method == 'advanced':
        # CLAHE + non-local-means denoising + adaptive threshold.
        enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        smooth = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(smooth, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return gray
def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Use PaddleOCR for text extraction (best for Chinese).

    Args:
        image_bytes: Raw encoded image bytes.
    Returns:
        (text, confidence_percent) on success, or (None, 0) when PaddleOCR
        is unavailable, errors out, or detects no text.
    """
    paddle = _get_paddle_ocr()
    if paddle is None:
        return None, 0
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_array = np.array(img)
        # NOTE(review): `cls=True` matches the PaddleOCR 2.x ocr() signature;
        # newer releases changed this API — confirm against the pinned version.
        result = paddle.ocr(img_array, cls=True)
        if not result or len(result) == 0 or result[0] is None:
            return None, 0
        texts = []
        scores = []
        # Expected layout: result[0] is a list of [box, (text, score)] entries;
        # the defensive length/type checks skip anything shaped differently.
        for line in result[0]:
            if line and len(line) >= 2:
                text_info = line[1]
                if isinstance(text_info, tuple) and len(text_info) >= 2:
                    texts.append(text_info[0])
                    scores.append(text_info[1])
        if not texts:
            return None, 0
        full_text = '\n'.join(texts)
        # Per-line scores are averaged, then scaled to a 0-100 percentage
        # (presumably scores are 0..1 floats — verify against PaddleOCR docs).
        avg_confidence = sum(scores) / len(scores) if scores else 0
        return full_text, avg_confidence * 100
    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0
def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Run Tesseract over several preprocessing variants and keep the best.

    Args:
        image_bytes: Raw encoded image bytes.
        lang: Tesseract language pack string (e.g. 'eng+chi_sim').
    Returns:
        (text, mean_confidence, method_name); empty text with zero
        confidence when nothing was recognized by any variant.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)
    best_text = ""
    best_confidence = 0
    best_method = ""
    # Without OpenCV every preprocessing "method" falls back to the same
    # unprocessed image, so the original 4-5 identical OCR passes are wasted
    # work — run a single pass in that case.
    methods = ['simple', 'adaptive', 'clahe', 'denoised', 'advanced'] if HAS_CV2 else ['simple']
    for method in methods:
        try:
            if HAS_CV2:
                processed_img = Image.fromarray(_preprocess_image(img_array, method))
            else:
                processed_img = img
            # image_to_data supplies per-word confidences; image_to_string
            # gives the cleaner line-structured text.
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)
            # Ignore non-positive confidences (-1 marks non-word boxes).
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method
        except Exception:
            # A failing variant just drops out of the competition.
            continue
    return best_text.strip(), best_confidence, best_method
def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate it.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first
    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence, method (plus "error" when no text was detected)
    """
    best_text = ""
    best_method = ""
    best_confidence = 0
    # Tesseract pack string: a specific pack when the caller's hint maps to
    # one, otherwise every supported pack at once.
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped
    # PaddleOCR first (best for Chinese); only non-blank output is accepted.
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf
    # Tesseract fallback. best_text is always "" here (the Paddle branch only
    # stores non-blank text), so the original extra confidence comparison was
    # dead and has been dropped.
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text:
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf
    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }
    # Strip pinyin annotation lines from Chinese OCR output; if that removes
    # everything, fall back to the raw OCR text.
    filtered_text = filter_pinyin_keep_chinese(best_text)
    if not filtered_text.strip():
        filtered_text = best_text
    detected_lang = detect_language_from_text(filtered_text)
    # Best-effort translation: a translator failure yields an empty
    # translation (logged, consistent with the other [OCR] handlers) rather
    # than failing the whole OCR result.
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception as e:
        print(f"[OCR] Translation error: {e}")
        translated = ""
    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }
def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Run the advanced OCR + translation pipeline over a batch of images.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)
    Returns:
        One dict per input image, re-keyed into the legacy result schema
        (text / translation / target_lang / detected_language /
        confidence / method) for backward compatibility.
    """
    use_paddle = prefer_ocr_local and HAS_PADDLEOCR

    def _to_legacy(res: Dict[str, Any]) -> Dict[str, Any]:
        # Translate the rich per-image result into the older key names.
        return {
            "text": res.get("original_text", ""),
            "translation": res.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": res.get("detected_language", "unknown"),
            "confidence": res.get("confidence", 0),
            "method": res.get("method", "unknown"),
        }

    return [
        _to_legacy(
            ocr_single_image(
                image_bytes=blob,
                target_lang=target_lang,
                use_paddle=use_paddle,
            )
        )
        for blob in images
    ]
# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Bare-bones single-image OCR via pytesseract (legacy helper)."""
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    extracted = pytesseract.image_to_string(image)
    return extracted.strip()