yusenthebot
Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
aa3fdef
| # -*- coding: utf-8 -*- | |
| """ | |
| OCR Tools - Advanced text extraction with multi-language support | |
| Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian | |
| """ | |
| import io | |
| import re | |
| from typing import Any, Dict, List, Optional | |
| import numpy as np | |
| from PIL import Image | |
| import pytesseract | |
| from deep_translator import GoogleTranslator | |
| # Try to import optional dependencies | |
| try: | |
| import cv2 | |
| HAS_CV2 = True | |
| except ImportError: | |
| HAS_CV2 = False | |
| try: | |
| from langdetect import detect | |
| HAS_LANGDETECT = True | |
| except ImportError: | |
| HAS_LANGDETECT = False | |
| try: | |
| from paddleocr import PaddleOCR | |
| HAS_PADDLEOCR = True | |
| _paddle_ocr = None | |
| except ImportError: | |
| HAS_PADDLEOCR = False | |
| _paddle_ocr = None | |
# Map langdetect-style language codes to the codes deep_translator's
# GoogleTranslator expects (it wants the zh-CN / zh-TW capitalization).
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}
# Map caller-supplied language hints (ISO codes and English names, lowercase)
# to Tesseract traineddata pack names; used to build the `lang=` argument
# passed to pytesseract.
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}
def _get_paddle_ocr():
    """Lazily initialize and cache the module-wide PaddleOCR instance.

    Returns:
        The cached PaddleOCR instance, or None when paddleocr is not
        installed or initialization failed.
    """
    global _paddle_ocr
    if HAS_PADDLEOCR and _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except TypeError:
            # PaddleOCR 3.x removed the show_log kwarg (while keeping
            # use_textline_orientation); retry without it so newer installs
            # don't permanently lose the Paddle path to a swallowed TypeError.
            try:
                _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch')
            except Exception as e:
                print(f"[OCR] PaddleOCR init failed: {e}")
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr
def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Remove pinyin romanization lines and keep only Chinese characters.

    Lines consisting purely of Latin letters (including pinyin tone-marked
    vowels) and whitespace are dropped; from the remaining lines, only the
    runs of Chinese characters are kept, joined per line.
    """
    chinese_run = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+')
    pure_pinyin = re.compile(
        r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b'
        r'\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4'
        r'\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$'
    )
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        # Skip blanks and lines that are nothing but romanization.
        if not stripped or pure_pinyin.match(stripped):
            continue
        runs = chinese_run.findall(raw_line)
        if runs:
            kept.append(''.join(runs))
    return '\n'.join(kept)
def detect_language_from_text(text: str) -> str:
    """Detect the language of *text*, with CJK scripts checked by Unicode range.

    Han/Hiragana/Katakana/Hangul characters are matched directly (statistical
    detectors are unreliable on short CJK strings); otherwise langdetect is
    used when available, falling back to 'en'.

    Returns:
        A lowercase language code such as 'zh-cn', 'ja', 'ko', or 'en'.
    """
    # Han ideographs (CJK Unified + Extension A) -> treat as Simplified Chinese.
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'
    # Hiragana or Katakana -> Japanese.
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'
    # Hangul syllables -> Korean.
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'
    if HAS_LANGDETECT:
        try:
            return detect(text)
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; langdetect raises on short/ambiguous input.
        except Exception:
            pass
    return 'en'
def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Preprocess an image array to improve OCR accuracy.

    Returns the input unchanged when OpenCV is unavailable; an unknown
    method name yields the plain grayscale image.
    """
    if not HAS_CV2:
        return img_array
    # Collapse RGB to a single channel; pass grayscale input through.
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if img_array.ndim == 3 else img_array

    def _otsu(src):
        # Otsu binarization: threshold chosen automatically from the histogram.
        _, out = cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return out

    if method == 'simple':
        return _otsu(gray)
    if method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    if method == 'clahe':
        # Local contrast enhancement before binarizing.
        return _otsu(cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray))
    if method == 'denoised':
        # Morphological opening knocks out small speckle noise.
        opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8), iterations=1)
        return _otsu(opened)
    if method == 'advanced':
        # CLAHE + non-local-means denoising + adaptive threshold.
        enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        smooth = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(smooth, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return gray
def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Use PaddleOCR for text extraction (best for Chinese).

    Args:
        image_bytes: Raw encoded image bytes.
    Returns:
        (text, confidence_percent) on success, or (None, 0) when PaddleOCR
        is unavailable, errors out, or detects no text.
    """
    paddle = _get_paddle_ocr()
    if paddle is None:
        return None, 0
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_array = np.array(img)
        # NOTE(review): `cls=True` matches the PaddleOCR 2.x ocr() signature;
        # newer releases changed this API — confirm against the pinned version.
        result = paddle.ocr(img_array, cls=True)
        if not result or len(result) == 0 or result[0] is None:
            return None, 0
        texts = []
        scores = []
        # Expected layout: result[0] is a list of [box, (text, score)] entries;
        # the defensive length/type checks skip anything shaped differently.
        for line in result[0]:
            if line and len(line) >= 2:
                text_info = line[1]
                if isinstance(text_info, tuple) and len(text_info) >= 2:
                    texts.append(text_info[0])
                    scores.append(text_info[1])
        if not texts:
            return None, 0
        full_text = '\n'.join(texts)
        # Per-line scores are averaged, then scaled to a 0-100 percentage
        # (presumably scores are 0..1 floats — verify against PaddleOCR docs).
        avg_confidence = sum(scores) / len(scores) if scores else 0
        return full_text, avg_confidence * 100
    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0
def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Run Tesseract over several preprocessing variants and keep the best.

    Args:
        image_bytes: Raw encoded image bytes.
        lang: Tesseract language pack string (e.g. 'eng+chi_sim').
    Returns:
        (text, mean_confidence, method_name); empty text with zero
        confidence when nothing was recognized by any variant.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)
    best_text = ""
    best_confidence = 0
    best_method = ""
    # Without OpenCV every preprocessing "method" falls back to the same
    # unprocessed image, so the original 4-5 identical OCR passes are wasted
    # work — run a single pass in that case.
    methods = ['simple', 'adaptive', 'clahe', 'denoised', 'advanced'] if HAS_CV2 else ['simple']
    for method in methods:
        try:
            if HAS_CV2:
                processed_img = Image.fromarray(_preprocess_image(img_array, method))
            else:
                processed_img = img
            # image_to_data supplies per-word confidences; image_to_string
            # gives the cleaner line-structured text.
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)
            # Ignore non-positive confidences (-1 marks non-word boxes).
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method
        except Exception:
            # A failing variant just drops out of the competition.
            continue
    return best_text.strip(), best_confidence, best_method
def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate it.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first
    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence, method (plus "error" when no text was detected)
    """
    best_text = ""
    best_method = ""
    best_confidence = 0
    # Tesseract pack string: a specific pack when the caller's hint maps to
    # one, otherwise every supported pack at once.
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped
    # PaddleOCR first (best for Chinese); only non-blank output is accepted.
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf
    # Tesseract fallback. best_text is always "" here (the Paddle branch only
    # stores non-blank text), so the original extra confidence comparison was
    # dead and has been dropped.
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text:
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf
    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }
    # Strip pinyin annotation lines from Chinese OCR output; if that removes
    # everything, fall back to the raw OCR text.
    filtered_text = filter_pinyin_keep_chinese(best_text)
    if not filtered_text.strip():
        filtered_text = best_text
    detected_lang = detect_language_from_text(filtered_text)
    # Best-effort translation: a translator failure yields an empty
    # translation (logged, consistent with the other [OCR] handlers) rather
    # than failing the whole OCR result.
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception as e:
        print(f"[OCR] Translation error: {e}")
        translated = ""
    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }
def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Run the advanced OCR + translation pipeline over a batch of images.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)
    Returns:
        One dict per input image, re-keyed into the legacy result schema
        (text / translation / target_lang / detected_language /
        confidence / method) for backward compatibility.
    """
    use_paddle = prefer_ocr_local and HAS_PADDLEOCR

    def _to_legacy(res: Dict[str, Any]) -> Dict[str, Any]:
        # Translate the rich per-image result into the older key names.
        return {
            "text": res.get("original_text", ""),
            "translation": res.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": res.get("detected_language", "unknown"),
            "confidence": res.get("confidence", 0),
            "method": res.get("method", "unknown"),
        }

    return [
        _to_legacy(
            ocr_single_image(
                image_bytes=blob,
                target_lang=target_lang,
                use_paddle=use_paddle,
            )
        )
        for blob in images
    ]
# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Bare-bones single-image OCR via pytesseract (legacy helper)."""
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    extracted = pytesseract.image_to_string(image)
    return extracted.strip()