yusenthebot committed · Commit aa3fdef · Parent(s): 902e65e

Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
- data/cefr/cefr_words.json +50 -0
- data/cefr && cp CUsers13197OneDriveDesktopproject2languagedatahskhsk_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatahsk +50 -0
- data/hsk/hsk_words.json +75 -0
- data/jlpt/jlpt_words.json +62 -0
- data/jlpt && cp CUsers13197OneDriveDesktopproject2languagedatatopiktopik_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatatopik +62 -0
- data/topik/topik_words.json +70 -0
- requirements.txt +8 -1
- src/app/difficulty_scorer.py +290 -0
- src/app/flashcard_generator.py +288 -0
- src/app/flashcards_tools.py +153 -49
- src/app/ocr_tools.py +350 -40
- src/app/quiz_tools.py +379 -16
data/cefr/cefr_words.json
ADDED
@@ -0,0 +1,50 @@
+{
+  "name": "CEFR (Common European Framework of Reference)",
+  "description": "Proficiency levels for European languages",
+  "languages": ["en", "de", "es", "fr", "it"],
+  "source": "Sample data - Replace with complete CEFR database for production",
+  "levels": {
+    "A1": {
+      "description": "Beginner",
+      "score": 1,
+      "en": ["hello", "goodbye", "yes", "no", "please", "thank you", "water", "food", "house", "cat", "dog", "book", "table", "chair", "good", "bad", "big", "small", "one", "two"],
+      "de": ["hallo", "tschüss", "ja", "nein", "bitte", "danke", "wasser", "essen", "haus", "katze", "hund", "buch", "tisch", "stuhl", "gut", "schlecht", "groß", "klein", "eins", "zwei"],
+      "es": ["hola", "adiós", "sí", "no", "por favor", "gracias", "agua", "comida", "casa", "gato", "perro", "libro", "mesa", "silla", "bueno", "malo", "grande", "pequeño", "uno", "dos"]
+    },
+    "A2": {
+      "description": "Elementary",
+      "score": 2,
+      "en": ["restaurant", "morning", "afternoon", "week", "month", "family", "friend", "work", "school", "city", "country", "weather", "summer", "winter", "happy", "sad", "easy", "difficult", "beautiful", "expensive"],
+      "de": ["restaurant", "morgen", "nachmittag", "woche", "monat", "familie", "freund", "arbeit", "schule", "stadt", "land", "wetter", "sommer", "winter", "glücklich", "traurig", "einfach", "schwierig", "schön", "teuer"],
+      "es": ["restaurante", "mañana", "tarde", "semana", "mes", "familia", "amigo", "trabajo", "escuela", "ciudad", "país", "tiempo", "verano", "invierno", "feliz", "triste", "fácil", "difícil", "hermoso", "caro"]
+    },
+    "B1": {
+      "description": "Intermediate",
+      "score": 3,
+      "en": ["experience", "environment", "situation", "knowledge", "relationship", "government", "education", "opportunity", "responsibility", "technology", "culture", "society", "development", "economy", "necessary", "available", "significant", "traditional", "generally", "particularly"],
+      "de": ["erfahrung", "umwelt", "situation", "wissen", "beziehung", "regierung", "bildung", "gelegenheit", "verantwortung", "technologie", "kultur", "gesellschaft", "entwicklung", "wirtschaft", "notwendig", "verfügbar", "bedeutend", "traditionell", "allgemein", "besonders"],
+      "es": ["experiencia", "medio ambiente", "situación", "conocimiento", "relación", "gobierno", "educación", "oportunidad", "responsabilidad", "tecnología", "cultura", "sociedad", "desarrollo", "economía", "necesario", "disponible", "significativo", "tradicional", "generalmente", "particularmente"]
+    },
+    "B2": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "en": ["implementation", "infrastructure", "phenomenon", "component", "perspective", "theoretical", "comprehensive", "substantial", "predominantly", "furthermore", "nevertheless", "consequently", "regarding", "whereas", "thereby"],
+      "de": ["umsetzung", "infrastruktur", "phänomen", "komponente", "perspektive", "theoretisch", "umfassend", "wesentlich", "vorwiegend", "außerdem", "dennoch", "folglich", "bezüglich", "wohingegen", "dadurch"],
+      "es": ["implementación", "infraestructura", "fenómeno", "componente", "perspectiva", "teórico", "comprensivo", "sustancial", "predominantemente", "además", "sin embargo", "consecuentemente", "respecto a", "mientras que", "por lo tanto"]
+    },
+    "C1": {
+      "description": "Advanced",
+      "score": 5,
+      "en": ["aforementioned", "notwithstanding", "juxtapose", "paradigm", "methodology", "hypothesis", "empirical", "ambiguous", "intrinsic", "exacerbate", "unprecedented", "inadvertent", "inherent", "albeit", "albeit"],
+      "de": ["oben erwähnt", "ungeachtet", "gegenüberstellen", "paradigma", "methodologie", "hypothese", "empirisch", "mehrdeutig", "intrinsisch", "verschärfen", "beispiellos", "unbeabsichtigt", "inhärent", "obwohl", "obgleich"],
+      "es": ["mencionado anteriormente", "no obstante", "yuxtaponer", "paradigma", "metodología", "hipótesis", "empírico", "ambiguo", "intrínseco", "exacerbar", "sin precedentes", "inadvertido", "inherente", "aunque", "si bien"]
+    },
+    "C2": {
+      "description": "Proficient",
+      "score": 6,
+      "en": ["epistemological", "quintessential", "perspicacious", "ubiquitous", "vicissitude", "surreptitious", "obfuscate", "indefatigable", "recalcitrant", "acquiesce"],
+      "de": ["erkenntnistheoretisch", "wesentlich", "scharfsinnig", "allgegenwärtig", "wandel", "heimlich", "verschleiern", "unermüdlich", "widerspenstig", "einwilligen"],
+      "es": ["epistemológico", "quintaesencial", "perspicaz", "ubicuo", "vicisitud", "subrepticio", "ofuscar", "infatigable", "recalcitrante", "consentir"]
+    }
+  }
+}
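Each CEFR level above carries a numeric "score" plus per-language word lists; flattening that structure into a word-to-score table is what makes fast difficulty lookups possible. The sketch below is illustrative only (the helper name build_cefr_lookup is not part of this commit); the commit's own version of this logic lives in src/app/difficulty_scorer.py (_create_word_lookups).

import json
from pathlib import Path

def build_cefr_lookup(path: str, lang: str = "en") -> dict:
    """Flatten the CEFR JSON above into a {word: score} map for one language.

    Illustrative helper only; see _create_word_lookups() in
    src/app/difficulty_scorer.py for the version shipped in this commit.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    lookup = {}
    for level in data.get("levels", {}).values():
        score = level.get("score", 3)  # fall back to a mid score if missing
        for word in level.get(lang, []):
            lookup[word.lower()] = score
    return lookup

# e.g. build_cefr_lookup("data/cefr/cefr_words.json", "en")["hello"] == 1
#      build_cefr_lookup("data/cefr/cefr_words.json", "en")["paradigm"] == 5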
data/cefr && cp CUsers13197OneDriveDesktopproject2languagedatahskhsk_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatahsk
ADDED
@@ -0,0 +1,50 @@
(file content is a verbatim duplicate of data/cefr/cefr_words.json above)
data/hsk/hsk_words.json
ADDED
@@ -0,0 +1,75 @@
+{
+  "name": "HSK (Hanyu Shuiping Kaoshi)",
+  "description": "Chinese Proficiency Test",
+  "language": "zh-cn",
+  "source": "Sample data - Replace with complete HSK database (~5000 words) for production",
+  "levels": {
+    "1": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "你", "我", "他", "她", "们", "的", "是", "不", "了", "在",
+        "有", "人", "这", "中", "大", "来", "上", "国", "个", "到",
+        "说", "时", "要", "就", "出", "会", "可", "也", "你们", "我们",
+        "他们", "什么", "没有", "好", "看", "爱", "去", "想", "做", "吃",
+        "喝", "饭", "茶", "水", "书", "字", "学", "生", "先生", "小姐"
+      ]
+    },
+    "2": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "能", "过", "现在", "没关系", "太", "非常", "怎么", "怎么样", "知道",
+        "道", "学习", "认识", "高兴", "欢迎", "谢谢", "对不起", "再见", "明天", "昨天",
+        "今天", "年", "月", "日", "星期", "点", "分", "小时", "刚才",
+        "已经", "马上", "打电话", "跑步", "睡觉", "起床", "上班", "下班", "飞机", "火车",
+        "公共汽车", "出租车", "医院", "银行", "邮局", "超市", "商店", "饭馆", "学校", "公司"
+      ]
+    },
+    "3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "但是", "因为", "所以", "虽然", "如果", "或者", "而且", "然后", "才", "刚",
+        "曾经", "从来", "一直", "正在", "着", "过", "了", "地", "得",
+        "把", "被", "让", "叫", "使", "教", "告诉", "问", "回答", "说话",
+        "聊天", "讨论", "解释", "介绍", "表示", "表达", "意思", "意见", "建议", "办法",
+        "方法", "态度", "情况", "问题", "困难", "容易", "简单", "复杂", "重要", "必要"
+      ]
+    },
+    "4": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "麻婆豆腐", "番茄炒蛋", "宫保鸡丁", "鱼香肉丝", "醋溜白菜", "蛋炒饭", "辣子鸡", "酸辣土豆丝",
+        "饺子", "包子", "馒头", "面条", "米饭", "粥", "汤", "菜",
+        "总是", "经常", "有时候", "偶尔", "从不", "永远", "始终", "一直",
+        "特别", "比较", "更", "最", "极其",
+        "关于", "对于", "至于", "由于", "根据", "按照", "依照", "为了",
+        "除了", "除非", "即使", "尽管", "不管"
+      ]
+    },
+    "5": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "龟兔赛跑", "画蛇添足", "守株待兔", "刻舟求剑", "亡羊补牢", "掩耳盗铃", "狐假虎威", "井底之蛙",
+        "完全", "彻底", "绝对", "肯定", "否定", "确定", "一定", "必然",
+        "偶然", "突然", "忽然", "顿时", "立刻", "随即", "随后",
+        "继续", "持续", "连续", "陆续", "依次", "逐渐", "渐渐", "逐步",
+        "促进", "推动", "推进", "加强", "增强", "提高", "改善", "完善"
+      ]
+    },
+    "6": {
+      "description": "Proficient",
+      "score": 6,
+      "words": [
+        "跑得很远了", "宝宝睡前故事", "嘲笑", "比赛", "撒开", "腿", "一会儿",
+        "深刻", "深入", "深远", "深厚", "深切", "深度", "深层", "深化",
+        "广泛", "广大", "广阔", "宽广", "宽阔", "辽阔", "浩瀚", "无限",
+        "精确", "准确", "正确", "确切", "切实", "实际", "实在", "实质",
+        "综合", "综述", "概括", "概述", "总结", "归纳", "归结", "归类"
+      ]
+    }
+  }
+}
data/jlpt/jlpt_words.json
ADDED
@@ -0,0 +1,62 @@
+{
+  "name": "JLPT (Japanese Language Proficiency Test)",
+  "description": "Proficiency levels for Japanese language",
+  "language": "ja",
+  "source": "Sample data - Replace with complete JLPT database for production",
+  "note": "N5 is easiest, N1 is hardest",
+  "levels": {
+    "N5": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "こんにちは", "ありがとう", "すみません", "はい", "いいえ",
+        "私", "あなた", "これ", "それ", "あれ",
+        "水", "食べ物", "家", "学校", "本",
+        "大きい", "小さい", "良い", "悪い", "新しい",
+        "一", "二", "三", "四", "五"
+      ]
+    },
+    "N4": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "レストラン", "駅", "病院", "図書館", "公園",
+        "朝", "昼", "夜", "今日", "明日",
+        "友達", "先生", "学生", "会社", "仕事",
+        "便利", "簡単", "難しい", "面白い", "つまらない",
+        "食べる", "飲む", "行く", "来る", "見る"
+      ]
+    },
+    "N3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "経験", "環境", "状況", "知識", "関係",
+        "政府", "教育", "機会", "責任", "技術",
+        "文化", "社会", "発展", "経済", "必要",
+        "一般的", "特に", "確かに", "実際", "最近",
+        "考える", "思う", "感じる", "理解する", "説明する"
+      ]
+    },
+    "N2": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "実施", "基盤", "現象", "要素", "観点",
+        "理論的", "包括的", "実質的", "主に", "さらに",
+        "しかしながら", "従って", "に関して", "一方", "それによって",
+        "検討する", "分析する", "評価する", "提案する", "実現する"
+      ]
+    },
+    "N1": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "前述", "にもかかわらず", "対比する", "範例", "方法論",
+        "仮説", "経験的", "曖昧", "本質的", "悪化させる",
+        "前例のない", "無意識", "固有", "とはいえ", "けれども",
+        "体系化する", "統合する", "最適化する", "具現化する", "顕在化する"
+      ]
+    }
+  }
+}
data/jlpt && cp CUsers13197OneDriveDesktopproject2languagedatatopiktopik_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatatopik
ADDED
@@ -0,0 +1,62 @@
(file content is a verbatim duplicate of data/jlpt/jlpt_words.json above)
data/topik/topik_words.json
ADDED
@@ -0,0 +1,70 @@
+{
+  "name": "TOPIK (Test of Proficiency in Korean)",
+  "description": "Proficiency levels for Korean language",
+  "language": "ko",
+  "source": "Sample data - Replace with complete TOPIK database for production",
+  "note": "TOPIK I (1-2) and TOPIK II (3-6)",
+  "levels": {
+    "1": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "안녕하세요", "감사합니다", "죄송합니다", "네", "아니요",
+        "나", "너", "이것", "저것", "그것",
+        "물", "음식", "집", "학교", "책",
+        "크다", "작다", "좋다", "나쁘다", "새롭다",
+        "하나", "둘", "셋", "넷", "다섯"
+      ]
+    },
+    "2": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "식당", "역", "병원", "도서관", "공원",
+        "아침", "점심", "저녁", "오늘", "내일",
+        "친구", "선생님", "학생", "회사", "일",
+        "편리하다", "쉽다", "어렵다", "재미있다", "지루하다",
+        "먹다", "마시다", "가다", "오다", "보다"
+      ]
+    },
+    "3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "경험", "환경", "상황", "지식", "관계",
+        "정부", "교육", "기회", "책임", "기술",
+        "문화", "사회", "발전", "경제", "필요",
+        "일반적", "특히", "확실히", "실제로", "최근",
+        "생각하다", "느끼다", "이해하다", "설명하다", "표현하다"
+      ]
+    },
+    "4": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "실시", "기반", "현상", "요소", "관점",
+        "이론적", "포괄적", "실질적", "주로", "더욱이",
+        "그러나", "따라서", "에 관하여", "한편", "그로써",
+        "검토하다", "분석하다", "평가하다", "제안하다", "실현하다"
+      ]
+    },
+    "5": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "전술한", "에도 불구하고", "대비하다", "패러다임", "방법론",
+        "가설", "경험적", "애매한", "본질적", "악화시키다",
+        "전례없는", "무의식적", "고유한", "비록", "그럼에도",
+        "체계화하다", "통합하다", "최적화하다", "구현하다", "현현하다"
+      ]
+    },
+    "6": {
+      "description": "Proficient",
+      "score": 6,
+      "words": [
+        "인식론적", "전형적인", "예리한", "편재하는", "변천",
+        "은밀한", "난독화하다", "불굴의", "완고한", "동의하다"
+      ]
+    }
+  }
+}
requirements.txt
CHANGED
@@ -17,11 +17,13 @@ gTTS
 ########################################
 pytesseract
 pillow
+opencv-python-headless
 
 ########################################
-# Translation
+# Translation & Language Detection
 ########################################
 deep-translator
+langdetect
 
 ########################################
 # Language Modeling / Text Processing
@@ -32,6 +34,11 @@ sentencepiece
 safetensors
 regex
 
+########################################
+# AI APIs (Optional - for Quiz Generation)
+########################################
+openai
+
 ########################################
 # General Utilities
 ########################################
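The new openai entry is flagged as optional (quiz generation only), so code that uses it should degrade gracefully when the package is absent. Below is a minimal sketch of that guard, assuming the modern openai client class; it mirrors the try/except fallback pattern this commit itself uses in src/app/flashcards_tools.py, and the helper name get_quiz_client is hypothetical.

try:
    from openai import OpenAI  # optional dependency; only needed for AI quiz generation
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

def get_quiz_client(api_key: str):
    """Return an OpenAI client when the package is installed, else None."""
    if not HAS_OPENAI:
        return None
    return OpenAI(api_key=api_key)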
src/app/difficulty_scorer.py
ADDED
@@ -0,0 +1,290 @@
+# -*- coding: utf-8 -*-
+"""
+Difficulty Scorer - Multi-language Support
+
+Supports 6 languages with proficiency test databases:
+- English (en): CEFR A1-C2
+- Chinese (zh-cn): HSK 1-6
+- German (de): CEFR A1-C2
+- Spanish (es): CEFR A1-C2
+- Japanese (ja): JLPT N5-N1
+- Korean (ko): TOPIK 1-6
+"""
+
+import json
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+
+
+class DifficultyScorer:
+    """Multi-language difficulty scoring system"""
+
+    LANGUAGE_TESTS = {
+        'en': 'cefr',
+        'de': 'cefr',
+        'es': 'cefr',
+        'fr': 'cefr',
+        'it': 'cefr',
+        'zh-cn': 'hsk',
+        'zh-tw': 'hsk',
+        'ja': 'jlpt',
+        'ko': 'topik',
+        'ru': 'cefr',
+    }
+
+    JLPT_MAPPING = {
+        'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5
+    }
+
+    def __init__(self, data_dir: str = None):
+        """
+        Initialize multi-language difficulty scorer
+
+        Args:
+            data_dir: Path to data directory containing proficiency databases
+        """
+        if data_dir is None:
+            current_dir = Path(__file__).parent
+            project_root = current_dir.parent.parent
+            data_dir = project_root / "data"
+
+        self.data_dir = Path(data_dir)
+        self.databases = self._load_all_databases()
+        self.word_lookups = self._create_word_lookups()
+
+    def _load_all_databases(self) -> Dict[str, Dict]:
+        """Load all language proficiency databases"""
+        databases = {}
+
+        # Load CEFR (English, German, Spanish, etc.)
+        cefr_path = self.data_dir / "cefr" / "cefr_words.json"
+        if cefr_path.exists():
+            try:
+                with open(cefr_path, 'r', encoding='utf-8') as f:
+                    databases['cefr'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load CEFR: {e}")
+
+        # Load HSK (Chinese)
+        hsk_path = self.data_dir / "hsk" / "hsk_words.json"
+        if hsk_path.exists():
+            try:
+                with open(hsk_path, 'r', encoding='utf-8') as f:
+                    databases['hsk'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load HSK: {e}")
+
+        # Load JLPT (Japanese)
+        jlpt_path = self.data_dir / "jlpt" / "jlpt_words.json"
+        if jlpt_path.exists():
+            try:
+                with open(jlpt_path, 'r', encoding='utf-8') as f:
+                    databases['jlpt'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load JLPT: {e}")
+
+        # Load TOPIK (Korean)
+        topik_path = self.data_dir / "topik" / "topik_words.json"
+        if topik_path.exists():
+            try:
+                with open(topik_path, 'r', encoding='utf-8') as f:
+                    databases['topik'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load TOPIK: {e}")
+
+        return databases
+
+    def _create_word_lookups(self) -> Dict[str, Dict[str, int]]:
+        """Create word-to-score lookup tables for all languages"""
+        lookups = {}
+
+        # CEFR lookups
+        if 'cefr' in self.databases:
+            cefr = self.databases['cefr']
+            for lang_code in ['en', 'de', 'es', 'fr', 'it', 'ru']:
+                lookups[lang_code] = {}
+                if 'levels' in cefr:
+                    for level, data in cefr['levels'].items():
+                        score = data.get('score', 3)
+                        if lang_code in data:
+                            for word in data[lang_code]:
+                                lookups[lang_code][word.lower()] = score
+
+        # HSK lookup (Chinese)
+        if 'hsk' in self.databases:
+            lookups['zh-cn'] = {}
+            lookups['zh-tw'] = {}
+            if 'levels' in self.databases['hsk']:
+                for level, data in self.databases['hsk']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['zh-cn'][word] = score
+                        lookups['zh-tw'][word] = score
+
+        # JLPT lookup (Japanese)
+        if 'jlpt' in self.databases:
+            lookups['ja'] = {}
+            if 'levels' in self.databases['jlpt']:
+                for level, data in self.databases['jlpt']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['ja'][word] = score
+
+        # TOPIK lookup (Korean)
+        if 'topik' in self.databases:
+            lookups['ko'] = {}
+            if 'levels' in self.databases['topik']:
+                for level, data in self.databases['topik']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['ko'][word] = score
+
+        return lookups
+
+    def get_proficiency_score(self, word: str, language: str) -> float:
+        """
+        Get proficiency test score for a word
+
+        Args:
+            word: Word or phrase
+            language: Language code
+
+        Returns:
+            Score 1-6 (1=easiest, 6=hardest)
+        """
+        language = language.lower()
+
+        if language not in self.word_lookups:
+            return self._estimate_by_length(word)
+
+        lookup = self.word_lookups[language]
+        search_word = word if language in ['zh-cn', 'zh-tw', 'ja', 'ko'] else word.lower()
+
+        if search_word in lookup:
+            return float(lookup[search_word])
+
+        return self._estimate_by_length(word)
+
+    def _estimate_by_length(self, word: str) -> float:
+        """Estimate difficulty by word length (fallback)"""
+        length = len(word)
+        if length <= 3:
+            return 2.0
+        elif length <= 6:
+            return 3.5
+        elif length <= 10:
+            return 4.5
+        else:
+            return 5.5
+
+    def get_length_score(self, word: str) -> float:
+        """Score based on word length"""
+        length = len(word)
+        if length == 1:
+            return 1.0
+        elif length <= 3:
+            return 2.0
+        elif length <= 6:
+            return 3.0
+        elif length <= 10:
+            return 4.0
+        elif length <= 15:
+            return 5.0
+        else:
+            return 6.0
+
+    def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]:
+        """
+        Calculate comprehensive difficulty score
+
+        Weights:
+        - Proficiency level: 60%
+        - Word length: 40%
+        """
+        proficiency_score = self.get_proficiency_score(word, language)
+        length_score = self.get_length_score(word)
+
+        overall_score = proficiency_score * 0.6 + length_score * 0.4
+
+        if overall_score <= 2.5:
+            level = "beginner"
+        elif overall_score <= 4.5:
+            level = "intermediate"
+        else:
+            level = "advanced"
+
+        test_name = self.LANGUAGE_TESTS.get(language.lower(), 'unknown')
+
+        return {
+            "overall_score": round(overall_score, 2),
+            "level": level,
+            "factors": {
+                "proficiency_score": round(proficiency_score, 2),
+                "length": len(word),
+                "length_score": round(length_score, 2),
+                "test_system": test_name.upper()
+            }
+        }
+
+    def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]:
+        """Add difficulty score to flashcard"""
+        word = card.get('front', '')
+        language = card.get('language', 'en')
+
+        difficulty = self.calculate_difficulty(word, language)
+
+        card_with_difficulty = card.copy()
+        card_with_difficulty['difficulty'] = difficulty
+
+        return card_with_difficulty
+
+    def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Score all flashcards"""
+        return [self.score_flashcard(card) for card in flashcards]
+
+    def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Generate difficulty statistics"""
+        if not flashcards:
+            return {}
+
+        level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
+        scores = []
+        by_language = {}
+
+        for card in flashcards:
+            if 'difficulty' in card:
+                level = card['difficulty']['level']
+                level_counts[level] += 1
+                scores.append(card['difficulty']['overall_score'])
+
+                lang = card.get('language', 'unknown')
+                if lang not in by_language:
+                    by_language[lang] = {"count": 0, "scores": []}
+                by_language[lang]["count"] += 1
+                by_language[lang]["scores"].append(card['difficulty']['overall_score'])
+
+        for lang in by_language:
+            lang_scores = by_language[lang]["scores"]
+            by_language[lang]["avg_score"] = round(sum(lang_scores) / len(lang_scores), 2)
+            del by_language[lang]["scores"]
+
+        return {
+            "total_cards": len(flashcards),
+            "by_level": level_counts,
+            "by_language": by_language,
+            "average_score": round(sum(scores) / len(scores), 2) if scores else 0,
+            "min_score": round(min(scores), 2) if scores else 0,
+            "max_score": round(max(scores), 2) if scores else 0
+        }
+
+
+# Global instance (lazy initialization)
+_difficulty_scorer = None
+
+
+def get_difficulty_scorer() -> DifficultyScorer:
+    """Get or create the global DifficultyScorer instance"""
+    global _difficulty_scorer
+    if _difficulty_scorer is None:
+        _difficulty_scorer = DifficultyScorer()
+    return _difficulty_scorer
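A short usage sketch for the scorer above: the calls follow the methods defined in the file, while the sample words, the printed values, and the top-level import path (assumed from this repo's src/app layout) are illustrative.

from src.app.difficulty_scorer import get_difficulty_scorer

scorer = get_difficulty_scorer()  # lazily creates the shared instance

# Score a single word against the bundled sample databases.
result = scorer.calculate_difficulty("environment", "en")
print(result["overall_score"], result["level"])  # 3.8 intermediate with the sample CEFR data

# Attach difficulty metadata to an existing flashcard dict.
card = {"front": "经济", "back": "economy", "language": "zh-cn"}
scored = scorer.score_flashcard(card)
print(scored["difficulty"]["factors"]["test_system"])  # HSK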
src/app/flashcard_generator.py
ADDED
@@ -0,0 +1,288 @@
+# -*- coding: utf-8 -*-
+"""
+Flashcard Generator - Extracts vocabulary with context from OCR results
+Supports multi-language extraction and context sentence generation
+"""
+
+import json
+import re
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+from deep_translator import GoogleTranslator
+
+
+class FlashcardGenerator:
+    """Generate flashcards from OCR results with multi-language support"""
+
+    def __init__(self):
+        self.supported_languages = {
+            'zh-cn': 'Chinese (Simplified)',
+            'zh-tw': 'Chinese (Traditional)',
+            'ja': 'Japanese',
+            'ko': 'Korean',
+            'en': 'English',
+            'fr': 'French',
+            'de': 'German',
+            'es': 'Spanish',
+            'ru': 'Russian',
+        }
+
+        self.lang_map = {
+            'zh-cn': 'zh-CN',
+            'zh-tw': 'zh-TW',
+            'ja': 'ja',
+            'ko': 'ko',
+            'ru': 'ru',
+        }
+
+        self.translator_cache = {}
+
+        # Stop words for filtering common words
+        self.stop_words = {
+            'zh-cn': {
+                '的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
+                '都', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
+                '会', '着', '没有', '看', '好', '自己', '这', '他', '她', '它',
+                '们', '个', '吗', '呢', '吧', '啊', '哦', '嗯', '呀'
+            },
+            'en': {
+                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+                'for', 'of', 'with', 'by', 'from', 'is', 'am', 'are', 'was', 'were',
+                'be', 'been', 'being', 'this', 'that', 'these', 'those', 'i', 'you',
+                'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his', 'her', 'its'
+            },
+            'de': {
+                'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer',
+                'und', 'oder', 'aber', 'in', 'an', 'auf', 'für', 'mit', 'von',
+                'zu', 'ist', 'sind', 'war', 'waren', 'ich', 'du', 'er', 'sie', 'es'
+            },
+            'es': {
+                'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y', 'o',
+                'pero', 'en', 'a', 'de', 'con', 'por', 'para', 'es', 'son', 'era',
+                'yo', 'tú', 'él', 'ella', 'nosotros', 'vosotros', 'ellos', 'ellas'
+            },
+            'ja': {
+                'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し',
+                'れ', 'さ', 'ある', 'いる', 'も', 'する', 'から', 'な', 'こ', 'そ'
+            },
+            'ko': {
+                '은', '는', '이', '가', '을', '를', '의', '에', '에서', '로',
+                '와', '과', '도', '만', '까지', '부터', '하다', '되다', '있다', '없다'
+            },
+            'ru': {
+                'и', 'в', 'на', 'с', 'к', 'по', 'за', 'из', 'у', 'о',
+                'а', 'но', 'что', 'это', 'как', 'он', 'она', 'они', 'мы', 'вы'
+            }
+        }
+
+    def extract_chinese_text(self, text: str) -> List[str]:
+        """Extract Chinese characters/phrases"""
+        pattern = re.compile(r'[\u4e00-\u9fff]+')
+        return pattern.findall(text)
+
+    def extract_japanese_text(self, text: str) -> List[str]:
+        """Extract Japanese text (kanji + hiragana + katakana)"""
+        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]+')
+        return pattern.findall(text)
+
+    def extract_korean_text(self, text: str) -> List[str]:
+        """Extract Korean words"""
+        pattern = re.compile(r'[\uAC00-\uD7AF]+')
+        return pattern.findall(text)
+
+    def extract_european_words(self, text: str) -> List[str]:
+        """Extract words from European languages"""
+        pattern = re.compile(r'[a-zA-ZäöüßÄÖÜáéíóúñÁÉÍÓÚÑàèìòùÀÈÌÒÙ\u0400-\u04FF]+')
+        return pattern.findall(text)
+
+    def filter_by_length(self, items: List[str], min_len: int = 2, max_len: int = 15) -> List[str]:
+        """Filter items by character length"""
+        return [item for item in items if min_len <= len(item) <= max_len]
+
+    def filter_stop_words(self, items: List[str], language: str) -> List[str]:
+        """Remove common stop words"""
+        stop_words = self.stop_words.get(language, set())
+        if language in ['en', 'de', 'es', 'ru']:
+            return [item for item in items if item.lower() not in stop_words]
+        return [item for item in items if item not in stop_words]
+
+    def extract_vocabulary_by_language(self, text: str, language: str) -> List[str]:
+        """Extract vocabulary based on language type"""
+        if language in ['zh-cn', 'zh-tw']:
+            return self.extract_chinese_text(text)
+        elif language == 'ja':
+            return self.extract_japanese_text(text)
+        elif language == 'ko':
+            return self.extract_korean_text(text)
+        else:
+            return self.extract_european_words(text)
+
+    def get_sentence_delimiter(self, language: str) -> str:
+        """Get sentence delimiter pattern for a language"""
+        return r'[。！？.!?\n]+'
+
+    def extract_context_sentence(self, word: str, text: str, language: str = 'zh-cn') -> str:
+        """Extract context around the word"""
+        delimiter = self.get_sentence_delimiter(language)
+        sentences = re.split(delimiter, text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if not sentences:
+            return ""
+
+        # Find sentence containing the word
+        word_sentence_idx = -1
+        for idx, sentence in enumerate(sentences):
+            if word in sentence:
+                word_sentence_idx = idx
+                break
+
+        if word_sentence_idx == -1:
+            return ""
+
+        word_sentence = sentences[word_sentence_idx]
+        is_same_as_sentence = (word_sentence == word or word_sentence.replace(' ', '') == word.replace(' ', ''))
+        is_title = (is_same_as_sentence and (word_sentence_idx <= 3 or word_sentence_idx < len(sentences) - 1))
+
+        context_sentences = []
+
+        if is_title:
+            context_sentences.append(word_sentence)
+            for i in range(word_sentence_idx + 1, min(word_sentence_idx + 3, len(sentences))):
+                next_sentence = sentences[i]
+                if len(next_sentence) > 3:
+                    context_sentences.append(next_sentence)
+                    break
+        else:
+            if word_sentence_idx > 0:
+                prev_sentence = sentences[word_sentence_idx - 1]
+                if len(prev_sentence) > 5:
+                    context_sentences.append(prev_sentence)
+
+            context_sentences.append(word_sentence)
+
+            if word_sentence_idx < len(sentences) - 1:
+                next_sentence = sentences[word_sentence_idx + 1]
+                if len(next_sentence) > 5:
+                    context_sentences.append(next_sentence)
+
+        if language in ['zh-cn', 'zh-tw', 'ja']:
+            context = ''.join(context_sentences)
+        else:
+            context = ' '.join(context_sentences)
+
+        if len(context) > 150:
+            context = context[:150] + '...'
+
+        return context
+
+    def translate_to_target(self, text: str, source_lang: str, target_lang: str = 'en') -> str:
+        """Translate text to target language"""
+        cache_key = f"{source_lang}:{target_lang}:{text}"
+        if cache_key in self.translator_cache:
+            return self.translator_cache[cache_key]
+
+        try:
+            source = self.lang_map.get(source_lang, source_lang)
+            target = self.lang_map.get(target_lang, target_lang)
+
+            translator = GoogleTranslator(source=source, target=target)
+            translation = translator.translate(text)
+
+            self.translator_cache[cache_key] = translation
+            return translation
+        except Exception as e:
+            return f"[Translation failed: {text}]"
+
+    def extract_learnable_items(self, ocr_result: Dict[str, Any], target_lang: str = 'en') -> List[Dict[str, Any]]:
+        """Extract vocabulary items from OCR result"""
+        original_text = ocr_result.get('original_text', '') or ocr_result.get('text', '')
+        language = ocr_result.get('detected_language', 'unknown')
+        filename = ocr_result.get('filename', '')
+
+        if not original_text or language == 'unknown':
+            return []
+
+        language = language.lower()
+
+        # Extract vocabulary
+        vocabulary_items = self.extract_vocabulary_by_language(original_text, language)
+
+        if not vocabulary_items:
+            return []
+
+        # Determine length constraints
+        if language in ['zh-cn', 'zh-tw', 'ja']:
+            min_len, max_len = 2, 6
+        elif language == 'ko':
+            min_len, max_len = 2, 10
+        else:
+            min_len, max_len = 3, 15
+
+        filtered_items = self.filter_by_length(vocabulary_items, min_len=min_len, max_len=max_len)
+        filtered_items = self.filter_stop_words(filtered_items, language)
+
+        # Remove duplicates
+        unique_items = list(dict.fromkeys(filtered_items))[:10]
+
+        if not unique_items:
+            return []
+
+        items = []
+        for idx, item in enumerate(unique_items):
+            # Get translation
+            if language == target_lang:
+                translation = item
+            else:
+                translation = self.translate_to_target(item, language, target_lang)
+
+            # Skip if translation is same as original
+            if translation.strip().lower() == item.strip().lower():
+                continue
+
+            # Extract context
+            context = self.extract_context_sentence(item, original_text, language)
+            context_translated = ""
+            if context and language != target_lang:
+                context_translated = self.translate_to_target(context, language, target_lang)
+
+            items.append({
+                'id': idx + 1,
+                'front': item,
+                'back': translation,
+                'context': context,
+                'context_en': context_translated,
+                'language': language,
+                'content_type': 'ocr_vocab',
+                'source_file': filename,
+            })
+
+        return items
+
+    def generate_flashcards(self, ocr_results: List[Dict[str, Any]], target_lang: str = 'en') -> Dict[str, Any]:
+        """Generate flashcards from OCR results"""
+        all_cards = []
+
+        for result in ocr_results:
+            learnable_items = self.extract_learnable_items(result, target_lang)
+            all_cards.extend(learnable_items)
+
+        return {
+            'total_cards': len(all_cards),
+            'cards': all_cards,
+            'metadata': {
+                'generator': 'FlashcardGenerator v2.0',
+                'method': 'context-extraction',
+            }
+        }
+
+    def save_flashcards(self, flashcards: Dict[str, Any], output_path: str):
+        """Save flashcards to JSON file"""
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(flashcards, f, ensure_ascii=False, indent=2)
+
+    def load_ocr_results(self, input_path: str) -> List[Dict[str, Any]]:
+        """Load OCR results from JSON file"""
+        with open(input_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
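A usage sketch for the generator above. The OCR-result dict uses the keys the module reads (original_text, detected_language, filename); the sample sentence and the import path (assumed from this repo's src/app layout) are illustrative, and translation runs at call time through deep-translator's GoogleTranslator, so network access is required.

from src.app.flashcard_generator import FlashcardGenerator

generator = FlashcardGenerator()

# Minimal OCR-result shape consumed by extract_learnable_items().
ocr_results = [{
    "original_text": "今天的天气很好。我们去公园散步。",
    "detected_language": "zh-cn",
    "filename": "notebook_page1.jpg",
}]

deck = generator.generate_flashcards(ocr_results, target_lang="en")
print(deck["total_cards"])
for card in deck["cards"]:
    print(card["front"], "->", card["back"], "|", card["context"])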
src/app/flashcards_tools.py
CHANGED
|
@@ -1,20 +1,33 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import json
|
| 5 |
import re
|
| 6 |
from pathlib import Path
|
| 7 |
-
from typing import Dict, List,
|
| 8 |
|
| 9 |
from deep_translator import GoogleTranslator
|
| 10 |
|
| 11 |
from .config import get_user_dir
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def _get_decks_dir(username: str) -> Path:
|
| 15 |
-
"""
|
| 16 |
-
Returns the directory where all of a user's decks are stored.
|
| 17 |
-
"""
|
| 18 |
user_dir = get_user_dir(username)
|
| 19 |
decks_dir = user_dir / "decks"
|
| 20 |
decks_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -22,11 +35,7 @@ def _get_decks_dir(username: str) -> Path:
|
|
| 22 |
|
| 23 |
|
| 24 |
def list_user_decks(username: str) -> Dict[str, Path]:
|
| 25 |
-
"""
|
| 26 |
-
Returns a mapping of deck name -> deck json path.
|
| 27 |
-
Deck name is taken from the deck's "name" field if present,
|
| 28 |
-
otherwise the filename stem.
|
| 29 |
-
"""
|
| 30 |
decks_dir = _get_decks_dir(username)
|
| 31 |
deck_files = sorted(decks_dir.glob("*.json"))
|
| 32 |
decks: Dict[str, Path] = {}
|
|
@@ -38,7 +47,6 @@ def list_user_decks(username: str) -> Dict[str, Path]:
|
|
| 38 |
except Exception:
|
| 39 |
name = path.stem
|
| 40 |
|
| 41 |
-
# ensure uniqueness by appending stem if needed
|
| 42 |
if name in decks and decks[name] != path:
|
| 43 |
name = f"{name} ({path.stem})"
|
| 44 |
decks[name] = path
|
|
@@ -47,24 +55,31 @@ def list_user_decks(username: str) -> Dict[str, Path]:
|
|
| 47 |
|
| 48 |
|
| 49 |
def _ensure_card_stats(card: Dict) -> None:
|
| 50 |
-
"""
|
| 51 |
-
|
| 52 |
-
"""
|
| 53 |
-
if "score" not in card: # learning strength
|
| 54 |
card["score"] = 0
|
| 55 |
if "reviews" not in card:
|
| 56 |
card["reviews"] = 0
|
| 57 |
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def load_deck(path: Path) -> Dict:
|
| 60 |
-
"""
|
| 61 |
-
Loads a deck from JSON, ensuring 'cards' exists and that
|
| 62 |
-
each card has basic stats for spaced repetition.
|
| 63 |
-
"""
|
| 64 |
try:
|
| 65 |
data = json.loads(path.read_text(encoding="utf-8"))
|
| 66 |
except Exception:
|
| 67 |
data = {}
|
|
|
|
| 68 |
if "cards" not in data or not isinstance(data["cards"], list):
|
| 69 |
data["cards"] = []
|
| 70 |
if "name" not in data:
|
|
@@ -79,9 +94,7 @@ def load_deck(path: Path) -> Dict:
|
|
| 79 |
|
| 80 |
|
| 81 |
def save_deck(path: Path, deck: Dict) -> None:
|
| 82 |
-
"""
|
| 83 |
-
Saves deck to JSON.
|
| 84 |
-
"""
|
| 85 |
if "cards" not in deck:
|
| 86 |
deck["cards"] = []
|
| 87 |
if "name" not in deck:
|
|
@@ -89,21 +102,14 @@ def save_deck(path: Path, deck: Dict) -> None:
|
|
| 89 |
if "tags" not in deck or not isinstance(deck["tags"], list):
|
| 90 |
deck["tags"] = []
|
| 91 |
|
| 92 |
-
# make sure stats are present
|
| 93 |
for card in deck["cards"]:
|
| 94 |
_ensure_card_stats(card)
|
| 95 |
|
| 96 |
path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")
|
| 97 |
|
| 98 |
|
| 99 |
-
# ------------------------------------------------------------
|
| 100 |
-
# Shared tokenization
|
| 101 |
-
# ------------------------------------------------------------
|
| 102 |
-
|
| 103 |
def _extract_candidate_words(text: str) -> List[str]:
|
| 104 |
-
"""
|
| 105 |
-
Simple tokenizer & filter for candidate vocab words.
|
| 106 |
-
"""
|
| 107 |
tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
|
| 108 |
out = []
|
| 109 |
seen = set()
|
|
@@ -121,28 +127,63 @@ def _extract_candidate_words(text: str) -> List[str]:
|
|
| 121 |
return out
|
| 122 |
|
| 123 |
|
| 124 |
-
# ------------------------------------------------------------
|
| 125 |
-
# OCR → Flashcards
|
| 126 |
-
# ------------------------------------------------------------
|
| 127 |
-
|
| 128 |
def generate_flashcards_from_ocr_results(
|
| 129 |
username: str,
|
| 130 |
ocr_results: List[Dict],
|
| 131 |
deck_name: str = "ocr",
|
| 132 |
target_lang: str = "en",
|
| 133 |
tags: Optional[List[str]] = None,
|
|
|
|
| 134 |
) -> Path:
|
| 135 |
"""
|
| 136 |
-
Takes OCR results
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
all_text = []
|
| 144 |
for res in ocr_results:
|
| 145 |
-
t = res.get("text") or res.get("raw_text") or ""
|
| 146 |
if t:
|
| 147 |
all_text.append(t)
|
| 148 |
joined = "\n".join(all_text)
|
|
@@ -153,7 +194,7 @@ def generate_flashcards_from_ocr_results(
|
|
| 153 |
|
| 154 |
translator = GoogleTranslator(source="auto", target=target_lang)
|
| 155 |
cards = []
|
| 156 |
-
for w in words:
|
| 157 |
try:
|
| 158 |
trans = translator.translate(w)
|
| 159 |
except Exception:
|
|
@@ -162,12 +203,14 @@ def generate_flashcards_from_ocr_results(
|
|
| 162 |
continue
|
| 163 |
if trans.strip().lower() == w.strip().lower():
|
| 164 |
continue
|
|
|
|
| 165 |
card = {
|
| 166 |
"front": w,
|
| 167 |
"back": trans,
|
| 168 |
"content_type": "ocr_vocab",
|
| 169 |
"language": target_lang,
|
| 170 |
}
|
|
|
|
| 171 |
_ensure_card_stats(card)
|
| 172 |
cards.append(card)
|
| 173 |
|
|
@@ -186,27 +229,73 @@ def generate_flashcards_from_ocr_results(
|
|
| 186 |
return deck_path
|
| 187 |
|
| 188 |
|
| 189 |
-
# ------------------------------------------------------------
|
| 190 |
-
# Conversation/Text → Flashcards
|
| 191 |
-
# ------------------------------------------------------------
|
| 192 |
-
|
| 193 |
def generate_flashcards_from_text(
|
| 194 |
username: str,
|
| 195 |
text: str,
|
| 196 |
deck_name: str = "conversation",
|
| 197 |
target_lang: str = "en",
|
| 198 |
tags: Optional[List[str]] = None,
|
|
|
|
| 199 |
) -> Path:
|
| 200 |
"""
|
| 201 |
-
Build a vocab deck from raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
words = _extract_candidate_words(text)
|
| 204 |
if not words:
|
| 205 |
raise ValueError("No candidate words found in text.")
|
| 206 |
|
| 207 |
translator = GoogleTranslator(source="auto", target=target_lang)
|
| 208 |
cards = []
|
| 209 |
-
for w in words:
|
| 210 |
try:
|
| 211 |
trans = translator.translate(w)
|
| 212 |
except Exception:
|
|
@@ -215,12 +304,14 @@ def generate_flashcards_from_text(
|
|
| 215 |
continue
|
| 216 |
if trans.strip().lower() == w.strip().lower():
|
| 217 |
continue
|
|
|
|
| 218 |
card = {
|
| 219 |
"front": w,
|
| 220 |
"back": trans,
|
| 221 |
"content_type": "conversation_vocab",
|
| 222 |
"language": target_lang,
|
| 223 |
}
|
|
|
|
| 224 |
_ensure_card_stats(card)
|
| 225 |
cards.append(card)
|
| 226 |
|
|
@@ -239,3 +330,16 @@ def generate_flashcards_from_text(
|
|
| 239 |
return deck_path
|
| 240 |
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
"""
Flashcards Tools - Enhanced with FlashcardGenerator and DifficultyScorer
"""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Any

from deep_translator import GoogleTranslator

from .config import get_user_dir

# Import advanced generators (with fallback)
try:
    from .flashcard_generator import FlashcardGenerator
    HAS_FLASHCARD_GENERATOR = True
except ImportError:
    HAS_FLASHCARD_GENERATOR = False

try:
    from .difficulty_scorer import get_difficulty_scorer
    HAS_DIFFICULTY_SCORER = True
except ImportError:
    HAS_DIFFICULTY_SCORER = False


def _get_decks_dir(username: str) -> Path:
    """Returns the directory where all of a user's decks are stored."""
    user_dir = get_user_dir(username)
    decks_dir = user_dir / "decks"
    decks_dir.mkdir(parents=True, exist_ok=True)
    # ...


def list_user_decks(username: str) -> Dict[str, Path]:
    """Returns a mapping of deck name -> deck json path."""
    decks_dir = _get_decks_dir(username)
    deck_files = sorted(decks_dir.glob("*.json"))
    decks: Dict[str, Path] = {}
    # ...
        except Exception:
            name = path.stem

        if name in decks and decks[name] != path:
            name = f"{name} ({path.stem})"
        decks[name] = path
    # ...


def _ensure_card_stats(card: Dict) -> None:
    """Ensure that a card has simple spaced-repetition stats."""
    if "score" not in card:
        card["score"] = 0
    if "reviews" not in card:
        card["reviews"] = 0


def _add_difficulty_to_card(card: Dict) -> Dict:
    """Add difficulty scoring to a card if DifficultyScorer is available."""
    if HAS_DIFFICULTY_SCORER:
        try:
            scorer = get_difficulty_scorer()
            return scorer.score_flashcard(card)
        except Exception:
            pass
    return card


def load_deck(path: Path) -> Dict:
    """Loads a deck from JSON with stats for spaced repetition."""
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        data = {}

    if "cards" not in data or not isinstance(data["cards"], list):
        data["cards"] = []
    if "name" not in data:
        # ...


def save_deck(path: Path, deck: Dict) -> None:
    """Saves deck to JSON."""
    if "cards" not in deck:
        deck["cards"] = []
    if "name" not in deck:
        # ...
    if "tags" not in deck or not isinstance(deck["tags"], list):
        deck["tags"] = []

    for card in deck["cards"]:
        _ensure_card_stats(card)

    path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")


def _extract_candidate_words(text: str) -> List[str]:
    """Simple tokenizer & filter for candidate vocab words."""
    tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
    out = []
    seen = set()
    # ...
    return out


def generate_flashcards_from_ocr_results(
    username: str,
    ocr_results: List[Dict],
    deck_name: str = "ocr",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
    use_advanced_generator: bool = True,
) -> Path:
    """
    Takes OCR results and constructs a vocab deck.

    Args:
        username: User identifier
        ocr_results: List of OCR result dicts with 'text' key
        deck_name: Name for the deck
        target_lang: Target language for translations
        tags: Optional tags for the deck
        use_advanced_generator: Whether to use FlashcardGenerator

    Returns:
        Path to the saved deck
    """
    # Try advanced generator first
    if use_advanced_generator and HAS_FLASHCARD_GENERATOR:
        try:
            generator = FlashcardGenerator()
            flashcard_data = generator.generate_flashcards(ocr_results, target_lang)
            cards = flashcard_data.get('cards', [])

            if cards:
                # Add difficulty scores
                if HAS_DIFFICULTY_SCORER:
                    scorer = get_difficulty_scorer()
                    cards = scorer.score_all_flashcards(cards)

                # Ensure stats
                for card in cards:
                    _ensure_card_stats(card)

                decks_dir = _get_decks_dir(username)
                deck_path = decks_dir / f"{deck_name}.json"

                deck = {
                    "name": deck_name,
                    "cards": cards,
                    "tags": tags or ["ocr"],
                    "metadata": flashcard_data.get('metadata', {})
                }
                save_deck(deck_path, deck)
                return deck_path
        except Exception as e:
            print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")

    # Fallback to simple extraction
    all_text = []
    for res in ocr_results:
        t = res.get("text") or res.get("raw_text") or res.get("original_text") or ""
        if t:
            all_text.append(t)
    joined = "\n".join(all_text)
    # ...

    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words[:20]:  # Limit to 20 words
        try:
            trans = translator.translate(w)
        except Exception:
            # ...
            continue
        if trans.strip().lower() == w.strip().lower():
            continue

        card = {
            "front": w,
            "back": trans,
            "content_type": "ocr_vocab",
            "language": target_lang,
        }
        card = _add_difficulty_to_card(card)
        _ensure_card_stats(card)
        cards.append(card)

    # ...
    return deck_path


def generate_flashcards_from_text(
    username: str,
    text: str,
    deck_name: str = "conversation",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
    source_lang: Optional[str] = None,
) -> Path:
    """
    Build a vocab deck from raw text.

    Args:
        username: User identifier
        text: Raw text to extract vocabulary from
        deck_name: Name for the deck
        target_lang: Target language for translations
        tags: Optional tags for the deck
        source_lang: Source language (auto-detect if None)

    Returns:
        Path to the saved deck
    """
    # Try advanced generator first
    if HAS_FLASHCARD_GENERATOR:
        try:
            generator = FlashcardGenerator()

            # Create fake OCR result
            ocr_result = {
                'original_text': text,
                'text': text,
                'detected_language': source_lang or 'auto',
            }

            flashcard_data = generator.generate_flashcards([ocr_result], target_lang)
            cards = flashcard_data.get('cards', [])

            if cards:
                if HAS_DIFFICULTY_SCORER:
                    scorer = get_difficulty_scorer()
                    cards = scorer.score_all_flashcards(cards)

                for card in cards:
                    card['content_type'] = 'conversation_vocab'
                    _ensure_card_stats(card)

                decks_dir = _get_decks_dir(username)
                deck_path = decks_dir / f"{deck_name}.json"

                deck = {
                    "name": deck_name,
                    "cards": cards,
                    "tags": tags or ["conversation"],
                }
                save_deck(deck_path, deck)
                return deck_path
        except Exception as e:
            print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")

    # Fallback
    words = _extract_candidate_words(text)
    if not words:
        raise ValueError("No candidate words found in text.")

    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words[:20]:
        try:
            trans = translator.translate(w)
        except Exception:
            # ...
            continue
        if trans.strip().lower() == w.strip().lower():
            continue

        card = {
            "front": w,
            "back": trans,
            "content_type": "conversation_vocab",
            "language": target_lang,
        }
        card = _add_difficulty_to_card(card)
        _ensure_card_stats(card)
        cards.append(card)

    # ...
    return deck_path


def add_difficulty_to_deck(deck: Dict) -> Dict:
    """Add difficulty scores to all cards in a deck."""
    if not HAS_DIFFICULTY_SCORER:
        return deck

    try:
        scorer = get_difficulty_scorer()
        deck["cards"] = scorer.score_all_flashcards(deck.get("cards", []))
        deck["statistics"] = scorer.get_statistics(deck["cards"])
    except Exception as e:
        print(f"[flashcards_tools] Difficulty scoring failed: {e}")

    return deck
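Usage sketch for the new flashcards API (editor's illustration, not part of the commit; "demo_user" and the sample sentence are placeholders, the repo root is assumed to be on PYTHONPATH, and deep-translator needs network access):

# Hypothetical example: build a small deck from raw text, then inspect it.
from src.app.flashcards_tools import generate_flashcards_from_text, load_deck

deck_path = generate_flashcards_from_text(
    username="demo_user",                      # placeholder user name
    text="Der Hund läuft schnell durch den Park.",
    deck_name="conversation",
    target_lang="en",
)

deck = load_deck(deck_path)
print(deck["name"], "-", len(deck["cards"]), "cards")
for card in deck["cards"][:3]:
    # "score" and "reviews" are guaranteed by _ensure_card_stats
    print(card["front"], "->", card["back"], "| score:", card["score"])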
src/app/ocr_tools.py
CHANGED
@@ -1,22 +1,331 @@
import io
from typing import Any, Dict, List, Optional

from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator
-from src.app.config import get_user_dir  # keep this if you use it

    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")


def ocr_and_translate_batch(

@@ -25,40 +334,41 @@ def ocr_and_translate_batch(
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
-    Runs OCR on a batch of images
    """
-    results: List[Dict] = []
    for img_bytes in images:
-            {
-                "text": "",
-                "translation": "",
-                "target_lang": target_lang,
-            }
-        )
    return results
# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
"""

import io
import re
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator

# Try to import optional dependencies
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from langdetect import detect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    from paddleocr import PaddleOCR
    HAS_PADDLEOCR = True
    _paddle_ocr = None
except ImportError:
    HAS_PADDLEOCR = False
    _paddle_ocr = None


# Language code mapping
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}

# Tesseract language codes for each supported language
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}


def _get_paddle_ocr():
    """Lazily initialize PaddleOCR"""
    global _paddle_ocr
    if HAS_PADDLEOCR and _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr


def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Filter out pinyin and keep only Chinese characters.
    Preserves complete sentences with Chinese characters.
    """
    lines = text.split('\n')
    filtered_lines = []

    for line in lines:
        line_stripped = line.strip()
        if not line_stripped:
            continue

        # Check if line contains Chinese characters
        has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', line))

        # Check if line is pure pinyin
        is_pinyin = bool(re.match(r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$', line_stripped))

        if is_pinyin:
            continue

        if has_chinese:
            chinese_parts = re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf]+', line)
            if chinese_parts:
                filtered_lines.append(''.join(chinese_parts))

    return '\n'.join(filtered_lines)


def detect_language_from_text(text: str) -> str:
    """Detect language, with special handling for Chinese characters"""
    has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text))
    if has_chinese:
        return 'zh-cn'

    has_japanese = bool(re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text))
    if has_japanese:
        return 'ja'

    has_korean = bool(re.search(r'[\uac00-\ud7af]', text))
    if has_korean:
        return 'ko'

    if HAS_LANGDETECT:
        try:
            return detect(text)
        except:
            pass

    return 'en'


def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Apply image preprocessing for better OCR accuracy"""
    if not HAS_CV2:
        return img_array

    # Convert to grayscale if needed
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    if method == 'simple':
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    elif method == 'clahe':
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'denoised':
        kernel = np.ones((2, 2), np.uint8)
        denoised = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel, iterations=1)
        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'advanced':
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    else:
        return gray


def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Use PaddleOCR for text extraction (best for Chinese)"""
    paddle = _get_paddle_ocr()
    if paddle is None:
        return None, 0

    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_array = np.array(img)

        result = paddle.ocr(img_array, cls=True)

        if not result or len(result) == 0 or result[0] is None:
            return None, 0

        texts = []
        scores = []
        for line in result[0]:
            if line and len(line) >= 2:
                text_info = line[1]
                if isinstance(text_info, tuple) and len(text_info) >= 2:
                    texts.append(text_info[0])
                    scores.append(text_info[1])

        if not texts:
            return None, 0

        full_text = '\n'.join(texts)
        avg_confidence = sum(scores) / len(scores) if scores else 0

        return full_text, avg_confidence * 100

    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0


def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Use Tesseract with multiple preprocessing methods"""
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)

    best_text = ""
    best_confidence = 0
    best_method = ""

    # Try different preprocessing methods
    methods = ['simple', 'adaptive', 'clahe', 'denoised']
    if HAS_CV2:
        methods.append('advanced')

    for method in methods:
        try:
            if HAS_CV2:
                processed = _preprocess_image(img_array, method)
                processed_img = Image.fromarray(processed)
            else:
                processed_img = img

            # Get OCR data with confidence
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)

            # Calculate average confidence
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0

            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method

        except Exception as e:
            continue

    return best_text.strip(), best_confidence, best_method


def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language, confidence, method
    """
    best_text = ""
    best_method = ""
    best_confidence = 0

    # Determine Tesseract language string
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped

    # Try PaddleOCR first (best for Chinese)
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf

    # Try Tesseract (fallback or if PaddleOCR failed)
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text and (tess_conf > best_confidence or not best_text):
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf

    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # Filter pinyin for Chinese text
    filtered_text = filter_pinyin_keep_chinese(best_text)
    if not filtered_text.strip():
        filtered_text = best_text

    # Detect language
    detected_lang = detect_language_from_text(filtered_text)

    # Translate
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception as e:
        translated = ""

    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }


def ocr_and_translate_batch(
    # ...
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Runs OCR on a batch of images with advanced processing.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)

    Returns:
        List of dicts with OCR results
    """
    results = []

    for img_bytes in images:
        result = ocr_single_image(
            image_bytes=img_bytes,
            target_lang=target_lang,
            use_paddle=prefer_ocr_local and HAS_PADDLEOCR
        )

        # Convert to expected format for backward compatibility
        results.append({
            "text": result.get("original_text", ""),
            "translation": result.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": result.get("detected_language", "unknown"),
            "confidence": result.get("confidence", 0),
            "method": result.get("method", "unknown"),
        })

    return results


# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Simple OCR using pytesseract (backward compatibility)"""
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    text = pytesseract.image_to_string(img)
    return text.strip()
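Usage sketch for the new OCR pipeline (editor's illustration, not part of the commit). The first parameters of ocr_and_translate_batch are elided in the diff; the call below assumes they are the images list and target_lang as described in the docstring, and the image file names are placeholders:

# Hypothetical example: run batch OCR + translation on local image files.
from pathlib import Path
from src.app.ocr_tools import ocr_and_translate_batch

images = [Path(name).read_bytes() for name in ["menu.jpg", "street_sign.png"]]
results = ocr_and_translate_batch(images, target_lang="en", prefer_ocr_local=True)

for res in results:
    # Backward-compatible keys plus the new metadata fields
    print(res["detected_language"], res["method"], res["confidence"])
    print(res["text"])
    print(res["translation"])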
src/app/quiz_tools.py
CHANGED
@@ -1,17 +1,329 @@
import json
import random
from datetime import datetime
from .config import get_user_dir
-from .flashcards_tools import

    reading_passages = [
        f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
        f"Here is a short story based on the topic '{topic}'.",

@@ -25,28 +337,25 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int

        if q_type == "translate_phrase":
            questions.append({
                "type": "semantic_translate_phrase",
-                "prompt": f"Translate
-                '{passage}'",
                "answer": "(model evaluated)",
                "explanation": f"Checks ability to translate topic '{topic}'."
            })
        elif q_type == "summarize":
            questions.append({
                "type": "semantic_summarize",
-                "prompt": f"Summarize
-                {passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks comprehension of topic '{topic}'."
            })
        elif q_type == "interpret":
            questions.append({
                "type": "semantic_interpret",
-                "prompt": f"Interpret meaning
-                {passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks conceptual understanding of '{topic}'."
            })

@@ -58,5 +367,59 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int
        "id": quiz_id,
        "created_at": ts,
        "topic": topic,
        "questions": questions,
    }
# -*- coding: utf-8 -*-
"""
Quiz Tools - AI-Powered Quiz Generation from Flashcards
Supports multiple question types and uses OpenAI API for intelligent quiz creation
"""

import json
import os
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

from .config import get_user_dir
from .flashcards_tools import load_deck, list_user_decks

# Try to import OpenAI
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False


class QuizGenerator:
    """Generate intelligent quizzes using OpenAI API"""

    QUESTION_TYPES = [
        'multiple_choice',
        'fill_in_blank',
        'true_false',
        'matching',
        'short_answer'
    ]

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
        """
        Initialize the quiz generator

        Args:
            api_key: OpenAI API key (uses env var if not provided)
            model: Model to use for quiz generation
        """
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.model = model
        self.client = None

        if HAS_OPENAI and self.api_key:
            try:
                self.client = OpenAI(api_key=self.api_key)
            except Exception as e:
                print(f"[QuizGenerator] OpenAI init failed: {e}")

    def _prepare_flashcard_context(self, flashcards: List[Dict], max_cards: int = 20) -> str:
        """Prepare flashcard data as context for AI"""
        selected_cards = flashcards[:max_cards] if len(flashcards) > max_cards else flashcards

        context_parts = []
        for idx, card in enumerate(selected_cards, 1):
            card_info = (
                f"{idx}. Word: {card.get('front', '')}\n"
                f"   Translation: {card.get('back', '')}\n"
                f"   Language: {card.get('language', 'unknown')}\n"
                f"   Context: {card.get('context', 'N/A')}"
            )
            context_parts.append(card_info)

        return "\n\n".join(context_parts)

    def _create_quiz_prompt(self, flashcards: List[Dict], num_questions: int = 30) -> str:
        """Create the prompt for AI quiz generation"""
        flashcard_context = self._prepare_flashcard_context(flashcards)

        prompt = f"""You are an expert language teacher creating a QUESTION BANK to test students' knowledge of vocabulary.

Based on the following flashcards, generate exactly {num_questions} diverse quiz questions.

FLASHCARDS:
{flashcard_context}

REQUIREMENTS:
1. Generate exactly {num_questions} questions
2. Use different question types: multiple_choice, fill_in_blank, true_false, matching, short_answer
3. Questions should test different aspects: vocabulary recall, context understanding, usage
4. Each question must include the correct answer
5. For multiple choice questions, provide 4 options with one correct answer
6. For matching questions, provide 4 word-translation pairs
7. Make questions challenging but fair
8. Vary difficulty levels across questions

OUTPUT FORMAT (JSON):
{{
  "quiz_title": "Vocabulary Quiz",
  "total_questions": {num_questions},
  "questions": [
    {{
      "question_number": 1,
      "type": "multiple_choice",
      "question": "What does 'word' mean?",
      "options": ["Option A", "Option B", "Option C", "Option D"],
      "correct_answer": "Option B",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 2,
      "type": "fill_in_blank",
      "question": "Complete: The ___ ran quickly.",
      "correct_answer": "cat",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 3,
      "type": "true_false",
      "question": "'Word' means 'definition' in English.",
      "correct_answer": false,
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 4,
      "type": "matching",
      "question": "Match the words to their correct translations",
      "pairs": [
        {{"word": "word1", "translation": "translation1"}},
        {{"word": "word2", "translation": "translation2"}},
        {{"word": "word3", "translation": "translation3"}},
        {{"word": "word4", "translation": "translation4"}}
      ],
      "correct_answer": "All pairs are correctly matched",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 5,
      "type": "short_answer",
      "question": "Explain the usage of 'word'.",
      "correct_answer": "Model answer here.",
      "explanation": "Brief explanation."
    }}
  ]
}}

Generate the quiz now:"""

        return prompt

    def generate_quiz_with_ai(self, flashcards: List[Dict], num_questions: int = 30) -> Dict[str, Any]:
        """Generate quiz using OpenAI API"""
        if not self.client:
            raise ValueError("OpenAI client not initialized. Check API key.")

        if not flashcards:
            raise ValueError("No flashcards provided for quiz generation")

        prompt = self._create_quiz_prompt(flashcards, num_questions)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert language teacher who creates engaging, educational quizzes. Always respond with valid JSON."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                response_format={"type": "json_object"},
                temperature=0.7,
                max_tokens=4000
            )

            quiz_content = response.choices[0].message.content
            quiz_data = json.loads(quiz_content)

            quiz_data['metadata'] = {
                'generator': 'AI-Powered Quiz Generator',
                'model': self.model,
                'source_flashcards': len(flashcards),
                'tokens_used': response.usage.total_tokens if response.usage else 0
            }

            return quiz_data

        except Exception as e:
            print(f"[QuizGenerator] AI generation failed: {e}")
            raise

    def generate_simple_quiz(self, flashcards: List[Dict], num_questions: int = 5) -> Dict[str, Any]:
        """Generate a simple quiz without AI (fallback)"""
        if not flashcards:
            raise ValueError("No flashcards provided")

        questions = []
        used_cards = random.sample(flashcards, min(num_questions * 2, len(flashcards)))

        for i, card in enumerate(used_cards[:num_questions]):
            q_type = random.choice(['multiple_choice', 'fill_in_blank', 'true_false'])

            if q_type == 'multiple_choice':
                # Create wrong options from other cards
                other_cards = [c for c in flashcards if c != card]
                wrong_options = random.sample(
                    [c.get('back', 'Unknown') for c in other_cards],
                    min(3, len(other_cards))
                )
                while len(wrong_options) < 3:
                    wrong_options.append(f"Not {card.get('back', 'this')}")

                options = wrong_options + [card.get('back', '')]
                random.shuffle(options)

                questions.append({
                    "question_number": i + 1,
                    "type": "multiple_choice",
                    "question": f"What does '{card.get('front', '')}' mean?",
                    "options": options,
                    "correct_answer": card.get('back', ''),
                    "explanation": f"'{card.get('front', '')}' translates to '{card.get('back', '')}'."
                })

            elif q_type == 'fill_in_blank':
                questions.append({
                    "question_number": i + 1,
                    "type": "fill_in_blank",
                    "question": f"Translate: '{card.get('front', '')}' = _____",
                    "correct_answer": card.get('back', ''),
                    "explanation": f"The correct translation is '{card.get('back', '')}'."
                })

            elif q_type == 'true_false':
                is_true = random.choice([True, False])
                if is_true:
                    shown_answer = card.get('back', '')
                else:
                    other_cards = [c for c in flashcards if c != card]
                    if other_cards:
                        shown_answer = random.choice(other_cards).get('back', 'something else')
                    else:
                        shown_answer = f"Not {card.get('back', 'this')}"

                questions.append({
                    "question_number": i + 1,
                    "type": "true_false",
                    "question": f"'{card.get('front', '')}' means '{shown_answer}'.",
                    "correct_answer": is_true,
                    "explanation": f"'{card.get('front', '')}' actually means '{card.get('back', '')}'."
                })

        return {
            "quiz_title": "Vocabulary Quiz",
            "total_questions": len(questions),
            "questions": questions,
            "metadata": {
                "generator": "Simple Quiz Generator",
                "source_flashcards": len(flashcards)
            }
        }


def create_quiz_from_deck(
    username: str,
    deck_name: str,
    num_questions: int = 5,
    use_ai: bool = True,
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create a quiz from a user's flashcard deck

    Args:
        username: User identifier
        deck_name: Name of the deck to create quiz from
        num_questions: Number of questions for the quiz session
        use_ai: Whether to use AI for quiz generation
        api_key: Optional OpenAI API key

    Returns:
        Quiz dictionary with questions
    """
    decks = list_user_decks(username)

    if deck_name not in decks:
        raise ValueError(f"Deck '{deck_name}' not found")

    deck = load_deck(decks[deck_name])
    flashcards = deck.get('cards', [])

    if not flashcards:
        raise ValueError(f"Deck '{deck_name}' has no cards")

    generator = QuizGenerator(api_key=api_key)

    try:
        if use_ai and generator.client:
            # Generate larger question bank with AI
            quiz = generator.generate_quiz_with_ai(flashcards, num_questions=30)
        else:
            # Use simple generator
            quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)
    except Exception as e:
        print(f"[quiz_tools] AI quiz generation failed: {e}, using simple generator")
        quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)

    # Add quiz metadata
    ts = datetime.utcnow().strftime("%Y-%m-%dT%H-%M-%SZ")
    quiz['id'] = f"quiz_{ts}"
    quiz['created_at'] = ts
    quiz['deck_name'] = deck_name
    quiz['questions_per_session'] = num_questions

    return quiz


def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int = 5) -> Dict[str, Any]:
    """
    Create a semantic quiz based on a topic (for conversation practice)

    Args:
        username: User identifier
        topic: Topic for the quiz
        num_questions: Number of questions

    Returns:
        Quiz dictionary
    """
    reading_passages = [
        f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
        f"Here is a short story based on the topic '{topic}'.",
    # ...

        if q_type == "translate_phrase":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_translate_phrase",
                "prompt": f"Translate:\n\n'{passage}'",
                "answer": "(model evaluated)",
                "explanation": f"Checks ability to translate topic '{topic}'."
            })
        elif q_type == "summarize":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_summarize",
                "prompt": f"Summarize:\n\n{passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks comprehension of topic '{topic}'."
            })
        elif q_type == "interpret":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_interpret",
                "prompt": f"Interpret meaning:\n\n{passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks conceptual understanding of '{topic}'."
            })
    # ...

        "id": quiz_id,
        "created_at": ts,
        "topic": topic,
        "total_questions": len(questions),
        "questions": questions,
    }


def save_quiz(username: str, quiz: Dict[str, Any]) -> Path:
    """Save a quiz to the user's directory"""
    user_dir = get_user_dir(username)
    quizzes_dir = user_dir / "quizzes"
    quizzes_dir.mkdir(parents=True, exist_ok=True)

    quiz_id = quiz.get('id', f"quiz_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}")
    quiz_path = quizzes_dir / f"{quiz_id}.json"

    with open(quiz_path, 'w', encoding='utf-8') as f:
        json.dump(quiz, f, ensure_ascii=False, indent=2)

    return quiz_path


def load_quiz(username: str, quiz_id: str) -> Dict[str, Any]:
    """Load a saved quiz"""
    user_dir = get_user_dir(username)
    quiz_path = user_dir / "quizzes" / f"{quiz_id}.json"

    if not quiz_path.exists():
        raise FileNotFoundError(f"Quiz '{quiz_id}' not found")

    with open(quiz_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def list_user_quizzes(username: str) -> List[Dict[str, Any]]:
    """List all quizzes for a user"""
    user_dir = get_user_dir(username)
    quizzes_dir = user_dir / "quizzes"

    if not quizzes_dir.exists():
        return []

    quizzes = []
    for quiz_file in sorted(quizzes_dir.glob("*.json"), reverse=True):
        try:
            with open(quiz_file, 'r', encoding='utf-8') as f:
                quiz = json.load(f)
            quizzes.append({
                'id': quiz.get('id', quiz_file.stem),
                'title': quiz.get('quiz_title', 'Untitled Quiz'),
                'created_at': quiz.get('created_at', ''),
                'total_questions': quiz.get('total_questions', 0),
                'deck_name': quiz.get('deck_name', ''),
            })
        except Exception:
            continue

    return quizzes
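Usage sketch for the new quiz API (editor's illustration, not part of the commit; "demo_user" and "ocr" are placeholder names, and use_ai=False forces the offline fallback so no OPENAI_API_KEY is required):

# Hypothetical example: build a quiz from an existing deck and persist it.
from src.app.quiz_tools import create_quiz_from_deck, save_quiz, list_user_quizzes

quiz = create_quiz_from_deck(
    username="demo_user",
    deck_name="ocr",
    num_questions=5,
    use_ai=False,          # use the simple generator for this sketch
)

quiz_path = save_quiz("demo_user", quiz)
print("saved:", quiz_path)

for q in quiz["questions"]:
    print(q["question_number"], q["type"], "-", q["question"])

print([entry["id"] for entry in list_user_quizzes("demo_user")])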