yusenthebot committed · Commit aa3fdef · Parent(s): 902e65e

Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
- data/cefr/cefr_words.json +50 -0
- data/cefr && cp CUsers13197OneDriveDesktopproject2languagedatahskhsk_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatahsk +50 -0
- data/hsk/hsk_words.json +75 -0
- data/jlpt/jlpt_words.json +62 -0
- data/jlpt && cp CUsers13197OneDriveDesktopproject2languagedatatopiktopik_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatatopik +62 -0
- data/topik/topik_words.json +70 -0
- requirements.txt +8 -1
- src/app/difficulty_scorer.py +290 -0
- src/app/flashcard_generator.py +288 -0
- src/app/flashcards_tools.py +153 -49
- src/app/ocr_tools.py +350 -40
- src/app/quiz_tools.py +379 -16
data/cefr/cefr_words.json
ADDED
@@ -0,0 +1,50 @@
+{
+  "name": "CEFR (Common European Framework of Reference)",
+  "description": "Proficiency levels for European languages",
+  "languages": ["en", "de", "es", "fr", "it"],
+  "source": "Sample data - Replace with complete CEFR database for production",
+  "levels": {
+    "A1": {
+      "description": "Beginner",
+      "score": 1,
+      "en": ["hello", "goodbye", "yes", "no", "please", "thank you", "water", "food", "house", "cat", "dog", "book", "table", "chair", "good", "bad", "big", "small", "one", "two"],
+      "de": ["hallo", "tschüss", "ja", "nein", "bitte", "danke", "wasser", "essen", "haus", "katze", "hund", "buch", "tisch", "stuhl", "gut", "schlecht", "groß", "klein", "eins", "zwei"],
+      "es": ["hola", "adiós", "sí", "no", "por favor", "gracias", "agua", "comida", "casa", "gato", "perro", "libro", "mesa", "silla", "bueno", "malo", "grande", "pequeño", "uno", "dos"]
+    },
+    "A2": {
+      "description": "Elementary",
+      "score": 2,
+      "en": ["restaurant", "morning", "afternoon", "week", "month", "family", "friend", "work", "school", "city", "country", "weather", "summer", "winter", "happy", "sad", "easy", "difficult", "beautiful", "expensive"],
+      "de": ["restaurant", "morgen", "nachmittag", "woche", "monat", "familie", "freund", "arbeit", "schule", "stadt", "land", "wetter", "sommer", "winter", "glücklich", "traurig", "einfach", "schwierig", "schön", "teuer"],
+      "es": ["restaurante", "mañana", "tarde", "semana", "mes", "familia", "amigo", "trabajo", "escuela", "ciudad", "país", "tiempo", "verano", "invierno", "feliz", "triste", "fácil", "difícil", "hermoso", "caro"]
+    },
+    "B1": {
+      "description": "Intermediate",
+      "score": 3,
+      "en": ["experience", "environment", "situation", "knowledge", "relationship", "government", "education", "opportunity", "responsibility", "technology", "culture", "society", "development", "economy", "necessary", "available", "significant", "traditional", "generally", "particularly"],
+      "de": ["erfahrung", "umwelt", "situation", "wissen", "beziehung", "regierung", "bildung", "gelegenheit", "verantwortung", "technologie", "kultur", "gesellschaft", "entwicklung", "wirtschaft", "notwendig", "verfügbar", "bedeutend", "traditionell", "allgemein", "besonders"],
+      "es": ["experiencia", "medio ambiente", "situación", "conocimiento", "relación", "gobierno", "educación", "oportunidad", "responsabilidad", "tecnología", "cultura", "sociedad", "desarrollo", "economía", "necesario", "disponible", "significativo", "tradicional", "generalmente", "particularmente"]
+    },
+    "B2": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "en": ["implementation", "infrastructure", "phenomenon", "component", "perspective", "theoretical", "comprehensive", "substantial", "predominantly", "furthermore", "nevertheless", "consequently", "regarding", "whereas", "thereby"],
+      "de": ["umsetzung", "infrastruktur", "phänomen", "komponente", "perspektive", "theoretisch", "umfassend", "wesentlich", "vorwiegend", "außerdem", "dennoch", "folglich", "bezüglich", "wohingegen", "dadurch"],
+      "es": ["implementación", "infraestructura", "fenómeno", "componente", "perspectiva", "teórico", "comprensivo", "sustancial", "predominantemente", "además", "sin embargo", "consecuentemente", "respecto a", "mientras que", "por lo tanto"]
+    },
+    "C1": {
+      "description": "Advanced",
+      "score": 5,
+      "en": ["aforementioned", "notwithstanding", "juxtapose", "paradigm", "methodology", "hypothesis", "empirical", "ambiguous", "intrinsic", "exacerbate", "unprecedented", "inadvertent", "inherent", "albeit", "albeit"],
+      "de": ["oben erwähnt", "ungeachtet", "gegenüberstellen", "paradigma", "methodologie", "hypothese", "empirisch", "mehrdeutig", "intrinsisch", "verschärfen", "beispiellos", "unbeabsichtigt", "inhärent", "obwohl", "obgleich"],
+      "es": ["mencionado anteriormente", "no obstante", "yuxtaponer", "paradigma", "metodología", "hipótesis", "empírico", "ambiguo", "intrínseco", "exacerbar", "sin precedentes", "inadvertido", "inherente", "aunque", "si bien"]
+    },
+    "C2": {
+      "description": "Proficient",
+      "score": 6,
+      "en": ["epistemological", "quintessential", "perspicacious", "ubiquitous", "vicissitude", "surreptitious", "obfuscate", "indefatigable", "recalcitrant", "acquiesce"],
+      "de": ["erkenntnistheoretisch", "wesentlich", "scharfsinnig", "allgegenwärtig", "wandel", "heimlich", "verschleiern", "unermüdlich", "widerspenstig", "einwilligen"],
+      "es": ["epistemológico", "quintaesencial", "perspicaz", "ubicuo", "vicisitud", "subrepticio", "ofuscar", "infatigable", "recalcitrante", "consentir"]
+    }
+  }
+}
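Each CEFR level above carries a numeric "score" plus per-language word lists; flattening that structure into a word-to-score table is what makes fast difficulty lookups possible. The sketch below is illustrative only (the helper name build_cefr_lookup is not part of this commit); the commit's own version of this logic lives in src/app/difficulty_scorer.py (_create_word_lookups).

import json
from pathlib import Path

def build_cefr_lookup(path: str, lang: str = "en") -> dict:
    """Flatten the CEFR JSON above into a {word: score} map for one language.

    Illustrative helper only; see _create_word_lookups() in
    src/app/difficulty_scorer.py for the version shipped in this commit.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    lookup = {}
    for level in data.get("levels", {}).values():
        score = level.get("score", 3)  # fall back to a mid score if missing
        for word in level.get(lang, []):
            lookup[word.lower()] = score
    return lookup

# e.g. build_cefr_lookup("data/cefr/cefr_words.json", "en")["hello"] == 1
#      build_cefr_lookup("data/cefr/cefr_words.json", "en")["paradigm"] == 5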
data/cefr && cp CUsers13197OneDriveDesktopproject2languagedatahskhsk_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatahsk
ADDED
@@ -0,0 +1,50 @@
(file content is a verbatim duplicate of data/cefr/cefr_words.json above)
data/hsk/hsk_words.json
ADDED
@@ -0,0 +1,75 @@
+{
+  "name": "HSK (Hanyu Shuiping Kaoshi)",
+  "description": "Chinese Proficiency Test",
+  "language": "zh-cn",
+  "source": "Sample data - Replace with complete HSK database (~5000 words) for production",
+  "levels": {
+    "1": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "你", "我", "他", "她", "们", "的", "是", "不", "了", "在",
+        "有", "人", "这", "中", "大", "来", "上", "国", "个", "到",
+        "说", "时", "要", "就", "出", "会", "可", "也", "你们", "我们",
+        "他们", "什么", "没有", "好", "看", "爱", "去", "想", "做", "吃",
+        "喝", "饭", "茶", "水", "书", "字", "学", "生", "先生", "小姐"
+      ]
+    },
+    "2": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "能", "过", "现在", "没关系", "太", "非常", "怎么", "怎么样", "知道",
+        "道", "学习", "认识", "高兴", "欢迎", "谢谢", "对不起", "再见", "明天", "昨天",
+        "今天", "年", "月", "日", "星期", "点", "分", "小时", "刚才",
+        "已经", "马上", "打电话", "跑步", "睡觉", "起床", "上班", "下班", "飞机", "火车",
+        "公共汽车", "出租车", "医院", "银行", "邮局", "超市", "商店", "饭馆", "学校", "公司"
+      ]
+    },
+    "3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "但是", "因为", "所以", "虽然", "如果", "或者", "而且", "然后", "才", "刚",
+        "曾经", "从来", "一直", "正在", "着", "过", "了", "地", "得",
+        "把", "被", "让", "叫", "使", "教", "告诉", "问", "回答", "说话",
+        "聊天", "讨论", "解释", "介绍", "表示", "表达", "意思", "意见", "建议", "办法",
+        "方法", "态度", "情况", "问题", "困难", "容易", "简单", "复杂", "重要", "必要"
+      ]
+    },
+    "4": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "麻婆豆腐", "番茄炒蛋", "宫保鸡丁", "鱼香肉丝", "醋溜白菜", "蛋炒饭", "辣子鸡", "酸辣土豆丝",
+        "饺子", "包子", "馒头", "面条", "米饭", "粥", "汤", "菜",
+        "总是", "经常", "有时候", "偶尔", "从不", "永远", "始终", "一直",
+        "特别", "比较", "更", "最", "极其",
+        "关于", "对于", "至于", "由于", "根据", "按照", "依照", "为了",
+        "除了", "除非", "即使", "尽管", "不管"
+      ]
+    },
+    "5": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "龟兔赛跑", "画蛇添足", "守株待兔", "刻舟求剑", "亡羊补牢", "掩耳盗铃", "狐假虎威", "井底之蛙",
+        "完全", "彻底", "绝对", "肯定", "否定", "确定", "一定", "必然",
+        "偶然", "突然", "忽然", "顿时", "立刻", "随即", "随后",
+        "继续", "持续", "连续", "陆续", "依次", "逐渐", "渐渐", "逐步",
+        "促进", "推动", "推进", "加强", "增强", "提高", "改善", "完善"
+      ]
+    },
+    "6": {
+      "description": "Proficient",
+      "score": 6,
+      "words": [
+        "跑得很远了", "宝宝睡前故事", "嘲笑", "比赛", "撒开", "腿", "一会儿",
+        "深刻", "深入", "深远", "深厚", "深切", "深度", "深层", "深化",
+        "广泛", "广大", "广阔", "宽广", "宽阔", "辽阔", "浩瀚", "无限",
+        "精确", "准确", "正确", "确切", "切实", "实际", "实在", "实质",
+        "综合", "综述", "概括", "概述", "总结", "归纳", "归结", "归类"
+      ]
+    }
+  }
+}
data/jlpt/jlpt_words.json
ADDED
@@ -0,0 +1,62 @@
+{
+  "name": "JLPT (Japanese Language Proficiency Test)",
+  "description": "Proficiency levels for Japanese language",
+  "language": "ja",
+  "source": "Sample data - Replace with complete JLPT database for production",
+  "note": "N5 is easiest, N1 is hardest",
+  "levels": {
+    "N5": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "こんにちは", "ありがとう", "すみません", "はい", "いいえ",
+        "私", "あなた", "これ", "それ", "あれ",
+        "水", "食べ物", "家", "学校", "本",
+        "大きい", "小さい", "良い", "悪い", "新しい",
+        "一", "二", "三", "四", "五"
+      ]
+    },
+    "N4": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "レストラン", "駅", "病院", "図書館", "公園",
+        "朝", "昼", "夜", "今日", "明日",
+        "友達", "先生", "学生", "会社", "仕事",
+        "便利", "簡単", "難しい", "面白い", "つまらない",
+        "食べる", "飲む", "行く", "来る", "見る"
+      ]
+    },
+    "N3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "経験", "環境", "状況", "知識", "関係",
+        "政府", "教育", "機会", "責任", "技術",
+        "文化", "社会", "発展", "経済", "必要",
+        "一般的", "特に", "確かに", "実際", "最近",
+        "考える", "思う", "感じる", "理解する", "説明する"
+      ]
+    },
+    "N2": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "実施", "基盤", "現象", "要素", "観点",
+        "理論的", "包括的", "実質的", "主に", "さらに",
+        "しかしながら", "従って", "に関して", "一方", "それによって",
+        "検討する", "分析する", "評価する", "提案する", "実現する"
+      ]
+    },
+    "N1": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "前述", "にもかかわらず", "対比する", "範例", "方法論",
+        "仮説", "経験的", "曖昧", "本質的", "悪化させる",
+        "前例のない", "無意識", "固有", "とはいえ", "けれども",
+        "体系化する", "統合する", "最適化する", "具現化する", "顕在化する"
+      ]
+    }
+  }
+}
data/jlpt && cp CUsers13197OneDriveDesktopproject2languagedatatopiktopik_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatatopik
ADDED
@@ -0,0 +1,62 @@
(file content is a verbatim duplicate of data/jlpt/jlpt_words.json above)
data/topik/topik_words.json
ADDED
@@ -0,0 +1,70 @@
+{
+  "name": "TOPIK (Test of Proficiency in Korean)",
+  "description": "Proficiency levels for Korean language",
+  "language": "ko",
+  "source": "Sample data - Replace with complete TOPIK database for production",
+  "note": "TOPIK I (1-2) and TOPIK II (3-6)",
+  "levels": {
+    "1": {
+      "description": "Beginner",
+      "score": 1,
+      "words": [
+        "안녕하세요", "감사합니다", "죄송합니다", "네", "아니요",
+        "나", "너", "이것", "저것", "그것",
+        "물", "음식", "집", "학교", "책",
+        "크다", "작다", "좋다", "나쁘다", "새롭다",
+        "하나", "둘", "셋", "넷", "다섯"
+      ]
+    },
+    "2": {
+      "description": "Elementary",
+      "score": 2,
+      "words": [
+        "식당", "역", "병원", "도서관", "공원",
+        "아침", "점심", "저녁", "오늘", "내일",
+        "친구", "선생님", "학생", "회사", "일",
+        "편리하다", "쉽다", "어렵다", "재미있다", "지루하다",
+        "먹다", "마시다", "가다", "오다", "보다"
+      ]
+    },
+    "3": {
+      "description": "Intermediate",
+      "score": 3,
+      "words": [
+        "경험", "환경", "상황", "지식", "관계",
+        "정부", "교육", "기회", "책임", "기술",
+        "문화", "사회", "발전", "경제", "필요",
+        "일반적", "특히", "확실히", "실제로", "최근",
+        "생각하다", "느끼다", "이해하다", "설명하다", "표현하다"
+      ]
+    },
+    "4": {
+      "description": "Upper Intermediate",
+      "score": 4,
+      "words": [
+        "실시", "기반", "현상", "요소", "관점",
+        "이론적", "포괄적", "실질적", "주로", "더욱이",
+        "그러나", "따라서", "에 관하여", "한편", "그로써",
+        "검토하다", "분석하다", "평가하다", "제안하다", "실현하다"
+      ]
+    },
+    "5": {
+      "description": "Advanced",
+      "score": 5,
+      "words": [
+        "전술한", "에도 불구하고", "대비하다", "패러다임", "방법론",
+        "가설", "경험적", "애매한", "본질적", "악화시키다",
+        "전례없는", "무의식적", "고유한", "비록", "그럼에도",
+        "체계화하다", "통합하다", "최적화하다", "구현하다", "현현하다"
+      ]
+    },
+    "6": {
+      "description": "Proficient",
+      "score": 6,
+      "words": [
+        "인식론적", "전형적인", "예리한", "편재하는", "변천",
+        "은밀한", "난독화하다", "불굴의", "완고한", "동의하다"
+      ]
+    }
+  }
+}
requirements.txt
CHANGED
@@ -17,11 +17,13 @@ gTTS
 ########################################
 pytesseract
 pillow
+opencv-python-headless
 
 ########################################
-# Translation
+# Translation & Language Detection
 ########################################
 deep-translator
+langdetect
 
 ########################################
 # Language Modeling / Text Processing
@@ -32,6 +34,11 @@ sentencepiece
 safetensors
 regex
 
+########################################
+# AI APIs (Optional - for Quiz Generation)
+########################################
+openai
+
 ########################################
 # General Utilities
 ########################################
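The new openai entry is flagged as optional (quiz generation only), so code that uses it should degrade gracefully when the package is absent. Below is a minimal sketch of that guard, assuming the modern openai client class; it mirrors the try/except fallback pattern this commit itself uses in src/app/flashcards_tools.py, and the helper name get_quiz_client is hypothetical.

try:
    from openai import OpenAI  # optional dependency; only needed for AI quiz generation
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

def get_quiz_client(api_key: str):
    """Return an OpenAI client when the package is installed, else None."""
    if not HAS_OPENAI:
        return None
    return OpenAI(api_key=api_key)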
src/app/difficulty_scorer.py
ADDED
@@ -0,0 +1,290 @@
+# -*- coding: utf-8 -*-
+"""
+Difficulty Scorer - Multi-language Support
+
+Supports 6 languages with proficiency test databases:
+- English (en): CEFR A1-C2
+- Chinese (zh-cn): HSK 1-6
+- German (de): CEFR A1-C2
+- Spanish (es): CEFR A1-C2
+- Japanese (ja): JLPT N5-N1
+- Korean (ko): TOPIK 1-6
+"""
+
+import json
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+
+
+class DifficultyScorer:
+    """Multi-language difficulty scoring system"""
+
+    LANGUAGE_TESTS = {
+        'en': 'cefr',
+        'de': 'cefr',
+        'es': 'cefr',
+        'fr': 'cefr',
+        'it': 'cefr',
+        'zh-cn': 'hsk',
+        'zh-tw': 'hsk',
+        'ja': 'jlpt',
+        'ko': 'topik',
+        'ru': 'cefr',
+    }
+
+    JLPT_MAPPING = {
+        'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5
+    }
+
+    def __init__(self, data_dir: str = None):
+        """
+        Initialize multi-language difficulty scorer
+
+        Args:
+            data_dir: Path to data directory containing proficiency databases
+        """
+        if data_dir is None:
+            current_dir = Path(__file__).parent
+            project_root = current_dir.parent.parent
+            data_dir = project_root / "data"
+
+        self.data_dir = Path(data_dir)
+        self.databases = self._load_all_databases()
+        self.word_lookups = self._create_word_lookups()
+
+    def _load_all_databases(self) -> Dict[str, Dict]:
+        """Load all language proficiency databases"""
+        databases = {}
+
+        # Load CEFR (English, German, Spanish, etc.)
+        cefr_path = self.data_dir / "cefr" / "cefr_words.json"
+        if cefr_path.exists():
+            try:
+                with open(cefr_path, 'r', encoding='utf-8') as f:
+                    databases['cefr'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load CEFR: {e}")
+
+        # Load HSK (Chinese)
+        hsk_path = self.data_dir / "hsk" / "hsk_words.json"
+        if hsk_path.exists():
+            try:
+                with open(hsk_path, 'r', encoding='utf-8') as f:
+                    databases['hsk'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load HSK: {e}")
+
+        # Load JLPT (Japanese)
+        jlpt_path = self.data_dir / "jlpt" / "jlpt_words.json"
+        if jlpt_path.exists():
+            try:
+                with open(jlpt_path, 'r', encoding='utf-8') as f:
+                    databases['jlpt'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load JLPT: {e}")
+
+        # Load TOPIK (Korean)
+        topik_path = self.data_dir / "topik" / "topik_words.json"
+        if topik_path.exists():
+            try:
+                with open(topik_path, 'r', encoding='utf-8') as f:
+                    databases['topik'] = json.load(f)
+            except Exception as e:
+                print(f"[DifficultyScorer] Failed to load TOPIK: {e}")
+
+        return databases
+
+    def _create_word_lookups(self) -> Dict[str, Dict[str, int]]:
+        """Create word-to-score lookup tables for all languages"""
+        lookups = {}
+
+        # CEFR lookups
+        if 'cefr' in self.databases:
+            cefr = self.databases['cefr']
+            for lang_code in ['en', 'de', 'es', 'fr', 'it', 'ru']:
+                lookups[lang_code] = {}
+                if 'levels' in cefr:
+                    for level, data in cefr['levels'].items():
+                        score = data.get('score', 3)
+                        if lang_code in data:
+                            for word in data[lang_code]:
+                                lookups[lang_code][word.lower()] = score
+
+        # HSK lookup (Chinese)
+        if 'hsk' in self.databases:
+            lookups['zh-cn'] = {}
+            lookups['zh-tw'] = {}
+            if 'levels' in self.databases['hsk']:
+                for level, data in self.databases['hsk']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['zh-cn'][word] = score
+                        lookups['zh-tw'][word] = score
+
+        # JLPT lookup (Japanese)
+        if 'jlpt' in self.databases:
+            lookups['ja'] = {}
+            if 'levels' in self.databases['jlpt']:
+                for level, data in self.databases['jlpt']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['ja'][word] = score
+
+        # TOPIK lookup (Korean)
+        if 'topik' in self.databases:
+            lookups['ko'] = {}
+            if 'levels' in self.databases['topik']:
+                for level, data in self.databases['topik']['levels'].items():
+                    score = data.get('score', 3)
+                    for word in data.get('words', []):
+                        lookups['ko'][word] = score
+
+        return lookups
+
+    def get_proficiency_score(self, word: str, language: str) -> float:
+        """
+        Get proficiency test score for a word
+
+        Args:
+            word: Word or phrase
+            language: Language code
+
+        Returns:
+            Score 1-6 (1=easiest, 6=hardest)
+        """
+        language = language.lower()
+
+        if language not in self.word_lookups:
+            return self._estimate_by_length(word)
+
+        lookup = self.word_lookups[language]
+        search_word = word if language in ['zh-cn', 'zh-tw', 'ja', 'ko'] else word.lower()
+
+        if search_word in lookup:
+            return float(lookup[search_word])
+
+        return self._estimate_by_length(word)
+
+    def _estimate_by_length(self, word: str) -> float:
+        """Estimate difficulty by word length (fallback)"""
+        length = len(word)
+        if length <= 3:
+            return 2.0
+        elif length <= 6:
+            return 3.5
+        elif length <= 10:
+            return 4.5
+        else:
+            return 5.5
+
+    def get_length_score(self, word: str) -> float:
+        """Score based on word length"""
+        length = len(word)
+        if length == 1:
+            return 1.0
+        elif length <= 3:
+            return 2.0
+        elif length <= 6:
+            return 3.0
+        elif length <= 10:
+            return 4.0
+        elif length <= 15:
+            return 5.0
+        else:
+            return 6.0
+
+    def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]:
+        """
+        Calculate comprehensive difficulty score
+
+        Weights:
+        - Proficiency level: 60%
+        - Word length: 40%
+        """
+        proficiency_score = self.get_proficiency_score(word, language)
+        length_score = self.get_length_score(word)
+
+        overall_score = proficiency_score * 0.6 + length_score * 0.4
+
+        if overall_score <= 2.5:
+            level = "beginner"
+        elif overall_score <= 4.5:
+            level = "intermediate"
+        else:
+            level = "advanced"
+
+        test_name = self.LANGUAGE_TESTS.get(language.lower(), 'unknown')
+
+        return {
+            "overall_score": round(overall_score, 2),
+            "level": level,
+            "factors": {
+                "proficiency_score": round(proficiency_score, 2),
+                "length": len(word),
+                "length_score": round(length_score, 2),
+                "test_system": test_name.upper()
+            }
+        }
+
+    def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]:
+        """Add difficulty score to flashcard"""
+        word = card.get('front', '')
+        language = card.get('language', 'en')
+
+        difficulty = self.calculate_difficulty(word, language)
+
+        card_with_difficulty = card.copy()
+        card_with_difficulty['difficulty'] = difficulty
+
+        return card_with_difficulty
+
+    def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Score all flashcards"""
+        return [self.score_flashcard(card) for card in flashcards]
+
+    def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Generate difficulty statistics"""
+        if not flashcards:
+            return {}
+
+        level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
+        scores = []
+        by_language = {}
+
+        for card in flashcards:
+            if 'difficulty' in card:
+                level = card['difficulty']['level']
+                level_counts[level] += 1
+                scores.append(card['difficulty']['overall_score'])
+
+                lang = card.get('language', 'unknown')
+                if lang not in by_language:
+                    by_language[lang] = {"count": 0, "scores": []}
+                by_language[lang]["count"] += 1
+                by_language[lang]["scores"].append(card['difficulty']['overall_score'])
+
+        for lang in by_language:
+            lang_scores = by_language[lang]["scores"]
+            by_language[lang]["avg_score"] = round(sum(lang_scores) / len(lang_scores), 2)
+            del by_language[lang]["scores"]
+
+        return {
+            "total_cards": len(flashcards),
+            "by_level": level_counts,
+            "by_language": by_language,
+            "average_score": round(sum(scores) / len(scores), 2) if scores else 0,
+            "min_score": round(min(scores), 2) if scores else 0,
+            "max_score": round(max(scores), 2) if scores else 0
+        }
+
+
+# Global instance (lazy initialization)
+_difficulty_scorer = None
+
+
+def get_difficulty_scorer() -> DifficultyScorer:
+    """Get or create the global DifficultyScorer instance"""
+    global _difficulty_scorer
+    if _difficulty_scorer is None:
+        _difficulty_scorer = DifficultyScorer()
+    return _difficulty_scorer
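A short usage sketch for the scorer above: the calls follow the methods defined in the file, while the sample words, the printed values, and the top-level import path (assumed from this repo's src/app layout) are illustrative.

from src.app.difficulty_scorer import get_difficulty_scorer

scorer = get_difficulty_scorer()  # lazily creates the shared instance

# Score a single word against the bundled sample databases.
result = scorer.calculate_difficulty("environment", "en")
print(result["overall_score"], result["level"])  # 3.8 intermediate with the sample CEFR data

# Attach difficulty metadata to an existing flashcard dict.
card = {"front": "经济", "back": "economy", "language": "zh-cn"}
scored = scorer.score_flashcard(card)
print(scored["difficulty"]["factors"]["test_system"])  # HSK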
src/app/flashcard_generator.py
ADDED
@@ -0,0 +1,288 @@
+# -*- coding: utf-8 -*-
+"""
+Flashcard Generator - Extracts vocabulary with context from OCR results
+Supports multi-language extraction and context sentence generation
+"""
+
+import json
+import re
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+from deep_translator import GoogleTranslator
+
+
+class FlashcardGenerator:
+    """Generate flashcards from OCR results with multi-language support"""
+
+    def __init__(self):
+        self.supported_languages = {
+            'zh-cn': 'Chinese (Simplified)',
+            'zh-tw': 'Chinese (Traditional)',
+            'ja': 'Japanese',
+            'ko': 'Korean',
+            'en': 'English',
+            'fr': 'French',
+            'de': 'German',
+            'es': 'Spanish',
+            'ru': 'Russian',
+        }
+
+        self.lang_map = {
+            'zh-cn': 'zh-CN',
+            'zh-tw': 'zh-TW',
+            'ja': 'ja',
+            'ko': 'ko',
+            'ru': 'ru',
+        }
+
+        self.translator_cache = {}
+
+        # Stop words for filtering common words
+        self.stop_words = {
+            'zh-cn': {
+                '的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
+                '都', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
+                '会', '着', '没有', '看', '好', '自己', '这', '他', '她', '它',
+                '们', '个', '吗', '呢', '吧', '啊', '哦', '嗯', '呀'
+            },
+            'en': {
+                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
+                'for', 'of', 'with', 'by', 'from', 'is', 'am', 'are', 'was', 'were',
+                'be', 'been', 'being', 'this', 'that', 'these', 'those', 'i', 'you',
+                'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his', 'her', 'its'
+            },
+            'de': {
+                'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer',
+                'und', 'oder', 'aber', 'in', 'an', 'auf', 'für', 'mit', 'von',
+                'zu', 'ist', 'sind', 'war', 'waren', 'ich', 'du', 'er', 'sie', 'es'
+            },
+            'es': {
+                'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y', 'o',
+                'pero', 'en', 'a', 'de', 'con', 'por', 'para', 'es', 'son', 'era',
+                'yo', 'tú', 'él', 'ella', 'nosotros', 'vosotros', 'ellos', 'ellas'
+            },
+            'ja': {
+                'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し',
+                'れ', 'さ', 'ある', 'いる', 'も', 'する', 'から', 'な', 'こ', 'そ'
+            },
+            'ko': {
+                '은', '는', '이', '가', '을', '를', '의', '에', '에서', '로',
+                '와', '과', '도', '만', '까지', '부터', '하다', '되다', '있다', '없다'
+            },
+            'ru': {
+                'и', 'в', 'на', 'с', 'к', 'по', 'за', 'из', 'у', 'о',
+                'а', 'но', 'что', 'это', 'как', 'он', 'она', 'они', 'мы', 'вы'
+            }
+        }
+
+    def extract_chinese_text(self, text: str) -> List[str]:
+        """Extract Chinese characters/phrases"""
+        pattern = re.compile(r'[\u4e00-\u9fff]+')
+        return pattern.findall(text)
+
+    def extract_japanese_text(self, text: str) -> List[str]:
+        """Extract Japanese text (kanji + hiragana + katakana)"""
+        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]+')
+        return pattern.findall(text)
+
+    def extract_korean_text(self, text: str) -> List[str]:
+        """Extract Korean words"""
+        pattern = re.compile(r'[\uAC00-\uD7AF]+')
+        return pattern.findall(text)
+
+    def extract_european_words(self, text: str) -> List[str]:
+        """Extract words from European languages"""
+        pattern = re.compile(r'[a-zA-ZäöüßÄÖÜáéíóúñÁÉÍÓÚÑàèìòùÀÈÌÒÙ\u0400-\u04FF]+')
+        return pattern.findall(text)
+
+    def filter_by_length(self, items: List[str], min_len: int = 2, max_len: int = 15) -> List[str]:
+        """Filter items by character length"""
+        return [item for item in items if min_len <= len(item) <= max_len]
+
+    def filter_stop_words(self, items: List[str], language: str) -> List[str]:
+        """Remove common stop words"""
+        stop_words = self.stop_words.get(language, set())
+        if language in ['en', 'de', 'es', 'ru']:
+            return [item for item in items if item.lower() not in stop_words]
+        return [item for item in items if item not in stop_words]
+
+    def extract_vocabulary_by_language(self, text: str, language: str) -> List[str]:
+        """Extract vocabulary based on language type"""
+        if language in ['zh-cn', 'zh-tw']:
+            return self.extract_chinese_text(text)
+        elif language == 'ja':
+            return self.extract_japanese_text(text)
+        elif language == 'ko':
+            return self.extract_korean_text(text)
+        else:
+            return self.extract_european_words(text)
+
+    def get_sentence_delimiter(self, language: str) -> str:
+        """Get sentence delimiter pattern for a language"""
+        return r'[。！？.!?\n]+'
+
+    def extract_context_sentence(self, word: str, text: str, language: str = 'zh-cn') -> str:
+        """Extract context around the word"""
+        delimiter = self.get_sentence_delimiter(language)
+        sentences = re.split(delimiter, text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if not sentences:
+            return ""
+
+        # Find sentence containing the word
+        word_sentence_idx = -1
+        for idx, sentence in enumerate(sentences):
+            if word in sentence:
+                word_sentence_idx = idx
+                break
+
+        if word_sentence_idx == -1:
+            return ""
+
+        word_sentence = sentences[word_sentence_idx]
+        is_same_as_sentence = (word_sentence == word or word_sentence.replace(' ', '') == word.replace(' ', ''))
+        is_title = (is_same_as_sentence and (word_sentence_idx <= 3 or word_sentence_idx < len(sentences) - 1))
+
+        context_sentences = []
+
+        if is_title:
+            context_sentences.append(word_sentence)
+            for i in range(word_sentence_idx + 1, min(word_sentence_idx + 3, len(sentences))):
+                next_sentence = sentences[i]
+                if len(next_sentence) > 3:
+                    context_sentences.append(next_sentence)
+                    break
+        else:
+            if word_sentence_idx > 0:
+                prev_sentence = sentences[word_sentence_idx - 1]
+                if len(prev_sentence) > 5:
+                    context_sentences.append(prev_sentence)
+
+            context_sentences.append(word_sentence)
+
+            if word_sentence_idx < len(sentences) - 1:
+                next_sentence = sentences[word_sentence_idx + 1]
+                if len(next_sentence) > 5:
+                    context_sentences.append(next_sentence)
+
+        if language in ['zh-cn', 'zh-tw', 'ja']:
+            context = ''.join(context_sentences)
+        else:
+            context = ' '.join(context_sentences)
+
+        if len(context) > 150:
+            context = context[:150] + '...'
+
+        return context
+
+    def translate_to_target(self, text: str, source_lang: str, target_lang: str = 'en') -> str:
+        """Translate text to target language"""
+        cache_key = f"{source_lang}:{target_lang}:{text}"
+        if cache_key in self.translator_cache:
+            return self.translator_cache[cache_key]
+
+        try:
+            source = self.lang_map.get(source_lang, source_lang)
+            target = self.lang_map.get(target_lang, target_lang)
+
+            translator = GoogleTranslator(source=source, target=target)
+            translation = translator.translate(text)
+
+            self.translator_cache[cache_key] = translation
+            return translation
+        except Exception as e:
+            return f"[Translation failed: {text}]"
+
+    def extract_learnable_items(self, ocr_result: Dict[str, Any], target_lang: str = 'en') -> List[Dict[str, Any]]:
+        """Extract vocabulary items from OCR result"""
+        original_text = ocr_result.get('original_text', '') or ocr_result.get('text', '')
+        language = ocr_result.get('detected_language', 'unknown')
+        filename = ocr_result.get('filename', '')
+
+        if not original_text or language == 'unknown':
+            return []
+
+        language = language.lower()
+
+        # Extract vocabulary
+        vocabulary_items = self.extract_vocabulary_by_language(original_text, language)
+
+        if not vocabulary_items:
+            return []
+
+        # Determine length constraints
+        if language in ['zh-cn', 'zh-tw', 'ja']:
+            min_len, max_len = 2, 6
+        elif language == 'ko':
+            min_len, max_len = 2, 10
+        else:
+            min_len, max_len = 3, 15
+
+        filtered_items = self.filter_by_length(vocabulary_items, min_len=min_len, max_len=max_len)
+        filtered_items = self.filter_stop_words(filtered_items, language)
+
+        # Remove duplicates
+        unique_items = list(dict.fromkeys(filtered_items))[:10]
+
+        if not unique_items:
+            return []
+
+        items = []
+        for idx, item in enumerate(unique_items):
+            # Get translation
+            if language == target_lang:
+                translation = item
+            else:
+                translation = self.translate_to_target(item, language, target_lang)
+
+            # Skip if translation is same as original
+            if translation.strip().lower() == item.strip().lower():
+                continue
+
+            # Extract context
+            context = self.extract_context_sentence(item, original_text, language)
+            context_translated = ""
+            if context and language != target_lang:
+                context_translated = self.translate_to_target(context, language, target_lang)
+
+            items.append({
+                'id': idx + 1,
+                'front': item,
+                'back': translation,
+                'context': context,
+                'context_en': context_translated,
+                'language': language,
+                'content_type': 'ocr_vocab',
+                'source_file': filename,
+            })
+
+        return items
+
+    def generate_flashcards(self, ocr_results: List[Dict[str, Any]], target_lang: str = 'en') -> Dict[str, Any]:
+        """Generate flashcards from OCR results"""
+        all_cards = []
+
+        for result in ocr_results:
+            learnable_items = self.extract_learnable_items(result, target_lang)
+            all_cards.extend(learnable_items)
+
+        return {
+            'total_cards': len(all_cards),
+            'cards': all_cards,
+            'metadata': {
+                'generator': 'FlashcardGenerator v2.0',
+                'method': 'context-extraction',
+            }
+        }
+
+    def save_flashcards(self, flashcards: Dict[str, Any], output_path: str):
+        """Save flashcards to JSON file"""
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(flashcards, f, ensure_ascii=False, indent=2)
+
+    def load_ocr_results(self, input_path: str) -> List[Dict[str, Any]]:
+        """Load OCR results from JSON file"""
+        with open(input_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
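A usage sketch for the generator above. The OCR-result dict uses the keys the module reads (original_text, detected_language, filename); the sample sentence and the import path (assumed from this repo's src/app layout) are illustrative, and translation runs at call time through deep-translator's GoogleTranslator, so network access is required.

from src.app.flashcard_generator import FlashcardGenerator

generator = FlashcardGenerator()

# Minimal OCR-result shape consumed by extract_learnable_items().
ocr_results = [{
    "original_text": "今天的天气很好。我们去公园散步。",
    "detected_language": "zh-cn",
    "filename": "notebook_page1.jpg",
}]

deck = generator.generate_flashcards(ocr_results, target_lang="en")
print(deck["total_cards"])
for card in deck["cards"]:
    print(card["front"], "->", card["back"], "|", card["context"])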
src/app/flashcards_tools.py
CHANGED
|
@@ -1,20 +1,33 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import json
|
| 5 |
import re
|
| 6 |
from pathlib import Path
|
| 7 |
-
from typing import Dict, List,
|
| 8 |
|
| 9 |
from deep_translator import GoogleTranslator
|
| 10 |
|
| 11 |
from .config import get_user_dir
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def _get_decks_dir(username: str) -> Path:
|
| 15 |
-
"""
|
| 16 |
-
Returns the directory where all of a user's decks are stored.
|
| 17 |
-
"""
|
| 18 |
user_dir = get_user_dir(username)
|
| 19 |
decks_dir = user_dir / "decks"
|
| 20 |
decks_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -22,11 +35,7 @@ def _get_decks_dir(username: str) -> Path:
|
|
| 22 |
|
| 23 |
|
| 24 |
def list_user_decks(username: str) -> Dict[str, Path]:
|
| 25 |
-
"""
|
| 26 |
-
Returns a mapping of deck name -> deck json path.
|
| 27 |
-
Deck name is taken from the deck's "name" field if present,
|
| 28 |
-
otherwise the filename stem.
|
| 29 |
-
"""
|
| 30 |
decks_dir = _get_decks_dir(username)
|
| 31 |
deck_files = sorted(decks_dir.glob("*.json"))
|
| 32 |
decks: Dict[str, Path] = {}
|
|
@@ -38,7 +47,6 @@ def list_user_decks(username: str) -> Dict[str, Path]:
|
|
| 38 |
except Exception:
|
| 39 |
name = path.stem
|
| 40 |
|
| 41 |
-
# ensure uniqueness by appending stem if needed
|
| 42 |
if name in decks and decks[name] != path:
|
| 43 |
name = f"{name} ({path.stem})"
|
| 44 |
decks[name] = path
|
|
@@ -47,24 +55,31 @@ def list_user_decks(username: str) -> Dict[str, Path]:
|
|
| 47 |
|
| 48 |
|
| 49 |
def _ensure_card_stats(card: Dict) -> None:
|
| 50 |
-
"""
|
| 51 |
-
|
| 52 |
-
"""
|
| 53 |
-
if "score" not in card: # learning strength
|
| 54 |
card["score"] = 0
|
| 55 |
if "reviews" not in card:
|
| 56 |
card["reviews"] = 0
|
| 57 |
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def load_deck(path: Path) -> Dict:
|
| 60 |
-
"""
|
| 61 |
-
Loads a deck from JSON, ensuring 'cards' exists and that
|
| 62 |
-
each card has basic stats for spaced repetition.
|
| 63 |
-
"""
|
| 64 |
try:
|
| 65 |
data = json.loads(path.read_text(encoding="utf-8"))
|
| 66 |
except Exception:
|
| 67 |
data = {}
|
|
|
|
| 68 |
if "cards" not in data or not isinstance(data["cards"], list):
|
| 69 |
data["cards"] = []
|
| 70 |
if "name" not in data:
|
|
@@ -79,9 +94,7 @@ def load_deck(path: Path) -> Dict:
|
|
| 79 |
|
| 80 |
|
| 81 |
def save_deck(path: Path, deck: Dict) -> None:
|
| 82 |
-
"""
|
| 83 |
-
Saves deck to JSON.
|
| 84 |
-
"""
|
| 85 |
if "cards" not in deck:
|
| 86 |
deck["cards"] = []
|
| 87 |
if "name" not in deck:
|
|
@@ -89,21 +102,14 @@ def save_deck(path: Path, deck: Dict) -> None:
|
|
| 89 |
if "tags" not in deck or not isinstance(deck["tags"], list):
|
| 90 |
deck["tags"] = []
|
| 91 |
|
| 92 |
-
# make sure stats are present
|
| 93 |
for card in deck["cards"]:
|
| 94 |
_ensure_card_stats(card)
|
| 95 |
|
| 96 |
path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")
|
| 97 |
|
| 98 |
|
| 99 |
-
# ------------------------------------------------------------
|
| 100 |
-
# Shared tokenization
|
| 101 |
-
# ------------------------------------------------------------
|
| 102 |
-
|
| 103 |
def _extract_candidate_words(text: str) -> List[str]:
|
| 104 |
-
"""
|
| 105 |
-
Simple tokenizer & filter for candidate vocab words.
|
| 106 |
-
"""
|
| 107 |
tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
|
| 108 |
out = []
|
| 109 |
seen = set()
|
|
@@ -121,28 +127,63 @@ def _extract_candidate_words(text: str) -> List[str]:
|
|
| 121 |
return out
|
| 122 |
|
| 123 |
|
| 124 |
-
# ------------------------------------------------------------
|
| 125 |
-
# OCR → Flashcards
|
| 126 |
-
# ------------------------------------------------------------
|
| 127 |
-
|
| 128 |
def generate_flashcards_from_ocr_results(
|
| 129 |
username: str,
|
| 130 |
ocr_results: List[Dict],
|
| 131 |
deck_name: str = "ocr",
|
| 132 |
target_lang: str = "en",
|
| 133 |
tags: Optional[List[str]] = None,
|
|
|
|
| 134 |
) -> Path:
|
| 135 |
"""
|
| 136 |
-
Takes OCR results
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
all_text = []
|
| 144 |
for res in ocr_results:
|
| 145 |
-
t = res.get("text") or res.get("raw_text") or ""
|
| 146 |
if t:
|
| 147 |
all_text.append(t)
|
| 148 |
joined = "\n".join(all_text)
|
|
@@ -153,7 +194,7 @@ def generate_flashcards_from_ocr_results(
|
|
| 153 |
|
| 154 |
translator = GoogleTranslator(source="auto", target=target_lang)
|
| 155 |
cards = []
|
| 156 |
-
for w in words:
|
| 157 |
try:
|
| 158 |
trans = translator.translate(w)
|
| 159 |
except Exception:
|
|
@@ -162,12 +203,14 @@ def generate_flashcards_from_ocr_results(
|
|
| 162 |
continue
|
| 163 |
if trans.strip().lower() == w.strip().lower():
|
| 164 |
continue
|
|
|
|
| 165 |
card = {
|
| 166 |
"front": w,
|
| 167 |
"back": trans,
|
| 168 |
"content_type": "ocr_vocab",
|
| 169 |
"language": target_lang,
|
| 170 |
}
|
|
|
|
| 171 |
_ensure_card_stats(card)
|
| 172 |
cards.append(card)
|
| 173 |
|
|
@@ -186,27 +229,73 @@ def generate_flashcards_from_ocr_results(
|
|
| 186 |
return deck_path
|
| 187 |
|
| 188 |
|
| 189 |
-
# ------------------------------------------------------------
|
| 190 |
-
# Conversation/Text → Flashcards
|
| 191 |
-
# ------------------------------------------------------------
|
| 192 |
-
|
| 193 |
def generate_flashcards_from_text(
|
| 194 |
username: str,
|
| 195 |
text: str,
|
| 196 |
deck_name: str = "conversation",
|
| 197 |
target_lang: str = "en",
|
| 198 |
tags: Optional[List[str]] = None,
|
|
|
|
| 199 |
) -> Path:
|
| 200 |
"""
|
| 201 |
-
Build a vocab deck from raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
words = _extract_candidate_words(text)
|
| 204 |
if not words:
|
| 205 |
raise ValueError("No candidate words found in text.")
|
| 206 |
|
| 207 |
translator = GoogleTranslator(source="auto", target=target_lang)
|
| 208 |
cards = []
|
| 209 |
-
for w in words:
|
| 210 |
try:
|
| 211 |
trans = translator.translate(w)
|
| 212 |
except Exception:
|
|
@@ -215,12 +304,14 @@ def generate_flashcards_from_text(
|
|
| 215 |
continue
|
| 216 |
if trans.strip().lower() == w.strip().lower():
|
| 217 |
continue
|
|
|
|
| 218 |
card = {
|
| 219 |
"front": w,
|
| 220 |
"back": trans,
|
| 221 |
"content_type": "conversation_vocab",
|
| 222 |
"language": target_lang,
|
| 223 |
}
|
|
|
|
| 224 |
_ensure_card_stats(card)
|
| 225 |
cards.append(card)
|
| 226 |
|
|
@@ -239,3 +330,16 @@ def generate_flashcards_from_text(
|
|
| 239 |
return deck_path
|
| 240 |
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
"""
Flashcards Tools - Enhanced with FlashcardGenerator and DifficultyScorer
"""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Any

from deep_translator import GoogleTranslator

from .config import get_user_dir

# Import advanced generators (with fallback)
try:
    from .flashcard_generator import FlashcardGenerator
    HAS_FLASHCARD_GENERATOR = True
except ImportError:
    HAS_FLASHCARD_GENERATOR = False

try:
    from .difficulty_scorer import get_difficulty_scorer
    HAS_DIFFICULTY_SCORER = True
except ImportError:
    HAS_DIFFICULTY_SCORER = False


def _get_decks_dir(username: str) -> Path:
    """Returns the directory where all of a user's decks are stored."""
    user_dir = get_user_dir(username)
    decks_dir = user_dir / "decks"
    decks_dir.mkdir(parents=True, exist_ok=True)
    # ...


def list_user_decks(username: str) -> Dict[str, Path]:
    """Returns a mapping of deck name -> deck json path."""
    decks_dir = _get_decks_dir(username)
    deck_files = sorted(decks_dir.glob("*.json"))
    decks: Dict[str, Path] = {}
    # ...
        except Exception:
            name = path.stem

        if name in decks and decks[name] != path:
            name = f"{name} ({path.stem})"
        decks[name] = path
    # ...


def _ensure_card_stats(card: Dict) -> None:
    """Ensure that a card has simple spaced-repetition stats."""
    if "score" not in card:
        card["score"] = 0
    if "reviews" not in card:
        card["reviews"] = 0


def _add_difficulty_to_card(card: Dict) -> Dict:
    """Add difficulty scoring to a card if DifficultyScorer is available."""
    if HAS_DIFFICULTY_SCORER:
        try:
            scorer = get_difficulty_scorer()
            return scorer.score_flashcard(card)
        except Exception:
            pass
    return card


def load_deck(path: Path) -> Dict:
    """Loads a deck from JSON with stats for spaced repetition."""
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        data = {}

    if "cards" not in data or not isinstance(data["cards"], list):
        data["cards"] = []
    if "name" not in data:
        # ...


def save_deck(path: Path, deck: Dict) -> None:
    """Saves deck to JSON."""
    if "cards" not in deck:
        deck["cards"] = []
    if "name" not in deck:
        # ...
    if "tags" not in deck or not isinstance(deck["tags"], list):
        deck["tags"] = []

    for card in deck["cards"]:
        _ensure_card_stats(card)

    path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")


def _extract_candidate_words(text: str) -> List[str]:
    """Simple tokenizer & filter for candidate vocab words."""
    tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
    out = []
    seen = set()
    # ...
    return out


def generate_flashcards_from_ocr_results(
    username: str,
    ocr_results: List[Dict],
    deck_name: str = "ocr",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
    use_advanced_generator: bool = True,
) -> Path:
    """
    Takes OCR results and constructs a vocab deck.

    Args:
        username: User identifier
        ocr_results: List of OCR result dicts with 'text' key
        deck_name: Name for the deck
        target_lang: Target language for translations
        tags: Optional tags for the deck
        use_advanced_generator: Whether to use FlashcardGenerator

    Returns:
        Path to the saved deck
    """
    # Try advanced generator first
    if use_advanced_generator and HAS_FLASHCARD_GENERATOR:
        try:
            generator = FlashcardGenerator()
            flashcard_data = generator.generate_flashcards(ocr_results, target_lang)
            cards = flashcard_data.get('cards', [])

            if cards:
                # Add difficulty scores
                if HAS_DIFFICULTY_SCORER:
                    scorer = get_difficulty_scorer()
                    cards = scorer.score_all_flashcards(cards)

                # Ensure stats
                for card in cards:
                    _ensure_card_stats(card)

                decks_dir = _get_decks_dir(username)
                deck_path = decks_dir / f"{deck_name}.json"

                deck = {
                    "name": deck_name,
                    "cards": cards,
                    "tags": tags or ["ocr"],
                    "metadata": flashcard_data.get('metadata', {})
                }
                save_deck(deck_path, deck)
                return deck_path
        except Exception as e:
            print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")

    # Fallback to simple extraction
    all_text = []
    for res in ocr_results:
        t = res.get("text") or res.get("raw_text") or res.get("original_text") or ""
        if t:
            all_text.append(t)
    joined = "\n".join(all_text)
    # ...

    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words[:20]:  # Limit to 20 words
        try:
            trans = translator.translate(w)
        except Exception:
            # ...
            continue
        if trans.strip().lower() == w.strip().lower():
            continue

        card = {
            "front": w,
            "back": trans,
            "content_type": "ocr_vocab",
            "language": target_lang,
        }
        card = _add_difficulty_to_card(card)
        _ensure_card_stats(card)
        cards.append(card)

    # ...
    return deck_path


def generate_flashcards_from_text(
    username: str,
    text: str,
    deck_name: str = "conversation",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
    source_lang: Optional[str] = None,
) -> Path:
    """
    Build a vocab deck from raw text.

    Args:
        username: User identifier
        text: Raw text to extract vocabulary from
        deck_name: Name for the deck
        target_lang: Target language for translations
        tags: Optional tags for the deck
        source_lang: Source language (auto-detect if None)

    Returns:
        Path to the saved deck
    """
    # Try advanced generator first
    if HAS_FLASHCARD_GENERATOR:
        try:
            generator = FlashcardGenerator()

            # Create fake OCR result
            ocr_result = {
                'original_text': text,
                'text': text,
                'detected_language': source_lang or 'auto',
            }

            flashcard_data = generator.generate_flashcards([ocr_result], target_lang)
            cards = flashcard_data.get('cards', [])

            if cards:
                if HAS_DIFFICULTY_SCORER:
                    scorer = get_difficulty_scorer()
                    cards = scorer.score_all_flashcards(cards)

                for card in cards:
                    card['content_type'] = 'conversation_vocab'
                    _ensure_card_stats(card)

                decks_dir = _get_decks_dir(username)
                deck_path = decks_dir / f"{deck_name}.json"

                deck = {
                    "name": deck_name,
                    "cards": cards,
                    "tags": tags or ["conversation"],
                }
                save_deck(deck_path, deck)
                return deck_path
        except Exception as e:
            print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")

    # Fallback
    words = _extract_candidate_words(text)
    if not words:
        raise ValueError("No candidate words found in text.")

    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words[:20]:
        try:
            trans = translator.translate(w)
        except Exception:
            # ...
            continue
        if trans.strip().lower() == w.strip().lower():
            continue

        card = {
            "front": w,
            "back": trans,
            "content_type": "conversation_vocab",
            "language": target_lang,
        }
        card = _add_difficulty_to_card(card)
        _ensure_card_stats(card)
        cards.append(card)

    # ...
    return deck_path


def add_difficulty_to_deck(deck: Dict) -> Dict:
    """Add difficulty scores to all cards in a deck."""
    if not HAS_DIFFICULTY_SCORER:
        return deck

    try:
        scorer = get_difficulty_scorer()
        deck["cards"] = scorer.score_all_flashcards(deck.get("cards", []))
        deck["statistics"] = scorer.get_statistics(deck["cards"])
    except Exception as e:
        print(f"[flashcards_tools] Difficulty scoring failed: {e}")

    return deck
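Usage sketch for the new flashcards API (editor's illustration, not part of the commit; "demo_user" and the sample sentence are placeholders, the repo root is assumed to be on PYTHONPATH, and deep-translator needs network access):

# Hypothetical example: build a small deck from raw text, then inspect it.
from src.app.flashcards_tools import generate_flashcards_from_text, load_deck

deck_path = generate_flashcards_from_text(
    username="demo_user",                      # placeholder user name
    text="Der Hund läuft schnell durch den Park.",
    deck_name="conversation",
    target_lang="en",
)

deck = load_deck(deck_path)
print(deck["name"], "-", len(deck["cards"]), "cards")
for card in deck["cards"][:3]:
    # "score" and "reviews" are guaranteed by _ensure_card_stats
    print(card["front"], "->", card["back"], "| score:", card["score"])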
src/app/ocr_tools.py
CHANGED
@@ -1,22 +1,331 @@
import io
from typing import Any, Dict, List, Optional

from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator
-from src.app.config import get_user_dir  # keep this if you use it

    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")


def ocr_and_translate_batch(

@@ -25,40 +334,41 @@ def ocr_and_translate_batch(
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
-    Runs OCR on a batch of images
    """
-    results: List[Dict] = []
    for img_bytes in images:
-            {
-                "text": "",
-                "translation": "",
-                "target_lang": target_lang,
-            }
-        )
    return results
# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
"""

import io
import re
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator

# Try to import optional dependencies
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from langdetect import detect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    from paddleocr import PaddleOCR
    HAS_PADDLEOCR = True
    _paddle_ocr = None
except ImportError:
    HAS_PADDLEOCR = False
    _paddle_ocr = None


# Language code mapping
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}

# Tesseract language codes for each supported language
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}


def _get_paddle_ocr():
    """Lazily initialize PaddleOCR"""
    global _paddle_ocr
    if HAS_PADDLEOCR and _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr


def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Filter out pinyin and keep only Chinese characters.
    Preserves complete sentences with Chinese characters.
    """
    lines = text.split('\n')
    filtered_lines = []

    for line in lines:
        line_stripped = line.strip()
        if not line_stripped:
            continue

        # Check if line contains Chinese characters
        has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', line))

        # Check if line is pure pinyin
        is_pinyin = bool(re.match(r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$', line_stripped))

        if is_pinyin:
            continue

        if has_chinese:
            chinese_parts = re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf]+', line)
            if chinese_parts:
                filtered_lines.append(''.join(chinese_parts))

    return '\n'.join(filtered_lines)


def detect_language_from_text(text: str) -> str:
    """Detect language, with special handling for Chinese characters"""
    has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text))
    if has_chinese:
        return 'zh-cn'

    has_japanese = bool(re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text))
    if has_japanese:
        return 'ja'

    has_korean = bool(re.search(r'[\uac00-\ud7af]', text))
    if has_korean:
        return 'ko'

    if HAS_LANGDETECT:
        try:
            return detect(text)
        except:
            pass

    return 'en'


def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Apply image preprocessing for better OCR accuracy"""
    if not HAS_CV2:
        return img_array

    # Convert to grayscale if needed
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    if method == 'simple':
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    elif method == 'clahe':
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'denoised':
        kernel = np.ones((2, 2), np.uint8)
        denoised = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel, iterations=1)
        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'advanced':
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    else:
        return gray


def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Use PaddleOCR for text extraction (best for Chinese)"""
    paddle = _get_paddle_ocr()
    if paddle is None:
        return None, 0

    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_array = np.array(img)

        result = paddle.ocr(img_array, cls=True)

        if not result or len(result) == 0 or result[0] is None:
            return None, 0

        texts = []
        scores = []
        for line in result[0]:
            if line and len(line) >= 2:
                text_info = line[1]
                if isinstance(text_info, tuple) and len(text_info) >= 2:
                    texts.append(text_info[0])
                    scores.append(text_info[1])

        if not texts:
            return None, 0

        full_text = '\n'.join(texts)
        avg_confidence = sum(scores) / len(scores) if scores else 0

        return full_text, avg_confidence * 100

    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0


def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Use Tesseract with multiple preprocessing methods"""
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)

    best_text = ""
    best_confidence = 0
    best_method = ""

    # Try different preprocessing methods
    methods = ['simple', 'adaptive', 'clahe', 'denoised']
    if HAS_CV2:
        methods.append('advanced')

    for method in methods:
        try:
            if HAS_CV2:
                processed = _preprocess_image(img_array, method)
                processed_img = Image.fromarray(processed)
            else:
                processed_img = img

            # Get OCR data with confidence
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)

            # Calculate average confidence
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0

            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method

        except Exception as e:
            continue

    return best_text.strip(), best_confidence, best_method


def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language, confidence, method
    """
    best_text = ""
    best_method = ""
    best_confidence = 0

    # Determine Tesseract language string
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped

    # Try PaddleOCR first (best for Chinese)
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf

    # Try Tesseract (fallback or if PaddleOCR failed)
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text and (tess_conf > best_confidence or not best_text):
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf

    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # Filter pinyin for Chinese text
    filtered_text = filter_pinyin_keep_chinese(best_text)
    if not filtered_text.strip():
        filtered_text = best_text

    # Detect language
    detected_lang = detect_language_from_text(filtered_text)

    # Translate
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception as e:
        translated = ""

    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }


def ocr_and_translate_batch(
    # ...
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Runs OCR on a batch of images with advanced processing.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)

    Returns:
        List of dicts with OCR results
    """
    results = []

    for img_bytes in images:
        result = ocr_single_image(
            image_bytes=img_bytes,
            target_lang=target_lang,
            use_paddle=prefer_ocr_local and HAS_PADDLEOCR
        )

        # Convert to expected format for backward compatibility
        results.append({
            "text": result.get("original_text", ""),
            "translation": result.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": result.get("detected_language", "unknown"),
            "confidence": result.get("confidence", 0),
            "method": result.get("method", "unknown"),
        })

    return results


# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Simple OCR using pytesseract (backward compatibility)"""
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    text = pytesseract.image_to_string(img)
    return text.strip()
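Usage sketch for the new OCR pipeline (editor's illustration, not part of the commit). The first parameters of ocr_and_translate_batch are elided in the diff; the call below assumes they are the images list and target_lang as described in the docstring, and the image file names are placeholders:

# Hypothetical example: run batch OCR + translation on local image files.
from pathlib import Path
from src.app.ocr_tools import ocr_and_translate_batch

images = [Path(name).read_bytes() for name in ["menu.jpg", "street_sign.png"]]
results = ocr_and_translate_batch(images, target_lang="en", prefer_ocr_local=True)

for res in results:
    # Backward-compatible keys plus the new metadata fields
    print(res["detected_language"], res["method"], res["confidence"])
    print(res["text"])
    print(res["translation"])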
src/app/quiz_tools.py
CHANGED
@@ -1,17 +1,329 @@
import json
import random
from datetime import datetime
from .config import get_user_dir
-from .flashcards_tools import

    reading_passages = [
        f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
        f"Here is a short story based on the topic '{topic}'.",

@@ -25,28 +337,25 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int

        if q_type == "translate_phrase":
            questions.append({
                "type": "semantic_translate_phrase",
-                "prompt": f"Translate
-                '{passage}'",
                "answer": "(model evaluated)",
                "explanation": f"Checks ability to translate topic '{topic}'."
            })
        elif q_type == "summarize":
            questions.append({
                "type": "semantic_summarize",
-                "prompt": f"Summarize
-                {passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks comprehension of topic '{topic}'."
            })
        elif q_type == "interpret":
            questions.append({
                "type": "semantic_interpret",
-                "prompt": f"Interpret meaning
-                {passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks conceptual understanding of '{topic}'."
            })

@@ -58,5 +367,59 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int
        "id": quiz_id,
        "created_at": ts,
        "topic": topic,
        "questions": questions,
    }
# -*- coding: utf-8 -*-
"""
Quiz Tools - AI-Powered Quiz Generation from Flashcards
Supports multiple question types and uses OpenAI API for intelligent quiz creation
"""

import json
import os
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

from .config import get_user_dir
from .flashcards_tools import load_deck, list_user_decks

# Try to import OpenAI
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False


class QuizGenerator:
    """Generate intelligent quizzes using OpenAI API"""

    QUESTION_TYPES = [
        'multiple_choice',
        'fill_in_blank',
        'true_false',
        'matching',
        'short_answer'
    ]

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
        """
        Initialize the quiz generator

        Args:
            api_key: OpenAI API key (uses env var if not provided)
            model: Model to use for quiz generation
        """
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.model = model
        self.client = None

        if HAS_OPENAI and self.api_key:
            try:
                self.client = OpenAI(api_key=self.api_key)
            except Exception as e:
                print(f"[QuizGenerator] OpenAI init failed: {e}")

    def _prepare_flashcard_context(self, flashcards: List[Dict], max_cards: int = 20) -> str:
        """Prepare flashcard data as context for AI"""
        selected_cards = flashcards[:max_cards] if len(flashcards) > max_cards else flashcards

        context_parts = []
        for idx, card in enumerate(selected_cards, 1):
            card_info = (
                f"{idx}. Word: {card.get('front', '')}\n"
                f"   Translation: {card.get('back', '')}\n"
                f"   Language: {card.get('language', 'unknown')}\n"
                f"   Context: {card.get('context', 'N/A')}"
            )
            context_parts.append(card_info)

        return "\n\n".join(context_parts)

    def _create_quiz_prompt(self, flashcards: List[Dict], num_questions: int = 30) -> str:
        """Create the prompt for AI quiz generation"""
        flashcard_context = self._prepare_flashcard_context(flashcards)

        prompt = f"""You are an expert language teacher creating a QUESTION BANK to test students' knowledge of vocabulary.

Based on the following flashcards, generate exactly {num_questions} diverse quiz questions.

FLASHCARDS:
{flashcard_context}

REQUIREMENTS:
1. Generate exactly {num_questions} questions
2. Use different question types: multiple_choice, fill_in_blank, true_false, matching, short_answer
3. Questions should test different aspects: vocabulary recall, context understanding, usage
4. Each question must include the correct answer
5. For multiple choice questions, provide 4 options with one correct answer
6. For matching questions, provide 4 word-translation pairs
7. Make questions challenging but fair
8. Vary difficulty levels across questions

OUTPUT FORMAT (JSON):
{{
  "quiz_title": "Vocabulary Quiz",
  "total_questions": {num_questions},
  "questions": [
    {{
      "question_number": 1,
      "type": "multiple_choice",
      "question": "What does 'word' mean?",
      "options": ["Option A", "Option B", "Option C", "Option D"],
      "correct_answer": "Option B",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 2,
      "type": "fill_in_blank",
      "question": "Complete: The ___ ran quickly.",
      "correct_answer": "cat",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 3,
      "type": "true_false",
      "question": "'Word' means 'definition' in English.",
      "correct_answer": false,
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 4,
      "type": "matching",
      "question": "Match the words to their correct translations",
      "pairs": [
        {{"word": "word1", "translation": "translation1"}},
        {{"word": "word2", "translation": "translation2"}},
        {{"word": "word3", "translation": "translation3"}},
        {{"word": "word4", "translation": "translation4"}}
      ],
      "correct_answer": "All pairs are correctly matched",
      "explanation": "Brief explanation."
    }},
    {{
      "question_number": 5,
      "type": "short_answer",
      "question": "Explain the usage of 'word'.",
      "correct_answer": "Model answer here.",
      "explanation": "Brief explanation."
    }}
  ]
}}

Generate the quiz now:"""

        return prompt

    def generate_quiz_with_ai(self, flashcards: List[Dict], num_questions: int = 30) -> Dict[str, Any]:
        """Generate quiz using OpenAI API"""
        if not self.client:
            raise ValueError("OpenAI client not initialized. Check API key.")

        if not flashcards:
            raise ValueError("No flashcards provided for quiz generation")

        prompt = self._create_quiz_prompt(flashcards, num_questions)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert language teacher who creates engaging, educational quizzes. Always respond with valid JSON."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                response_format={"type": "json_object"},
                temperature=0.7,
                max_tokens=4000
            )

            quiz_content = response.choices[0].message.content
            quiz_data = json.loads(quiz_content)

            quiz_data['metadata'] = {
                'generator': 'AI-Powered Quiz Generator',
                'model': self.model,
                'source_flashcards': len(flashcards),
                'tokens_used': response.usage.total_tokens if response.usage else 0
            }

            return quiz_data

        except Exception as e:
            print(f"[QuizGenerator] AI generation failed: {e}")
            raise

    def generate_simple_quiz(self, flashcards: List[Dict], num_questions: int = 5) -> Dict[str, Any]:
        """Generate a simple quiz without AI (fallback)"""
        if not flashcards:
            raise ValueError("No flashcards provided")

        questions = []
        used_cards = random.sample(flashcards, min(num_questions * 2, len(flashcards)))

        for i, card in enumerate(used_cards[:num_questions]):
            q_type = random.choice(['multiple_choice', 'fill_in_blank', 'true_false'])

            if q_type == 'multiple_choice':
                # Create wrong options from other cards
                other_cards = [c for c in flashcards if c != card]
                wrong_options = random.sample(
                    [c.get('back', 'Unknown') for c in other_cards],
                    min(3, len(other_cards))
                )
                while len(wrong_options) < 3:
                    wrong_options.append(f"Not {card.get('back', 'this')}")

                options = wrong_options + [card.get('back', '')]
                random.shuffle(options)

                questions.append({
                    "question_number": i + 1,
                    "type": "multiple_choice",
                    "question": f"What does '{card.get('front', '')}' mean?",
                    "options": options,
                    "correct_answer": card.get('back', ''),
                    "explanation": f"'{card.get('front', '')}' translates to '{card.get('back', '')}'."
                })

            elif q_type == 'fill_in_blank':
                questions.append({
                    "question_number": i + 1,
                    "type": "fill_in_blank",
                    "question": f"Translate: '{card.get('front', '')}' = _____",
                    "correct_answer": card.get('back', ''),
                    "explanation": f"The correct translation is '{card.get('back', '')}'."
                })

            elif q_type == 'true_false':
                is_true = random.choice([True, False])
                if is_true:
                    shown_answer = card.get('back', '')
                else:
                    other_cards = [c for c in flashcards if c != card]
                    if other_cards:
                        shown_answer = random.choice(other_cards).get('back', 'something else')
                    else:
                        shown_answer = f"Not {card.get('back', 'this')}"

                questions.append({
                    "question_number": i + 1,
                    "type": "true_false",
                    "question": f"'{card.get('front', '')}' means '{shown_answer}'.",
                    "correct_answer": is_true,
                    "explanation": f"'{card.get('front', '')}' actually means '{card.get('back', '')}'."
                })

        return {
            "quiz_title": "Vocabulary Quiz",
            "total_questions": len(questions),
            "questions": questions,
            "metadata": {
                "generator": "Simple Quiz Generator",
                "source_flashcards": len(flashcards)
            }
        }


def create_quiz_from_deck(
    username: str,
    deck_name: str,
    num_questions: int = 5,
    use_ai: bool = True,
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create a quiz from a user's flashcard deck

    Args:
        username: User identifier
        deck_name: Name of the deck to create quiz from
        num_questions: Number of questions for the quiz session
        use_ai: Whether to use AI for quiz generation
        api_key: Optional OpenAI API key

    Returns:
        Quiz dictionary with questions
    """
    decks = list_user_decks(username)

    if deck_name not in decks:
        raise ValueError(f"Deck '{deck_name}' not found")

    deck = load_deck(decks[deck_name])
    flashcards = deck.get('cards', [])

    if not flashcards:
        raise ValueError(f"Deck '{deck_name}' has no cards")

    generator = QuizGenerator(api_key=api_key)

    try:
        if use_ai and generator.client:
            # Generate larger question bank with AI
            quiz = generator.generate_quiz_with_ai(flashcards, num_questions=30)
        else:
            # Use simple generator
            quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)
    except Exception as e:
        print(f"[quiz_tools] AI quiz generation failed: {e}, using simple generator")
        quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)

    # Add quiz metadata
    ts = datetime.utcnow().strftime("%Y-%m-%dT%H-%M-%SZ")
    quiz['id'] = f"quiz_{ts}"
    quiz['created_at'] = ts
    quiz['deck_name'] = deck_name
    quiz['questions_per_session'] = num_questions

    return quiz


def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int = 5) -> Dict[str, Any]:
    """
    Create a semantic quiz based on a topic (for conversation practice)

    Args:
        username: User identifier
        topic: Topic for the quiz
        num_questions: Number of questions

    Returns:
        Quiz dictionary
    """
    reading_passages = [
        f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
        f"Here is a short story based on the topic '{topic}'.",
    # ...

        if q_type == "translate_phrase":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_translate_phrase",
                "prompt": f"Translate:\n\n'{passage}'",
                "answer": "(model evaluated)",
                "explanation": f"Checks ability to translate topic '{topic}'."
            })
        elif q_type == "summarize":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_summarize",
                "prompt": f"Summarize:\n\n{passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks comprehension of topic '{topic}'."
            })
        elif q_type == "interpret":
            questions.append({
                "question_number": i + 1,
                "type": "semantic_interpret",
                "prompt": f"Interpret meaning:\n\n{passage}",
                "answer": "(model evaluated)",
                "explanation": f"Checks conceptual understanding of '{topic}'."
            })
    # ...

        "id": quiz_id,
        "created_at": ts,
        "topic": topic,
        "total_questions": len(questions),
        "questions": questions,
    }


def save_quiz(username: str, quiz: Dict[str, Any]) -> Path:
    """Save a quiz to the user's directory"""
    user_dir = get_user_dir(username)
    quizzes_dir = user_dir / "quizzes"
    quizzes_dir.mkdir(parents=True, exist_ok=True)

    quiz_id = quiz.get('id', f"quiz_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}")
    quiz_path = quizzes_dir / f"{quiz_id}.json"

    with open(quiz_path, 'w', encoding='utf-8') as f:
        json.dump(quiz, f, ensure_ascii=False, indent=2)

    return quiz_path


def load_quiz(username: str, quiz_id: str) -> Dict[str, Any]:
    """Load a saved quiz"""
    user_dir = get_user_dir(username)
    quiz_path = user_dir / "quizzes" / f"{quiz_id}.json"

    if not quiz_path.exists():
        raise FileNotFoundError(f"Quiz '{quiz_id}' not found")

    with open(quiz_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def list_user_quizzes(username: str) -> List[Dict[str, Any]]:
    """List all quizzes for a user"""
    user_dir = get_user_dir(username)
    quizzes_dir = user_dir / "quizzes"

    if not quizzes_dir.exists():
        return []

    quizzes = []
    for quiz_file in sorted(quizzes_dir.glob("*.json"), reverse=True):
        try:
            with open(quiz_file, 'r', encoding='utf-8') as f:
                quiz = json.load(f)
            quizzes.append({
                'id': quiz.get('id', quiz_file.stem),
                'title': quiz.get('quiz_title', 'Untitled Quiz'),
                'created_at': quiz.get('created_at', ''),
                'total_questions': quiz.get('total_questions', 0),
                'deck_name': quiz.get('deck_name', ''),
            })
        except Exception:
            continue

    return quizzes
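Usage sketch for the new quiz API (editor's illustration, not part of the commit; "demo_user" and "ocr" are placeholder names, and use_ai=False forces the offline fallback so no OPENAI_API_KEY is required):

# Hypothetical example: build a quiz from an existing deck and persist it.
from src.app.quiz_tools import create_quiz_from_deck, save_quiz, list_user_quizzes

quiz = create_quiz_from_deck(
    username="demo_user",
    deck_name="ocr",
    num_questions=5,
    use_ai=False,          # use the simple generator for this sketch
)

quiz_path = save_quiz("demo_user", quiz)
print("saved:", quiz_path)

for q in quiz["questions"]:
    print(q["question_number"], q["type"], "-", q["question"])

print([entry["id"] for entry in list_user_quizzes("demo_user")])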