| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import gradio as gr |
| import PyPDF2 |
| import re |
| import json |
| from typing import List, Dict |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
| import torch |
| import tempfile |
| import os |
|
|
| |
| print("Loading models... This may take a minute on first run.") |
|
|
| model_name = "valhalla/t5-small-qg-hl" |
| tokenizer = AutoTokenizer.from_pretrained(model_name) |
| model = AutoModelForSeq2SeqLM.from_pretrained(model_name) |
|
|
| |
| model.eval() |
| device = torch.device("cpu") |
| model.to(device) |
|
|
def extract_key_phrases(text: str) -> List[str]:
    """Collect up to five candidate answer phrases from *text*.

    Candidates come from runs of capitalized words (proper-noun style)
    plus a few definition-like regex patterns; duplicates are dropped
    while preserving first-seen order.
    """
    pool: List[str] = []

    # Proper-noun style runs of capitalized words (keep the first three).
    proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    pool.extend(proper_nouns[:3])

    # Phrasings that usually introduce a key concept or definition.
    definition_regexes = (
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    )
    for regex in definition_regexes:
        pool.extend(re.findall(regex, text, re.IGNORECASE)[:2])

    # Keep only substantial phrases, de-duplicate preserving order, cap at 5.
    trimmed = [phrase.strip() for phrase in pool if len(phrase.strip()) > 5]
    return list(dict.fromkeys(trimmed))[:5]
|
|
def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
    """Generate one question for *answer* highlighted inside *context*.

    Uses the module-level T5 model with decoding parameters that vary by
    *question_type*; returns "" on failure or if the result is too short.
    """
    try:
        # Highlight the answer span the way the qg checkpoint expects.
        prompt = f"generate question: <hl> {answer} <hl> {context}"

        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        ).to(device)

        # "what" questions use slightly more conservative decoding settings.
        if question_type == "what":
            temperature, num_beams = 0.7, 4
        else:
            temperature, num_beams = 0.85, 5

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=True,
                temperature=temperature
            )

        raw = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Strip any leading "question:" / "q:" prefix the model may emit.
        cleaned = re.sub(r'^(question:|q:)', '', raw, flags=re.IGNORECASE).strip()

        # Post-process for punctuation/casing and why/how rewriting.
        polished = improve_question(cleaned, answer, context, question_type)

        return polished if len(polished) > 10 else ""

    except Exception as e:
        print(f"Error generating question: {e}")
        return ""
|
|
def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Post-process a generated question: fix punctuation and casing, and
    optionally rewrite it toward the requested "why"/"how" style."""
    # Normalize terminal punctuation to a question mark.
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'

    # Capitalize the first character (question is never empty at this point).
    if question:
        question = question[0].upper() + question[1:]

    lowered = question.lower()
    if question_type == "why" and not lowered.startswith("why"):
        # Only rewrite questions that look like factual / yes-no forms.
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not lowered.startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)

    return question
|
|
def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Try to build a 'why' question from causal cues in *context*;
    otherwise fall back to a generic phrasing or the original question."""
    causal_cues = (
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})',
    )

    # A causal cue plus an identifiable sentence subject yields a rewrite.
    has_causal_cue = any(re.search(cue, context, re.IGNORECASE) for cue in causal_cues)
    if has_causal_cue:
        subject_hit = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)', context
        )
        if subject_hit:
            return f"Why does {subject_hit.group(1).lower()} occur?"

    # Otherwise anchor the question on the first few words of the answer.
    answer_words = answer.split()
    if len(answer_words) > 3:
        return f"Why is {' '.join(answer_words[:4])}... important?"

    return base_question
|
|
def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Try to build a 'how' question from process cues in *context*;
    otherwise fall back to a subject+verb phrasing or the original question."""
    process_cues = (
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        r'by ([^,.]{10,60})',
        r'through ([^,.]{10,60})',
    )

    for cue in process_cues:
        hit = re.search(cue, context, re.IGNORECASE)
        if hit is None:
            continue
        groups = hit.groups()
        if len(groups) > 1:
            # Two-group cue: the second group names the process itself.
            return f"How does {groups[1].lower()} work?"
        return f"How is {groups[0].lower()} achieved?"

    # Fall back to "subject + action verb" phrasing when such a verb appears.
    action_verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
    if action_verbs:
        first_verb = action_verbs[0]
        subject_hit = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + first_verb, context, re.IGNORECASE)
        if subject_hit:
            return f"How does {subject_hit.group(1).lower()} {first_verb.lower()}?"

    return base_question
|
|
def extract_text_from_pdf(pdf_file) -> str:
    """Extract text from an uploaded PDF.

    Accepts either a filesystem path (str) or a file-like object — PdfReader
    handles both, so the original isinstance branch (whose two arms were
    identical) is collapsed into a single call.

    Returns the concatenated page text, or an "Error reading PDF: ..."
    string on failure (callers check for the "Error" prefix).
    """
    text = ""
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # Pages with no extractable text (e.g. scans) return None/empty.
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

    return text
|
|
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unusual symbols from extracted text."""
    # Collapse every whitespace run (newlines, tabs, multiples) to one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Drop anything other than word chars, whitespace, and basic punctuation.
    sanitized = re.sub(r'[^\w\s.,;!?-]', '', collapsed)
    return sanitized.strip()
|
|
def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks, then prepend a small
    overlap from the previous chunk to every chunk after the first."""
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Greedily pack sentences into chunks shorter than max_chunk_size.
    base_chunks: List[str] = []
    buffer = ""
    for sentence in sentences:
        if len(buffer) + len(sentence) < max_chunk_size:
            buffer += " " + sentence
        else:
            if buffer:
                base_chunks.append(buffer.strip())
            buffer = sentence
    if buffer:
        base_chunks.append(buffer.strip())

    # Prefix each later chunk with the tail of its predecessor: the last
    # two sentences when available, else the last *overlap* characters.
    result: List[str] = []
    for idx, chunk in enumerate(base_chunks):
        if idx > 0 and overlap > 0:
            prev_parts = base_chunks[idx - 1].split('. ')
            if len(prev_parts) > 1:
                carry = '. '.join(prev_parts[-2:])
            else:
                carry = base_chunks[idx - 1][-overlap:]
            chunk = carry + " " + chunk
        result.append(chunk)

    return result
|
|
def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Produce up to *num_questions* flashcard dicts from one text chunk,
    cycling through what/why/how question types across candidates."""
    cards: List[Dict[str, str]] = []

    # Chunks with too little text rarely yield usable questions.
    if len(chunk.split()) < 20:
        return []

    try:
        # Answer candidates: extracted key phrases plus up to two long sentences.
        long_sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        candidates = extract_key_phrases(chunk) + long_sentences[:2]

        if not candidates:
            return []

        type_cycle = ["what", "why", "how"]

        made = 0
        for idx, candidate in enumerate(candidates):
            if made >= num_questions:
                break

            # Very short candidates make poor answers.
            if len(candidate.split()) < 3:
                continue

            # Rotate the question type so the deck gets variety.
            qtype = type_cycle[idx % len(type_cycle)]

            question = generate_questions(chunk, candidate, question_type=qtype)

            # Reject empty questions and trivial question == answer echoes.
            if question and question != candidate:
                cards.append({
                    "question": question,
                    "answer": candidate,
                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
                    "type": qtype
                })
                made += 1

    except Exception as e:
        print(f"Error generating QA: {e}")

    return cards
|
|
def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Generator: extract text from the PDF, chunk it, generate flashcards,
    and yield (status, csv, json, display) tuples for streaming UI updates.

    BUG FIX: this function is a generator (it contains `yield`), so the old
    `return <tuple>` on the no-file path was silently discarded and the UI
    never saw the message — such messages must be *yielded* before returning.
    """
    if pdf_file is None:
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return

    try:
        yield "π Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)

        # extract_text_from_pdf signals failure with an "Error..." string.
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return

        if len(raw_text.strip()) < 100:
            yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
            return

        yield "π§Ή Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)

        yield "βοΈ Chunking text into sections...", "", "", "Processing..."
        # Cap total work at max_chunks sections.
        chunks = chunk_text(cleaned_text)[:max_chunks]

        all_flashcards = []
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            yield f"π΄ Generating flashcards... ({i+1}/{total_chunks} chunks processed)", "", "", "Processing..."
            all_flashcards.extend(generate_qa_pairs(chunk, questions_per_chunk))

        if not all_flashcards:
            yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
            return

        yield "β Finalizing...", "", "", "Almost done..."

        display_text = format_flashcards_display(all_flashcards)

        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)

        # CSV with doubled quotes so Anki imports fields containing quotes.
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)

        # Summarize how many cards of each type were produced.
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        breakdown = ", ".join(f"{count} {qtype}" for qtype, count in types_count.items())
        stats = f"β Done! Generated {len(all_flashcards)} flashcards ({breakdown})"

        yield stats, csv_output, json_output, display_text

    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg
|
|
def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Render flashcards as a markdown document with a per-type breakdown."""
    out = [f"## π΄ Generated {len(flashcards)} Flashcards\n"]

    # Tally cards per question type for the breakdown line.
    tally: Dict[str, int] = {}
    for card in flashcards:
        qtype = card.get('type', 'what')
        tally[qtype] = tally.get(qtype, 0) + 1

    breakdown = ', '.join(f'{count} {qtype.upper()}' for qtype, count in tally.items())
    out.append(f"**Breakdown:** {breakdown}\n")
    out.append("---\n")

    # One markdown section per card, numbered from 1.
    for num, card in enumerate(flashcards, 1):
        qtype = card.get('type', 'what').upper()
        if qtype == "WHAT":
            emoji = "β"
        elif qtype == "WHY":
            emoji = "π€"
        else:
            emoji = "π§"

        out.append(f"### {emoji} Card {num} - {qtype}")
        out.append(f"**Q:** {card['question']}")
        out.append(f"**A:** {card['answer']}")
        out.append(f"*Context: {card['context'][:100]}...*\n")
        out.append("---\n")

    return "\n".join(out)
|
|
def create_sample_flashcard():
    """Return a rendered demo set of three flashcards (what/why/how)."""
    def _card(question, answer, context, qtype):
        # Helper keeps the sample-data declaration compact.
        return {"question": question, "answer": answer, "context": context, "type": qtype}

    samples = [
        _card(
            "What is photosynthesis?",
            "Photosynthesis is the process by which plants convert sunlight into energy.",
            "Photosynthesis is the process by which plants convert sunlight into energy...",
            "what",
        ),
        _card(
            "Why do plants need chlorophyll?",
            "Chlorophyll absorbs light energy needed for photosynthesis.",
            "Chlorophyll absorbs light energy needed for photosynthesis...",
            "why",
        ),
        _card(
            "How do plants convert light into chemical energy?",
            "Through the process of photosynthesis in the chloroplasts.",
            "Through the process of photosynthesis in the chloroplasts...",
            "how",
        ),
    ]
    return format_flashcards_display(samples)
|
|
| |
# CSS injected into the Gradio app. The .flashcard-container / .question /
# .answer classes are available to any HTML-rendered cards (the current
# display uses plain markdown, so these act as optional styling hooks).
custom_css = """
.flashcard-container {
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.question {
    font-size: 1.2em;
    font-weight: bold;
    margin-bottom: 10px;
}
.answer {
    font-size: 1em;
    opacity: 0.9;
}
"""
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI wiring — built at import time so `demo` exists for launch().
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    # Header / feature overview.
    gr.Markdown("""
    # π PDF to Flashcards Generator (Enhanced)

    Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.

    **β¨ New Features:**
    - π― Generates **What** questions (factual)
    - π€ Generates **Why** questions (reasoning)
    - π§ Generates **How** questions (process)
    - π Improved question quality and variety
    - π§ Better answer extraction

    **Core Features:**
    - π§ Uses local CPU-friendly AI (no GPU needed)
    - π Extracts text from any PDF
    - βοΈ Intelligently chunks content
    - π΄ Generates diverse question-answer pairs
    - πΎ Export to CSV (Anki-compatible) or JSON

    *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
    """)

    with gr.Row():
        # Left column: inputs and tuning controls.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"  # hands process_pdf a path string, not bytes
            )

            with gr.Row():
                # Caps how many QA pairs generate_qa_pairs attempts per chunk.
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                # Bounds total work: process_pdf truncates the chunk list.
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )

            process_btn = gr.Button("π Generate Flashcards", variant="primary")

            gr.Markdown("""
            ### π‘ Tips:
            - Text-based PDFs work best (scanned images won't work)
            - Academic papers and articles work great
            - Adjust "Questions per section" for more variety
            - Higher questions per section = more Why/How questions
            """)

        # Right column: live status plus the rendered flashcards.
        with gr.Column(scale=2):
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )

            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )

    # Export panes: CSV (for Anki import) and raw JSON.
    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")

        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")

    # process_pdf is a generator, so these outputs stream progressive updates.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )

    # Static demo section showing the expected output format.
    gr.Markdown("---")
    gr.Markdown("### π― Example Output Format")
    gr.Markdown(create_sample_flashcard())
|
|
| if __name__ == "__main__": |
| demo.launch() |