Shubham170793 committed on
Commit
2239986
·
verified ·
1 Parent(s): a2da12f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +85 -80
src/streamlit_app.py CHANGED
@@ -277,7 +277,18 @@ def _hash_content(file_path):
277
  with open(file_path, "rb") as f:
278
  while chunk := f.read(8192):
279
  hasher.update(chunk)
280
- return hasher.hexdigest()[:12] # keep short hash for filenames
 
 
 
 
 
 
 
 
 
 
 
281
 
282
  # --- Document selection ---
283
  if doc_choice == "-- Select --":
@@ -308,91 +319,81 @@ else:
308
  file_hash = _hash_content(temp_path)
309
  doc_identifier = f"{doc_name}_{file_hash}" # unique per content
310
 
311
- # πŸ” Reprocess only if new or changed document
312
- if "doc_ready" not in st.session_state or st.session_state.get("last_doc") != doc_identifier:
313
- status = st.empty()
314
- status.info("πŸ“€ Upload complete β€” reading document...")
315
-
316
- # 🧩 Step 1: Extract text and TOC
317
- text, toc, toc_source = extract_text_from_pdf(temp_path)
318
-
319
- # 🧩 Step 2: Chunk the text
320
- status.info("πŸ“‘ Parsing and chunking document...")
321
- chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
322
-
323
- # ✅ Step 2.5: Registry pre-check (Commit #4 – with fresh suggestion rebuild)
324
- if "registry" in st.session_state:
325
- registry = st.session_state["registry"]
326
- existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
327
-
328
- if existing_doc:
329
- doc_data = registry.get_doc(existing_doc["name"])
330
-
331
- # 🧠 Update session with existing document data
332
- st.session_state.update({
333
- "text": doc_data.get("chunks", ""),
334
- "chunks": doc_data.get("chunks", []),
335
- "embeddings": doc_data.get("embeddings"),
336
- "index": doc_data.get("index"),
337
- "doc_ready": True,
338
- "active_doc": existing_doc["name"],
339
- "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
340
- })
341
-
342
- refresh_suggestions(doc_name, doc_data.get("toc", []), doc_data.get("chunks", []))
343
-
344
- # 🧭 Optional: Dev visibility message
345
- if show_dev:
346
- st.info(f"🧠 Reused cached registry entry for {doc_name} β€” suggestions refreshed.")
347
-
348
- st.rerun()
349
-
350
-
351
- # 🧩 Step 3: Embed and index
352
- status.info("🧠 Building embeddings and search index...")
353
- embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
354
- index = build_faiss_index(embeddings)
355
-
356
- # ✅ Step 3.5: Register in session registry
357
- if "registry" not in st.session_state:
358
- st.session_state["registry"] = DocumentRegistry()
359
-
360
- registry = st.session_state["registry"]
361
- doc_id = registry.register(temp_path, chunks, embeddings, index)
362
- st.session_state["active_doc"] = doc_id
363
-
364
- # 🧩 Step 4: Final success message
365
- status.success("βœ… Document processed successfully β€” all set to query your assistant!")
366
 
367
- # 🧠 Store everything in session state
 
 
 
368
  st.session_state.update({
369
- "text": text,
370
- "toc": toc,
371
- "chunks": chunks,
372
- "embeddings": embeddings,
373
- "index": index,
374
  "doc_ready": True,
375
- "last_doc": doc_identifier,
376
- "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
377
  })
378
 
379
- # 🧠 Build fresh suggestions and rerun
380
- refresh_suggestions(doc_name, toc, chunks)
 
 
 
 
 
 
 
381
  st.rerun()
382
 
383
- else:
384
- # ♻️ Reuse cached session state (same file)
385
- text = st.session_state["text"]
386
- toc = st.session_state["toc"]
387
- chunks = st.session_state["chunks"]
388
- embeddings = st.session_state["embeddings"]
389
- index = st.session_state["index"]
390
- query_suggestions = st.session_state.get("query_suggestions_fixed", [])
391
- active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
392
- st.info(st.session_state.get("status_text", f"πŸ“„ {active_name or 'Document'} is ready for queries."))
393
-
394
- # --- Ask section ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  st.markdown("### πŸ’¬ Ask the Assistant")
 
396
  if query_suggestions:
397
  visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
398
  cols = st.columns(min(3, len(visible)))
@@ -405,7 +406,11 @@ else:
405
  st.session_state["show_more"] = not st.session_state["show_more"]
406
  st.rerun()
407
 
408
- user_query = st.text_input("Type your question or click one above:", key="user_query_input")
 
 
 
 
409
 
410
  if user_query.strip():
411
  reasoning_mode = mode == "Extended (Document + General)"
@@ -419,8 +424,8 @@ else:
419
  if not reasoning_mode and not answer.startswith("⚠️"):
420
  answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
421
  answer = re.sub(r"(^|\n)-\s*", r"\1<br>β€’ ", answer)
422
- st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
423
 
 
424
 
425
 
426
  # ==========================================================
 
277
  with open(file_path, "rb") as f:
278
  while chunk := f.read(8192):
279
  hasher.update(chunk)
280
+ return hasher.hexdigest()[:12] # short unique hash for same-name files
281
+
282
+
283
+ def refresh_suggestions(doc_name, toc, chunks):
284
+ """Refresh dynamic suggestions and reset related states."""
285
+ st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
286
+ toc, chunks, doc_name
287
+ )
288
+ st.session_state["user_query_input"] = ""
289
+ st.session_state["selected_suggestion"] = None
290
+ st.session_state["show_more"] = False
291
+
292
 
293
  # --- Document selection ---
294
  if doc_choice == "-- Select --":
 
319
  file_hash = _hash_content(temp_path)
320
  doc_identifier = f"{doc_name}_{file_hash}" # unique per content
321
 
322
+ # ✅ Step 0: Initialize registry
323
+ if "registry" not in st.session_state:
324
+ st.session_state["registry"] = DocumentRegistry()
325
+ registry = st.session_state["registry"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
+ # ✅ Step 1: Check if document already in registry
328
+ existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
329
+ if existing_doc:
330
+ doc_data = registry.get_doc(existing_doc["name"])
331
  st.session_state.update({
332
+ "text": doc_data.get("text", ""),
333
+ "toc": doc_data.get("toc", []),
334
+ "chunks": doc_data.get("chunks", []),
335
+ "embeddings": doc_data.get("embeddings"),
336
+ "index": doc_data.get("index"),
337
  "doc_ready": True,
338
+ "active_doc": existing_doc["name"],
339
+ "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
340
  })
341
 
342
+ # ✅ Refresh suggestions when switching
343
+ refresh_suggestions(
344
+ existing_doc["name"],
345
+ st.session_state["toc"],
346
+ st.session_state["chunks"]
347
+ )
348
+
349
+ if show_dev:
350
+ st.info(f"🧠 Loaded from registry: {doc_name}")
351
  st.rerun()
352
 
353
+ # ✅ Step 2: If new document → process normally
354
+ status = st.empty()
355
+ status.info("πŸ“€ Upload complete β€” reading document...")
356
+
357
+ # 🧩 Step 2.1: Extract text and TOC
358
+ text, toc, toc_source = extract_text_from_pdf(temp_path)
359
+
360
+ # 🧩 Step 2.2: Chunk the text
361
+ status.info("πŸ“‘ Parsing and chunking document...")
362
+ chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
363
+
364
+ # 🧩 Step 2.3: Embed and index
365
+ status.info("🧠 Building embeddings and search index...")
366
+ embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
367
+ index = build_faiss_index(embeddings)
368
+
369
+ # 🧩 Step 2.4: Register document
370
+ doc_id = registry.register(temp_path, chunks, embeddings, index)
371
+ st.session_state["active_doc"] = doc_id
372
+
373
+ # 🧩 Step 2.5: Success message + suggestions
374
+ status.success("βœ… Document processed successfully β€” all set to query your assistant!")
375
+ refresh_suggestions(doc_name, toc, chunks)
376
+
377
+ # 🧠 Update session
378
+ st.session_state.update({
379
+ "text": text,
380
+ "toc": toc,
381
+ "chunks": chunks,
382
+ "embeddings": embeddings,
383
+ "index": index,
384
+ "doc_ready": True,
385
+ "last_doc": doc_identifier,
386
+ "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
387
+ })
388
+ st.rerun()
389
+
390
+ # --- Display Ready Message + Ask Section ---
391
+ if st.session_state.get("doc_ready"):
392
+ active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
393
+ st.info(st.session_state.get("status_text", f"πŸ“„ {active_name or 'Document'} is ready for queries."))
394
+
395
  st.markdown("### πŸ’¬ Ask the Assistant")
396
+ query_suggestions = st.session_state.get("query_suggestions_fixed", [])
397
  if query_suggestions:
398
  visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
399
  cols = st.columns(min(3, len(visible)))
 
406
  st.session_state["show_more"] = not st.session_state["show_more"]
407
  st.rerun()
408
 
409
+ user_query = st.text_input(
410
+ "Type your question or click one above:",
411
+ key="user_query_input",
412
+ label_visibility="visible"
413
+ )
414
 
415
  if user_query.strip():
416
  reasoning_mode = mode == "Extended (Document + General)"
 
424
  if not reasoning_mode and not answer.startswith("⚠️"):
425
  answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
426
  answer = re.sub(r"(^|\n)-\s*", r"\1<br>β€’ ", answer)
 
427
 
428
+ st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
429
 
430
 
431
  # ==========================================================