Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Community

Shubham170793 committed on Oct 25, 2025

Commit

2239986

verified ·

1 Parent(s): a2da12f

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +85 -80

src/streamlit_app.py CHANGED Viewed

@@ -277,7 +277,18 @@ def _hash_content(file_path):
     with open(file_path, "rb") as f:
         while chunk := f.read(8192):
             hasher.update(chunk)
-    return hasher.hexdigest()[:12]  # keep short hash for filenames
 # --- Document selection ---
 if doc_choice == "-- Select --":
@@ -308,91 +319,81 @@ else:
         file_hash = _hash_content(temp_path)
         doc_identifier = f"{doc_name}_{file_hash}"  # unique per content
-        # 🔍 Reprocess only if new or changed document
-        if "doc_ready" not in st.session_state or st.session_state.get("last_doc") != doc_identifier:
-            status = st.empty()
-            status.info("📤 Upload complete — reading document...")
-            # 🧩 Step 1: Extract text and TOC
-            text, toc, toc_source = extract_text_from_pdf(temp_path)
-            # 🧩 Step 2: Chunk the text
-            status.info("📑 Parsing and chunking document...")
-            chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
-            # ✅ Step 2.5: Registry pre-check (Commit #4 – with fresh suggestion rebuild)
-            if "registry" in st.session_state:
-                registry = st.session_state["registry"]
-                existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
-                if existing_doc:
-                    doc_data = registry.get_doc(existing_doc["name"])
-                    # 🧠 Update session with existing document data
-                    st.session_state.update({
-                        "text": doc_data.get("chunks", ""),
-                        "chunks": doc_data.get("chunks", []),
-                        "embeddings": doc_data.get("embeddings"),
-                        "index": doc_data.get("index"),
-                        "doc_ready": True,
-                        "active_doc": existing_doc["name"],
-                        "status_text": f"✅ {doc_name} already processed — loaded from registry."
-                    })
-                    refresh_suggestions(doc_name, doc_data.get("toc", []), doc_data.get("chunks", []))
-                    # 🧭 Optional: Dev visibility message
-                    if show_dev:
-                        st.info(f"🧠 Reused cached registry entry for {doc_name} — suggestions refreshed.")
-                    st.rerun()
-            # 🧩 Step 3: Embed and index
-            status.info("🧠 Building embeddings and search index...")
-            embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
-            index = build_faiss_index(embeddings)
-            # ✅ Step 3.5: Register in session registry
-            if "registry" not in st.session_state:
-                st.session_state["registry"] = DocumentRegistry()
-            registry = st.session_state["registry"]
-            doc_id = registry.register(temp_path, chunks, embeddings, index)
-            st.session_state["active_doc"] = doc_id
-            # 🧩 Step 4: Final success message
-            status.success("✅ Document processed successfully — all set to query your assistant!")
-            # 🧠 Store everything in session state
             st.session_state.update({
-                "text": text,
-                "toc": toc,
-                "chunks": chunks,
-                "embeddings": embeddings,
-                "index": index,
                 "doc_ready": True,
-                "last_doc": doc_identifier,
-                "status_text": "✅ Document processed successfully — all set to query your assistant!"
             })
-            # 🧠 Build fresh suggestions and rerun
-            refresh_suggestions(doc_name, toc, chunks)
             st.rerun()
-        else:
-            # ♻️ Reuse cached session state (same file)
-            text = st.session_state["text"]
-            toc = st.session_state["toc"]
-            chunks = st.session_state["chunks"]
-            embeddings = st.session_state["embeddings"]
-            index = st.session_state["index"]
-            query_suggestions = st.session_state.get("query_suggestions_fixed", [])
-            active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
-            st.info(st.session_state.get("status_text", f"📄 {active_name or 'Document'} is ready for queries."))
-        # --- Ask section ---
         st.markdown("### 💬 Ask the Assistant")
         if query_suggestions:
             visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
             cols = st.columns(min(3, len(visible)))
@@ -405,7 +406,11 @@ else:
                 st.session_state["show_more"] = not st.session_state["show_more"]
                 st.rerun()
-        user_query = st.text_input("Type your question or click one above:", key="user_query_input")
         if user_query.strip():
             reasoning_mode = mode == "Extended (Document + General)"
@@ -419,8 +424,8 @@ else:
             if not reasoning_mode and not answer.startswith("⚠️"):
                 answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
                 answer = re.sub(r"(^|\n)-\s*", r"\1<br>• ", answer)
-            st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 # ==========================================================

     with open(file_path, "rb") as f:
         while chunk := f.read(8192):
             hasher.update(chunk)
+    return hasher.hexdigest()[:12]  # short unique hash for same-name files
+def refresh_suggestions(doc_name, toc, chunks):
+    """Refresh dynamic suggestions and reset related states."""
+    st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
+        toc, chunks, doc_name
+    )
+    st.session_state["user_query_input"] = ""
+    st.session_state["selected_suggestion"] = None
+    st.session_state["show_more"] = False
 # --- Document selection ---
 if doc_choice == "-- Select --":
         file_hash = _hash_content(temp_path)
         doc_identifier = f"{doc_name}_{file_hash}"  # unique per content
+        # ✅ Step 0: Initialize registry
+        if "registry" not in st.session_state:
+            st.session_state["registry"] = DocumentRegistry()
+        registry = st.session_state["registry"]
+        # ✅ Step 1: Check if document already in registry
+        existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
+        if existing_doc:
+            doc_data = registry.get_doc(existing_doc["name"])
             st.session_state.update({
+                "text": doc_data.get("text", ""),
+                "toc": doc_data.get("toc", []),
+                "chunks": doc_data.get("chunks", []),
+                "embeddings": doc_data.get("embeddings"),
+                "index": doc_data.get("index"),
                 "doc_ready": True,
+                "active_doc": existing_doc["name"],
+                "status_text": f"✅ {doc_name} already processed — loaded from registry."
             })
+            # ✅ Refresh suggestions when switching
+            refresh_suggestions(
+                existing_doc["name"],
+                st.session_state["toc"],
+                st.session_state["chunks"]
+            )
+            if show_dev:
+                st.info(f"🧠 Loaded from registry: {doc_name}")
             st.rerun()
+        # ✅ Step 2: If new document → process normally
+        status = st.empty()
+        status.info("📤 Upload complete — reading document...")
+        # 🧩 Step 2.1: Extract text and TOC
+        text, toc, toc_source = extract_text_from_pdf(temp_path)
+        # 🧩 Step 2.2: Chunk the text
+        status.info("📑 Parsing and chunking document...")
+        chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
+        # 🧩 Step 2.3: Embed and index
+        status.info("🧠 Building embeddings and search index...")
+        embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
+        index = build_faiss_index(embeddings)
+        # 🧩 Step 2.4: Register document
+        doc_id = registry.register(temp_path, chunks, embeddings, index)
+        st.session_state["active_doc"] = doc_id
+        # 🧩 Step 2.5: Success message + suggestions
+        status.success("✅ Document processed successfully — all set to query your assistant!")
+        refresh_suggestions(doc_name, toc, chunks)
+        # 🧠 Update session
+        st.session_state.update({
+            "text": text,
+            "toc": toc,
+            "chunks": chunks,
+            "embeddings": embeddings,
+            "index": index,
+            "doc_ready": True,
+            "last_doc": doc_identifier,
+            "status_text": "✅ Document processed successfully — all set to query your assistant!"
+        })
+        st.rerun()
+    # --- Display Ready Message + Ask Section ---
+    if st.session_state.get("doc_ready"):
+        active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
+        st.info(st.session_state.get("status_text", f"📄 {active_name or 'Document'} is ready for queries."))
         st.markdown("### 💬 Ask the Assistant")
+        query_suggestions = st.session_state.get("query_suggestions_fixed", [])
         if query_suggestions:
             visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
             cols = st.columns(min(3, len(visible)))
                 st.session_state["show_more"] = not st.session_state["show_more"]
                 st.rerun()
+        user_query = st.text_input(
+            "Type your question or click one above:",
+            key="user_query_input",
+            label_visibility="visible"
+        )
         if user_query.strip():
             reasoning_mode = mode == "Extended (Document + General)"
             if not reasoning_mode and not answer.startswith("⚠️"):
                 answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
                 answer = re.sub(r"(^|\n)-\s*", r"\1<br>• ", answer)
+            st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 # ==========================================================