Update src/streamlit_app.py
Browse files- src/streamlit_app.py +85 -80
src/streamlit_app.py
CHANGED
|
@@ -277,7 +277,18 @@ def _hash_content(file_path):
|
|
| 277 |
with open(file_path, "rb") as f:
|
| 278 |
while chunk := f.read(8192):
|
| 279 |
hasher.update(chunk)
|
| 280 |
-
return hasher.hexdigest()[:12] #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
# --- Document selection ---
|
| 283 |
if doc_choice == "-- Select --":
|
|
@@ -308,91 +319,81 @@ else:
|
|
| 308 |
file_hash = _hash_content(temp_path)
|
| 309 |
doc_identifier = f"{doc_name}_{file_hash}" # unique per content
|
| 310 |
|
| 311 |
-
#
|
| 312 |
-
if "
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
# 🧩 Step 1: Extract text and TOC
|
| 317 |
-
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 318 |
-
|
| 319 |
-
# 🧩 Step 2: Chunk the text
|
| 320 |
-
status.info("π Parsing and chunking document...")
|
| 321 |
-
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 322 |
-
|
| 323 |
-
# ✅ Step 2.5: Registry pre-check (Commit #4 — with fresh suggestion rebuild)
|
| 324 |
-
if "registry" in st.session_state:
|
| 325 |
-
registry = st.session_state["registry"]
|
| 326 |
-
existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
|
| 327 |
-
|
| 328 |
-
if existing_doc:
|
| 329 |
-
doc_data = registry.get_doc(existing_doc["name"])
|
| 330 |
-
|
| 331 |
-
# 🧠 Update session with existing document data
|
| 332 |
-
st.session_state.update({
|
| 333 |
-
"text": doc_data.get("chunks", ""),
|
| 334 |
-
"chunks": doc_data.get("chunks", []),
|
| 335 |
-
"embeddings": doc_data.get("embeddings"),
|
| 336 |
-
"index": doc_data.get("index"),
|
| 337 |
-
"doc_ready": True,
|
| 338 |
-
"active_doc": existing_doc["name"],
|
| 339 |
-
"status_text": f"β
{doc_name} already processed β loaded from registry."
|
| 340 |
-
})
|
| 341 |
-
|
| 342 |
-
refresh_suggestions(doc_name, doc_data.get("toc", []), doc_data.get("chunks", []))
|
| 343 |
-
|
| 344 |
-
# 🧠 Optional: Dev visibility message
|
| 345 |
-
if show_dev:
|
| 346 |
-
st.info(f"π§ Reused cached registry entry for {doc_name} β suggestions refreshed.")
|
| 347 |
-
|
| 348 |
-
st.rerun()
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
# 🧩 Step 3: Embed and index
|
| 352 |
-
status.info("π§ Building embeddings and search index...")
|
| 353 |
-
embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
|
| 354 |
-
index = build_faiss_index(embeddings)
|
| 355 |
-
|
| 356 |
-
# ✅ Step 3.5: Register in session registry
|
| 357 |
-
if "registry" not in st.session_state:
|
| 358 |
-
st.session_state["registry"] = DocumentRegistry()
|
| 359 |
-
|
| 360 |
-
registry = st.session_state["registry"]
|
| 361 |
-
doc_id = registry.register(temp_path, chunks, embeddings, index)
|
| 362 |
-
st.session_state["active_doc"] = doc_id
|
| 363 |
-
|
| 364 |
-
# 🧩 Step 4: Final success message
|
| 365 |
-
status.success("β
Document processed successfully β all set to query your assistant!")
|
| 366 |
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
| 368 |
st.session_state.update({
|
| 369 |
-
"text": text,
|
| 370 |
-
"toc": toc,
|
| 371 |
-
"chunks": chunks,
|
| 372 |
-
"embeddings": embeddings,
|
| 373 |
-
"index": index,
|
| 374 |
"doc_ready": True,
|
| 375 |
-
"
|
| 376 |
-
"status_text": "β
|
| 377 |
})
|
| 378 |
|
| 379 |
-
#
|
| 380 |
-
refresh_suggestions(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
st.rerun()
|
| 382 |
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
st.markdown("### π¬ Ask the Assistant")
|
|
|
|
| 396 |
if query_suggestions:
|
| 397 |
visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
|
| 398 |
cols = st.columns(min(3, len(visible)))
|
|
@@ -405,7 +406,11 @@ else:
|
|
| 405 |
st.session_state["show_more"] = not st.session_state["show_more"]
|
| 406 |
st.rerun()
|
| 407 |
|
| 408 |
-
user_query = st.text_input(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
if user_query.strip():
|
| 411 |
reasoning_mode = mode == "Extended (Document + General)"
|
|
@@ -419,8 +424,8 @@ else:
|
|
| 419 |
if not reasoning_mode and not answer.startswith("β οΈ"):
|
| 420 |
answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
|
| 421 |
answer = re.sub(r"(^|\n)-\s*", r"\1<br>β’ ", answer)
|
| 422 |
-
st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
|
| 423 |
|
|
|
|
| 424 |
|
| 425 |
|
| 426 |
# ==========================================================
|
|
|
|
| 277 |
with open(file_path, "rb") as f:
|
| 278 |
while chunk := f.read(8192):
|
| 279 |
hasher.update(chunk)
|
| 280 |
+
return hasher.hexdigest()[:12] # short unique hash for same-name files
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def refresh_suggestions(doc_name, toc, chunks):
|
| 284 |
+
"""Refresh dynamic suggestions and reset related states."""
|
| 285 |
+
st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
|
| 286 |
+
toc, chunks, doc_name
|
| 287 |
+
)
|
| 288 |
+
st.session_state["user_query_input"] = ""
|
| 289 |
+
st.session_state["selected_suggestion"] = None
|
| 290 |
+
st.session_state["show_more"] = False
|
| 291 |
+
|
| 292 |
|
| 293 |
# --- Document selection ---
|
| 294 |
if doc_choice == "-- Select --":
|
|
|
|
| 319 |
file_hash = _hash_content(temp_path)
|
| 320 |
doc_identifier = f"{doc_name}_{file_hash}" # unique per content
|
| 321 |
|
| 322 |
+
# ✅ Step 0: Initialize registry
|
| 323 |
+
if "registry" not in st.session_state:
|
| 324 |
+
st.session_state["registry"] = DocumentRegistry()
|
| 325 |
+
registry = st.session_state["registry"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
+
# ✅ Step 1: Check if document already in registry
|
| 328 |
+
existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
|
| 329 |
+
if existing_doc:
|
| 330 |
+
doc_data = registry.get_doc(existing_doc["name"])
|
| 331 |
st.session_state.update({
|
| 332 |
+
"text": doc_data.get("text", ""),
|
| 333 |
+
"toc": doc_data.get("toc", []),
|
| 334 |
+
"chunks": doc_data.get("chunks", []),
|
| 335 |
+
"embeddings": doc_data.get("embeddings"),
|
| 336 |
+
"index": doc_data.get("index"),
|
| 337 |
"doc_ready": True,
|
| 338 |
+
"active_doc": existing_doc["name"],
|
| 339 |
+
"status_text": f"β
{doc_name} already processed β loaded from registry."
|
| 340 |
})
|
| 341 |
|
| 342 |
+
# ✅ Refresh suggestions when switching
|
| 343 |
+
refresh_suggestions(
|
| 344 |
+
existing_doc["name"],
|
| 345 |
+
st.session_state["toc"],
|
| 346 |
+
st.session_state["chunks"]
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
if show_dev:
|
| 350 |
+
st.info(f"π§ Loaded from registry: {doc_name}")
|
| 351 |
st.rerun()
|
| 352 |
|
| 353 |
+
# ✅ Step 2: If new document — process normally
|
| 354 |
+
status = st.empty()
|
| 355 |
+
status.info("π€ Upload complete β reading document...")
|
| 356 |
+
|
| 357 |
+
# 🧩 Step 2.1: Extract text and TOC
|
| 358 |
+
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 359 |
+
|
| 360 |
+
# 🧩 Step 2.2: Chunk the text
|
| 361 |
+
status.info("π Parsing and chunking document...")
|
| 362 |
+
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 363 |
+
|
| 364 |
+
# 🧩 Step 2.3: Embed and index
|
| 365 |
+
status.info("π§ Building embeddings and search index...")
|
| 366 |
+
embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
|
| 367 |
+
index = build_faiss_index(embeddings)
|
| 368 |
+
|
| 369 |
+
# 🧩 Step 2.4: Register document
|
| 370 |
+
doc_id = registry.register(temp_path, chunks, embeddings, index)
|
| 371 |
+
st.session_state["active_doc"] = doc_id
|
| 372 |
+
|
| 373 |
+
# 🧩 Step 2.5: Success message + suggestions
|
| 374 |
+
status.success("β
Document processed successfully β all set to query your assistant!")
|
| 375 |
+
refresh_suggestions(doc_name, toc, chunks)
|
| 376 |
+
|
| 377 |
+
# 🧠 Update session
|
| 378 |
+
st.session_state.update({
|
| 379 |
+
"text": text,
|
| 380 |
+
"toc": toc,
|
| 381 |
+
"chunks": chunks,
|
| 382 |
+
"embeddings": embeddings,
|
| 383 |
+
"index": index,
|
| 384 |
+
"doc_ready": True,
|
| 385 |
+
"last_doc": doc_identifier,
|
| 386 |
+
"status_text": "β
Document processed successfully β all set to query your assistant!"
|
| 387 |
+
})
|
| 388 |
+
st.rerun()
|
| 389 |
+
|
| 390 |
+
# --- Display Ready Message + Ask Section ---
|
| 391 |
+
if st.session_state.get("doc_ready"):
|
| 392 |
+
active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
|
| 393 |
+
st.info(st.session_state.get("status_text", f"π {active_name or 'Document'} is ready for queries."))
|
| 394 |
+
|
| 395 |
st.markdown("### π¬ Ask the Assistant")
|
| 396 |
+
query_suggestions = st.session_state.get("query_suggestions_fixed", [])
|
| 397 |
if query_suggestions:
|
| 398 |
visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
|
| 399 |
cols = st.columns(min(3, len(visible)))
|
|
|
|
| 406 |
st.session_state["show_more"] = not st.session_state["show_more"]
|
| 407 |
st.rerun()
|
| 408 |
|
| 409 |
+
user_query = st.text_input(
|
| 410 |
+
"Type your question or click one above:",
|
| 411 |
+
key="user_query_input",
|
| 412 |
+
label_visibility="visible"
|
| 413 |
+
)
|
| 414 |
|
| 415 |
if user_query.strip():
|
| 416 |
reasoning_mode = mode == "Extended (Document + General)"
|
|
|
|
| 424 |
if not reasoning_mode and not answer.startswith("β οΈ"):
|
| 425 |
answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
|
| 426 |
answer = re.sub(r"(^|\n)-\s*", r"\1<br>β’ ", answer)
|
|
|
|
| 427 |
|
| 428 |
+
st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
|
| 429 |
|
| 430 |
|
| 431 |
# ==========================================================
|