heymenn committed on
Commit
38f4809
·
1 Parent(s): 12d5a0c

add endpoints to extract text

Browse files
Files changed (1) hide show
  1. api/docs.py +17 -0
api/docs.py CHANGED
@@ -648,3 +648,20 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
648
  yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
649
 
650
  return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
649
 
650
  return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
651
+
652
+ # ======================================================================================================================================================================================
653
+
654
@router.post("/extract_text_from_url")
async def extract_text_from_url(url: str, http_client: AsyncClient = Depends(get_http_client)) -> dict:
    """Extract text from a given document URL and return the text content.

    Args:
        url: Location of the document to fetch.
        http_client: HTTP client injected through ``get_http_client`` and
            passed on to ``get_doc_archive``.

    Returns:
        A dict with two keys: ``"document"`` (the filename returned by
        ``get_doc_archive``) and ``"content"`` (the extracted text lines
        joined with newlines).

    Raises:
        HTTPException: With status 500 when fetching or extraction fails for
            any unexpected reason. An ``HTTPException`` raised by the helpers
            themselves is propagated unchanged.
    """
    logging.info(f"Extracting text from URL: {url}")

    try:
        # Named `doc_bytes` so the builtin `bytes` is not shadowed.
        filename, ext, doc_bytes = await get_doc_archive(url, http_client)
        text_lines = await extract_text_contents(filename, ext, doc_bytes)
        content = "\n".join(text_lines)
        return {"document": filename, "content": content}
    except HTTPException:
        # NOTE(review): assumes the helpers may raise HTTPException with a
        # meaningful status code; keep it instead of masking it as a 500.
        raise
    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"Failed to extract text from URL '{url}': {e}")
        raise HTTPException(status_code=500, detail=f"Text extraction failed: {e}") from e