Spaces:
Sleeping
Sleeping
add endpoints to extract text
Browse files- api/docs.py +17 -0
api/docs.py
CHANGED
|
@@ -648,3 +648,20 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
|
|
| 648 |
yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
|
| 649 |
|
| 650 |
return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
|
| 649 |
|
| 650 |
return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
|
| 651 |
+
|
| 652 |
+
# ======================================================================================================================================================================================
@router.post("/extract_text_from_url")
async def extract_text_from_url(url: str, http_client: AsyncClient = Depends(get_http_client)) -> dict:
    """Extract the plain-text content of the document at *url*.

    Downloads the document with the shared HTTP client, extracts its text
    line by line, and returns a JSON-serializable payload.

    Args:
        url: Location of the document to fetch.
        http_client: Injected shared ``AsyncClient`` used for the download.

    Returns:
        ``{"document": <resolved filename>, "content": <lines joined with "\\n">}``

    Raises:
        HTTPException: 500 when the download or text extraction fails.
    """
    # Lazy %-style args avoid building the message when the level is disabled.
    logging.info("Extracting text from URL: %s", url)

    try:
        # `payload` (was `bytes`) — avoid shadowing the builtin type.
        filename, ext, payload = await get_doc_archive(url, http_client)
        text_lines = await extract_text_contents(filename, ext, payload)
    except Exception as e:
        # logging.exception records the traceback; `from e` preserves the
        # original cause on the raised HTTPException for easier debugging.
        logging.exception("Failed to extract text from URL '%s'", url)
        raise HTTPException(status_code=500, detail=f"Text extraction failed: {e}") from e

    # Success path kept outside the try so it cannot be misreported as a 500.
    return {"document": filename, "content": "\n".join(text_lines)}
|