Lucas ARRIESSE committed · Commit 5f1cdfa · Parent(s): d2dc29e

Use single method for retrieving TDocs + prepare code to refine

api/docs.py CHANGED (+62 -69)
@@ -1,4 +1,6 @@
 import asyncio
+from pathlib import Path
+import traceback
 from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
@@ -12,6 +14,7 @@ import requests
 import subprocess
 import pandas as pd
 import re
+import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
@@ -33,14 +36,54 @@ NSMAP = {
 # ================================== Converting of files to .txt ====================================
 
 
+def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
+    """
+    Converts the given file bytes using Libreoffice headless to the specified file type.
+
+    Args:
+        contents: File contents
+        filename: File base name WITHOUT THE EXTENSION
+        input_ext: Input extension (WITHOUT THE DOT)
+        output_ext: Output extension (WITHOUT THE DOT)
+        filter: The conversion filter to use.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        dir_path = Path(tmpdir)
+        input_file_path = dir_path / f"{filename}.{input_ext}"
+        output_file_path = dir_path / f"{filename}.{output_ext}"
+
+        # write the memory contents to the input file
+        with open(input_file_path, "wb") as in_file:
+            in_file.write(contents.read())
+
+        out_bytes = io.BytesIO()
+
+        # convert using libreoffice
+        subprocess.run([
+            "libreoffice",
+            "--headless",
+            "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
+            "--outdir", tmpdir,
+            input_file_path
+        ], check=True)
+
+        with open(output_file_path, mode="rb") as out:
+            out_bytes.write(out.read())
+
+        out_bytes.seek(0)
+        return out_bytes
+
+
 def get_docx_archive(url: str) -> zipfile.ZipFile:
     """Fetches the docx from the URL and returns it as a ZipFile object"""
     if not url.endswith("zip"):
         raise ValueError("URL must point to a ZIP file")
+
     doc_id = os.path.splitext(os.path.basename(url))[0]
     resp = requests.get(url, verify=False, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
+
     resp.raise_for_status()
 
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
@@ -49,28 +92,9 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
             docx_bytes = zf.read(file_name)
             return zipfile.ZipFile(io.BytesIO(docx_bytes))
         elif file_name.endswith(".doc"):
-
-
-
-
-            with open(input_path, "wb") as f:
-                f.write(docx_bytes)
-
-            subprocess.run([
-                "libreoffice",
-                "--headless",
-                "--convert-to", "docx",
-                "--outdir", "/tmp",
-                input_path
-            ], check=True)
-
-            with open(output_path, "rb") as f:
-                docx_bytes = f.read()
-
-            os.remove(input_path)
-            os.remove(output_path)
-
-            return zipfile.ZipFile(io.BytesIO(docx_bytes))
+            in_bytes = io.BytesIO(zf.read(file_name))
+            docx_bytes = convert_file(in_bytes, doc_id, "doc", "docx")
+            return zipfile.ZipFile(docx_bytes)
 
     raise ValueError("No docx/doc file found in the archive")
 
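The two ad-hoc LibreOffice invocations above now funnel through the new convert_file helper. As a sketch of how the helper composes — assuming a libreoffice binary on PATH, and with a hypothetical sample.doc input — the .doc branch is equivalent to:

    # Sketch only: round-trip a .doc through the new helper (libreoffice must be on PATH)
    import io

    with open("sample.doc", "rb") as f:        # "sample.doc" is a hypothetical input
        in_bytes = io.BytesIO(f.read())

    docx = convert_file(in_bytes, "sample", "doc", "docx")   # same call shape as in the diff above

    with open("sample.docx", "wb") as f:
        f.write(docx.read())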
@@ -107,7 +131,7 @@ def clean_document_xml(root: etree._Element) -> None:
             parent.remove(elem)
 
 
-def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) ->
+def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
     """Creates a new docx with the modified XML"""
     output = io.BytesIO()
 
@@ -127,33 +151,24 @@ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._El
         new_zip.writestr('word/document.xml', xml_str)
 
     output.seek(0)
-    return output
+    return output
 
 
-def docx_to_txt(doc_id: str, url: str):
+def docx_to_txt(doc_id: str, url: str) -> str:
     docx_zip = get_docx_archive(url)
     root = parse_document_xml(docx_zip)
     clean_document_xml(root)
+
     modified_bytes = create_modified_docx(docx_zip, root)
 
-
-
-    with open(input_path, "wb") as f:
-        f.write(modified_bytes)
 
-
-        "libreoffice",
-        "--headless",
-        "--convert-to", "txt",
-        "--outdir", "/tmp",
-        input_path
-    ], check=True)
+    final_bytes = convert_file(
+        modified_bytes, f"{doc_id}", "docx", "txt")
 
-
-
+    final_bytes_text = str(final_bytes.read(), encoding="utf-8")
 
-    os.remove(input_path)
-    os.remove(output_path)
+    txt_data = [line.strip()
+                for line in final_bytes_text.splitlines() if line.strip()]
     return txt_data
 
 
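docx_to_txt now chains get_docx_archive → parse/clean → create_modified_docx → convert_file and returns the stripped, non-empty lines. A minimal sketch of calling it, with an illustrative doc id and URL (not real ones):

    # Sketch only: doc id and URL are illustrative
    lines = docx_to_txt("S1-123456", "https://example.org/ftp/S1-123456.zip")
    full = "\n".join(lines)   # gen_reqs() below joins the lines the same way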
@@ -255,27 +270,6 @@ def download_tdocs(req: DocDownloadRequest):
 
     logging.info(f"Downloading TDocs: {document_ids}")
 
-    # Retrieve all doc URLs to download
-    doc_urls_req = requests.post(DOC_FINDER_BASE_URL + "find/batch",
-                                 headers={
-                                     "Content-Type": "application/json"
-                                 },
-                                 data=json.dumps({
-                                     "doc_ids": document_ids
-                                 }),
-                                 verify=False)
-
-    doc_urls_req.raise_for_status()
-    doc_urls = doc_urls_req.json()
-
-    # early check to bail out if no doc is available.
-    if len(doc_urls["results"]) == 0:
-        logging.warning(
-            f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
-
-        raise HTTPException(
-            status_code=501, detail="Got no URL results for docs {documents}. 3GPP index may not be up to date")
-
     documents_content: Dict[str, bytes] = {}
     failed_documents: List[str] = []
 
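With the doc-finder batch lookup removed, the per-document URLs presumably arrive in the request itself; the loop in the next hunk reads doc.document and doc.url, which suggests a payload along these lines (shape inferred from the diff, not confirmed):

    # Inferred request shape for download_tdocs — field names taken from the loop below,
    # values hypothetical
    payload = {
        "documents": [
            {"document": "S1-123456", "url": "https://example.org/ftp/S1-123456.zip"},
        ]
    }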
@@ -292,18 +286,17 @@ def download_tdocs(req: DocDownloadRequest):
                 "utf-8")
             return False, error_message
 
-    for
-        success, content = _process_single_document(
-        documents_content[
+    for doc in req.documents:
+        success, content = _process_single_document(doc.document, doc.url)
+        documents_content[doc.document] = content
         if not success:
-            failed_documents.append(doc_id)
+            failed_documents.append(doc.doc_id)
 
     # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
     for requested_doc_id in document_ids:
         if requested_doc_id not in documents_content:
             error_msg = (
                 f"Failed to retrieve or process document '{requested_doc_id}'. "
-                "The 3GPP index may not be up to date, or the document might be unavailable."
             ).encode("utf-8")
 
             documents_content[requested_doc_id] = error_msg
@@ -361,9 +354,9 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
     try:
         full = "\n".join(docx_to_txt(doc_id, url))
     except Exception as e:
-
-
-        return [DocRequirements(document=doc_id, context="
+        fmt = "".join(traceback.format_exception(e))
+        logging.error(f"Failed to process doc {doc_id} : {fmt}")
+        return [DocRequirements(document=doc_id, context="Failed to process document", requirements=[])]
 
     try:
         await concurrency_sema.acquire()
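Note that traceback.format_exception(e) with a single positional exception argument only exists on Python 3.10+; on older interpreters the three-argument form is needed. A self-contained sketch:

    import traceback

    try:
        raise ValueError("boom")
    except Exception as e:
        fmt = "".join(traceback.format_exception(e))  # Python >= 3.10
        # pre-3.10 equivalent:
        # fmt = "".join(traceback.format_exception(type(e), e, e.__traceback__))
        print(fmt)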