Update pipeline.py
pipeline.py CHANGED (+125, -25)
@@ -2,7 +2,8 @@
 # test2: "A1YU101" thailand cross-ref
 # test3: "EBK109" thailand cross-ref
 # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
-
+import data_preprocess
+import model
 import mtdna_classifier
 import app
 import pandas as pd
@@ -17,6 +18,57 @@ import standardize_location
 # Track time
 import time
 import multiprocessing
+import gspread
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
+from oauth2client.service_account import ServiceAccountCredentials
+import io
+#––– Authentication setup –––
+GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
+GDRIVE_DATA_FOLDER_NAME = "data"
+GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
+GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
+drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
+
+def get_or_create_drive_folder(name, parent_id=None):
+    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
+    if parent_id:
+        query += f" and '{parent_id}' in parents"
+    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
+    items = results.get("files", [])
+    if items:
+        return items[0]["id"]
+    file_metadata = {
+        "name": name,
+        "mimeType": "application/vnd.google-apps.folder"
+    }
+    if parent_id:
+        file_metadata["parents"] = [parent_id]
+    file = drive_service.files().create(body=file_metadata, fields="id").execute()
+    return file["id"]
+
+def upload_file_to_drive(local_path, remote_name, folder_id):
+    file_metadata = {"name": remote_name, "parents": [folder_id]}
+    media = MediaFileUpload(local_path, resumable=True)
+    existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
+    if existing:
+        drive_service.files().delete(fileId=existing[0]["id"]).execute()
+    file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
+    return file["id"]
+
+def download_file_from_drive(remote_name, folder_id, local_path):
+    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
+    files = results.get("files", [])
+    if not files:
+        return False
+    file_id = files[0]["id"]
+    request = drive_service.files().get_media(fileId=file_id)
+    fh = io.FileIO(local_path, 'wb')
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while not done:
+        _, done = downloader.next_chunk()
+    return True
 
 def run_with_timeout(func, args=(), kwargs={}, timeout=20):
     """
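Review note: the block above calls Credentials.from_service_account_info, but this hunk only imports ServiceAccountCredentials from oauth2client (which it never uses), so Credentials, json, and os must already be available elsewhere in pipeline.py; Credentials is presumably google.oauth2.service_account.Credentials. A minimal self-contained sketch of the same auth and folder-lookup flow under that assumption (the drive_query helper and its quote escaping are illustrative additions, not part of the commit):

import json, os
from google.oauth2.service_account import Credentials  # assumed source of `Credentials`
from googleapiclient.discovery import build

creds = Credentials.from_service_account_info(
    json.loads(os.environ["GCP_CREDS_JSON"]),
    scopes=["https://www.googleapis.com/auth/drive"],
)
drive = build("drive", "v3", credentials=creds)

def drive_query(name, parent_id=None):
    # Escape single quotes so folder/file names cannot break the Drive query string.
    safe = name.replace("'", "\\'")
    q = f"name='{safe}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
    if parent_id:
        q += f" and '{parent_id}' in parents"
    return q

resp = drive.files().list(q=drive_query("mtDNA-Location-Classifier"),
                          fields="files(id, name)").execute()
print(resp.get("files", []))

Escaping matters because saveTitle comes from a paper title and feeds the file names queried by upload_file_to_drive and download_file_from_drive; an apostrophe in the title would otherwise break the f-string query.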
@@ -98,19 +150,37 @@ def pipeline_with_gemini(accessions):
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
-        id = pudID
+        id = str(pudID)
         saveTitle = title
     else:
         saveTitle = title + "_" + col_date
         id = "DirectSubmission"
-    folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
-    if not folder_path.exists():
-        cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
-        result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        print("data/"+str(id) +" created.")
-    else:
-        print("data/"+str(id) +" already exists.")
-    saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
+    # if not folder_path.exists():
+    #     cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
+    #     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    #     print("data/"+str(id) +" created.")
+    # else:
+    #     print("data/"+str(id) +" already exists.")
+    # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+    data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+    sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+
+    # Define document names
+    chunk_filename = f"{saveTitle}_merged_document.docx"
+    all_filename = f"{saveTitle}_all_merged_document.docx"
+
+    # Define local temp paths for reading/writing
+    import tempfile
+    tmp_dir = tempfile.mkdtemp()
+    file_chunk_path = os.path.join(tmp_dir, chunk_filename)
+    file_all_path = os.path.join(tmp_dir, all_filename)
+
+    # Try to download if already exists on Drive
+    chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
+    all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
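Review note: the hunk above swaps the Colab-mounted path (/content/drive/MyDrive/...) for Drive folder IDs plus a fresh temporary directory, then probes the cache with download_file_from_drive. A condensed sketch of that download-or-miss pattern, reusing the helper from this commit (fetch_cached is a hypothetical wrapper, not in the code):

import os
import tempfile

def fetch_cached(filename, folder_id):
    # One fresh temp dir per lookup; nothing persists across runs except
    # what gets uploaded back to Drive.
    tmp_dir = tempfile.mkdtemp()
    local_path = os.path.join(tmp_dir, filename)
    exists = download_file_from_drive(filename, folder_id, local_path)
    return local_path, exists

# Usage mirroring the hunk:
# file_chunk_path, chunk_exists = fetch_cached(chunk_filename, sample_folder_id)
# file_all_path, all_exists = fetch_cached(all_filename, sample_folder_id)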
@@ -154,24 +224,34 @@ def pipeline_with_gemini(accessions):
             links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-    print(links)
+    #print(links)
     links = unique_preserve_order(links)
     acc_score["source"] = links
-    chunk_path = "/"+saveTitle+"_merged_document.docx"
-    all_path = "/"+saveTitle+"_all_merged_document.docx"
-    # if chunk and all output not exist yet
-    file_chunk_path = saveLinkFolder + chunk_path
-    file_all_path = saveLinkFolder + all_path
-    if os.path.exists(file_chunk_path):
+    # chunk_path = "/"+saveTitle+"_merged_document.docx"
+    # all_path = "/"+saveTitle+"_all_merged_document.docx"
+    # # if chunk and all output not exist yet
+    # file_chunk_path = saveLinkFolder + chunk_path
+    # file_all_path = saveLinkFolder + all_path
+    # if os.path.exists(file_chunk_path):
+    #     print("File chunk exists!")
+    #     if not chunk:
+    #         text, table, document_title = model.read_docx_text(file_chunk_path)
+    #         chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    # if os.path.exists(file_all_path):
+    #     print("File all output exists!")
+    #     if not all_output:
+    #         text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+    #         all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+    if chunk_exists:
         print("File chunk exists!")
         if not chunk:
-            text, table, document_title = model.read_docx_text(file_chunk_path)
-            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
-    if os.path.exists(file_all_path):
+            text, table, document_title = model.read_docx_text(file_chunk_path)
+            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    if all_exists:
         print("File all output exists!")
         if not all_output:
-            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
-            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
     # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
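Review note: both cache branches above do the same read-and-normalize work on a cached .docx. A hypothetical refactor sketch that keeps the behaviour, assuming model.read_docx_text returns (text, tables, title) and data_preprocess.normalize_for_overlap takes a string, as the surrounding code implies:

import model
import data_preprocess

def load_cached_text(path):
    # Read the cached .docx and rebuild the normalized chunk string.
    text, table, _title = model.read_docx_text(path)
    return (data_preprocess.normalize_for_overlap(text)
            + "\n"
            + data_preprocess.normalize_for_overlap(". ".join(table)))

# if chunk_exists and not chunk:
#     chunk = load_cached_text(file_chunk_path)
# if all_exists and not all_output:
#     all_output = load_cached_text(file_all_path)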
@@ -233,8 +313,16 @@ def pipeline_with_gemini(accessions):
         all_output = all_output[:1*1024*1024]
     print("chunk len: ", len(chunk))
     print("all output len: ", len(all_output))
+    # data_preprocess.save_text_to_docx(chunk, file_chunk_path)
+    # data_preprocess.save_text_to_docx(all_output, file_all_path)
+    # Later when saving new files
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
+
+    # Upload to Drive
+    upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+    upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+
     # else:
     #   final_input = ""
     #   if all_output:
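Review note: upload_file_to_drive deletes any existing copy and re-creates the file, which changes the Drive file id on every run. An alternative sketch (not what this commit does) that updates in place and keeps the id stable, assuming the same drive_service client:

from googleapiclient.http import MediaFileUpload

def upsert_file_to_drive(local_path, remote_name, folder_id):
    media = MediaFileUpload(local_path, resumable=True)
    existing = drive_service.files().list(
        q=f"name='{remote_name}' and '{folder_id}' in parents",
        fields="files(id)").execute().get("files", [])
    if existing:
        # Update in place so the file id (and any shared links) stay stable.
        return drive_service.files().update(
            fileId=existing[0]["id"], media_body=media).execute()["id"]
    meta = {"name": remote_name, "parents": [folder_id]}
    return drive_service.files().create(
        body=meta, media_body=media, fields="id").execute()["id"]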
@@ -253,9 +341,21 @@ def pipeline_with_gemini(accessions):
     # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
 
     # Define paths for cached RAG assets
-    faiss_index_path = saveLinkFolder+"/faiss_index.bin"
-    document_chunks_path = saveLinkFolder+"/document_chunks.json"
-    structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
+    # document_chunks_path = saveLinkFolder+"/document_chunks.json"
+    # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    faiss_filename = "faiss_index.bin"
+    chunks_filename = "document_chunks.json"
+    lookup_filename = "structured_lookup.json"
+
+    # Save in temporary local directory
+    faiss_index_path = os.path.join(tmp_dir, faiss_filename)
+    document_chunks_path = os.path.join(tmp_dir, chunks_filename)
+    structured_lookup_path = os.path.join(tmp_dir, lookup_filename)
+
+    download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
+    download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
+    download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
 
     master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
         faiss_index_path, document_chunks_path, structured_lookup_path
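Review note: this hunk only pulls cached RAG assets down before model.load_rag_assets; pushing rebuilt assets back to Drive is not shown in this diff. A sketch of that complementary upload step, assuming the same helpers and variable names and that the three files exist locally after a rebuild:

import os

# After load_rag_assets (or a rebuild) produces fresh local files:
for fname, fpath in [
    ("faiss_index.bin", faiss_index_path),
    ("document_chunks.json", document_chunks_path),
    ("structured_lookup.json", structured_lookup_path),
]:
    if os.path.exists(fpath):
        upload_file_to_drive(fpath, fname, sample_folder_id)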