Update pipeline.py
pipeline.py CHANGED (+125, -25)
@@ -2,7 +2,8 @@
 # test2: "A1YU101" thailand cross-ref
 # test3: "EBK109" thailand cross-ref
 # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
-
+import data_preprocess
+import model
 import mtdna_classifier
 import app
 import pandas as pd
@@ -17,6 +18,57 @@ import standardize_location
 # Track time
 import time
 import multiprocessing
+import gspread
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
+from oauth2client.service_account import ServiceAccountCredentials
+import io
+#––– Authentication setup –––
+GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
+GDRIVE_DATA_FOLDER_NAME = "data"
+GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
+GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
+drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
+
+def get_or_create_drive_folder(name, parent_id=None):
+    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
+    if parent_id:
+        query += f" and '{parent_id}' in parents"
+    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
+    items = results.get("files", [])
+    if items:
+        return items[0]["id"]
+    file_metadata = {
+        "name": name,
+        "mimeType": "application/vnd.google-apps.folder"
+    }
+    if parent_id:
+        file_metadata["parents"] = [parent_id]
+    file = drive_service.files().create(body=file_metadata, fields="id").execute()
+    return file["id"]
+
+def upload_file_to_drive(local_path, remote_name, folder_id):
+    file_metadata = {"name": remote_name, "parents": [folder_id]}
+    media = MediaFileUpload(local_path, resumable=True)
+    existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
+    if existing:
+        drive_service.files().delete(fileId=existing[0]["id"]).execute()
+    file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
+    return file["id"]
+
+def download_file_from_drive(remote_name, folder_id, local_path):
+    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
+    files = results.get("files", [])
+    if not files:
+        return False
+    file_id = files[0]["id"]
+    request = drive_service.files().get_media(fileId=file_id)
+    fh = io.FileIO(local_path, 'wb')
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while not done:
+        _, done = downloader.next_chunk()
+    return True
 
 def run_with_timeout(func, args=(), kwargs={}, timeout=20):
     """
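Review note: the block above calls Credentials.from_service_account_info, but this hunk only imports ServiceAccountCredentials from oauth2client (which it never uses), so Credentials, json, and os must already be available elsewhere in pipeline.py; Credentials is presumably google.oauth2.service_account.Credentials. A minimal self-contained sketch of the same auth and folder-lookup flow under that assumption (the drive_query helper and its quote escaping are illustrative additions, not part of the commit):

import json, os
from google.oauth2.service_account import Credentials  # assumed source of `Credentials`
from googleapiclient.discovery import build

creds = Credentials.from_service_account_info(
    json.loads(os.environ["GCP_CREDS_JSON"]),
    scopes=["https://www.googleapis.com/auth/drive"],
)
drive = build("drive", "v3", credentials=creds)

def drive_query(name, parent_id=None):
    # Escape single quotes so folder/file names cannot break the Drive query string.
    safe = name.replace("'", "\\'")
    q = f"name='{safe}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
    if parent_id:
        q += f" and '{parent_id}' in parents"
    return q

resp = drive.files().list(q=drive_query("mtDNA-Location-Classifier"),
                          fields="files(id, name)").execute()
print(resp.get("files", []))

Escaping matters because saveTitle comes from a paper title and feeds the file names queried by upload_file_to_drive and download_file_from_drive; an apostrophe in the title would otherwise break the f-string query.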
@@ -98,19 +150,37 @@ def pipeline_with_gemini(accessions):
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
-        id = pudID
+        id = str(pudID)
         saveTitle = title
     else:
         saveTitle = title + "_" + col_date
         id = "DirectSubmission"
-    folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
-    if not folder_path.exists():
-        cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
-        result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        print("data/"+str(id) +" created.")
-    else:
-        print("data/"+str(id) +" already exists.")
-    saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
+    # if not folder_path.exists():
+    #     cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
+    #     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    #     print("data/"+str(id) +" created.")
+    # else:
+    #     print("data/"+str(id) +" already exists.")
+    # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+    data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+    sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+
+    # Define document names
+    chunk_filename = f"{saveTitle}_merged_document.docx"
+    all_filename = f"{saveTitle}_all_merged_document.docx"
+
+    # Define local temp paths for reading/writing
+    import tempfile
+    tmp_dir = tempfile.mkdtemp()
+    file_chunk_path = os.path.join(tmp_dir, chunk_filename)
+    file_all_path = os.path.join(tmp_dir, all_filename)
+
+    # Try to download if already exists on Drive
+    chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
+    all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
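Review note: the hunk above swaps the Colab-mounted path (/content/drive/MyDrive/...) for Drive folder IDs plus a fresh temporary directory, then probes the cache with download_file_from_drive. A condensed sketch of that download-or-miss pattern, reusing the helper from this commit (fetch_cached is a hypothetical wrapper, not in the code):

import os
import tempfile

def fetch_cached(filename, folder_id):
    # One fresh temp dir per lookup; nothing persists across runs except
    # what gets uploaded back to Drive.
    tmp_dir = tempfile.mkdtemp()
    local_path = os.path.join(tmp_dir, filename)
    exists = download_file_from_drive(filename, folder_id, local_path)
    return local_path, exists

# Usage mirroring the hunk:
# file_chunk_path, chunk_exists = fetch_cached(chunk_filename, sample_folder_id)
# file_all_path, all_exists = fetch_cached(all_filename, sample_folder_id)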
@@ -154,24 +224,34 @@ def pipeline_with_gemini(accessions):
             links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-    print(links)
+    #print(links)
     links = unique_preserve_order(links)
     acc_score["source"] = links
-    chunk_path = "/"+saveTitle+"_merged_document.docx"
-    all_path = "/"+saveTitle+"_all_merged_document.docx"
-    # if chunk and all output not exist yet
-    file_chunk_path = saveLinkFolder + chunk_path
-    file_all_path = saveLinkFolder + all_path
-    if os.path.exists(file_chunk_path):
+    # chunk_path = "/"+saveTitle+"_merged_document.docx"
+    # all_path = "/"+saveTitle+"_all_merged_document.docx"
+    # # if chunk and all output not exist yet
+    # file_chunk_path = saveLinkFolder + chunk_path
+    # file_all_path = saveLinkFolder + all_path
+    # if os.path.exists(file_chunk_path):
+    #     print("File chunk exists!")
+    #     if not chunk:
+    #         text, table, document_title = model.read_docx_text(file_chunk_path)
+    #         chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    # if os.path.exists(file_all_path):
+    #     print("File all output exists!")
+    #     if not all_output:
+    #         text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+    #         all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+    if chunk_exists:
         print("File chunk exists!")
         if not chunk:
-            text, table, document_title = model.read_docx_text(file_chunk_path)
-            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
-    if os.path.exists(file_all_path):
+            text, table, document_title = model.read_docx_text(file_chunk_path)
+            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    if all_exists:
         print("File all output exists!")
         if not all_output:
-            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
-            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
     # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
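Review note: both cache branches above do the same read-and-normalize work on a cached .docx. A hypothetical refactor sketch that keeps the behaviour, assuming model.read_docx_text returns (text, tables, title) and data_preprocess.normalize_for_overlap takes a string, as the surrounding code implies:

import model
import data_preprocess

def load_cached_text(path):
    # Read the cached .docx and rebuild the normalized chunk string.
    text, table, _title = model.read_docx_text(path)
    return (data_preprocess.normalize_for_overlap(text)
            + "\n"
            + data_preprocess.normalize_for_overlap(". ".join(table)))

# if chunk_exists and not chunk:
#     chunk = load_cached_text(file_chunk_path)
# if all_exists and not all_output:
#     all_output = load_cached_text(file_all_path)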
@@ -233,8 +313,16 @@ def pipeline_with_gemini(accessions):
         all_output = all_output[:1*1024*1024]
     print("chunk len: ", len(chunk))
     print("all output len: ", len(all_output))
+    # data_preprocess.save_text_to_docx(chunk, file_chunk_path)
+    # data_preprocess.save_text_to_docx(all_output, file_all_path)
+    # Later when saving new files
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
+
+    # Upload to Drive
+    upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+    upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+
     # else:
     #   final_input = ""
     #   if all_output:
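Review note: upload_file_to_drive deletes any existing copy and re-creates the file, which changes the Drive file id on every run. An alternative sketch (not what this commit does) that updates in place and keeps the id stable, assuming the same drive_service client:

from googleapiclient.http import MediaFileUpload

def upsert_file_to_drive(local_path, remote_name, folder_id):
    media = MediaFileUpload(local_path, resumable=True)
    existing = drive_service.files().list(
        q=f"name='{remote_name}' and '{folder_id}' in parents",
        fields="files(id)").execute().get("files", [])
    if existing:
        # Update in place so the file id (and any shared links) stay stable.
        return drive_service.files().update(
            fileId=existing[0]["id"], media_body=media).execute()["id"]
    meta = {"name": remote_name, "parents": [folder_id]}
    return drive_service.files().create(
        body=meta, media_body=media, fields="id").execute()["id"]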
@@ -253,9 +341,21 @@ def pipeline_with_gemini(accessions):
     # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
 
     # Define paths for cached RAG assets
-    faiss_index_path = saveLinkFolder+"/faiss_index.bin"
-    document_chunks_path = saveLinkFolder+"/document_chunks.json"
-    structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
+    # document_chunks_path = saveLinkFolder+"/document_chunks.json"
+    # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    faiss_filename = "faiss_index.bin"
+    chunks_filename = "document_chunks.json"
+    lookup_filename = "structured_lookup.json"
+
+    # Save in temporary local directory
+    faiss_index_path = os.path.join(tmp_dir, faiss_filename)
+    document_chunks_path = os.path.join(tmp_dir, chunks_filename)
+    structured_lookup_path = os.path.join(tmp_dir, lookup_filename)
+
+    download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
+    download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
+    download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
 
     master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
         faiss_index_path, document_chunks_path, structured_lookup_path
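Review note: this hunk only pulls cached RAG assets down before model.load_rag_assets; pushing rebuilt assets back to Drive is not shown in this diff. A sketch of that complementary upload step, assuming the same helpers and variable names and that the three files exist locally after a rebuild:

import os

# After load_rag_assets (or a rebuild) produces fresh local files:
for fname, fpath in [
    ("faiss_index.bin", faiss_index_path),
    ("document_chunks.json", document_chunks_path),
    ("structured_lookup.json", structured_lookup_path),
]:
    if os.path.exists(fpath):
        upload_file_to_drive(fpath, fname, sample_folder_id)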