Spaces:
Running
Running
Update pipeline.py
Browse files
- pipeline.py +32 -16
pipeline.py
CHANGED
|
@@ -250,23 +250,39 @@ def pipeline_with_gemini(accessions):
|
|
| 250 |
# Define local temp paths for reading/writing
|
| 251 |
# import tempfile
|
| 252 |
# tmp_dir = tempfile.mkdtemp()
|
| 253 |
-
|
| 254 |
-
os.makedirs(
|
| 255 |
-
file_chunk_path = os.path.join(
|
| 256 |
-
file_all_path = os.path.join(
|
| 257 |
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
|
| 258 |
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
|
| 259 |
print(file_chunk_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
# 🔥 Remove the local file first if it exists
|
| 261 |
-
if os.path.exists(file_chunk_path):
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
if os.path.exists(file_all_path):
|
| 265 |
-
|
| 266 |
-
|
| 267 |
# Try to download if already exists on Drive
|
| 268 |
-
|
| 269 |
-
|
| 270 |
print("chunk exist: ", chunk_exists)
|
| 271 |
# first way: ncbi method
|
| 272 |
print("country.lower: ",country.lower())
|
|
@@ -405,11 +421,11 @@ def pipeline_with_gemini(accessions):
|
|
| 405 |
all_output = all_output[:1*1024*1024]
|
| 406 |
print("chunk len: ", len(chunk))
|
| 407 |
print("all output len: ", len(all_output))
|
| 408 |
-
|
| 409 |
-
|
| 410 |
# Later when saving new files
|
| 411 |
-
data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
|
| 412 |
-
data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
| 413 |
|
| 414 |
# Upload to Drive
|
| 415 |
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
|
|
|
| 250 |
# Define local temp paths for reading/writing
|
| 251 |
# import tempfile
|
| 252 |
# tmp_dir = tempfile.mkdtemp()
|
| 253 |
+
LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
|
| 254 |
+
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
|
| 255 |
+
file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
|
| 256 |
+
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
|
| 257 |
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
|
| 258 |
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
|
| 259 |
print(file_chunk_path)
|
| 260 |
+
chunk_id = find_drive_file(chunk_filename, sample_folder_id)
|
| 261 |
+
all_id = find_drive_file(all_filename, sample_folder_id)
|
| 262 |
+
|
| 263 |
+
if chunk_id and all_id:
|
| 264 |
+
print("✅ Files already exist in Google Drive. Downloading them...")
|
| 265 |
+
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 266 |
+
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 267 |
+
# Read and parse these into `chunk` and `all_output`
|
| 268 |
+
else:
|
| 269 |
+
# 🔥 Remove any stale local copies
|
| 270 |
+
if os.path.exists(file_chunk_path):
|
| 271 |
+
os.remove(file_chunk_path)
|
| 272 |
+
print(f"🗑️ Removed stale: {file_chunk_path}")
|
| 273 |
+
if os.path.exists(file_all_path):
|
| 274 |
+
os.remove(file_all_path)
|
| 275 |
+
print(f"🗑️ Removed stale: {file_all_path}")
|
| 276 |
# 🔥 Remove the local file first if it exists
|
| 277 |
+
# if os.path.exists(file_chunk_path):
|
| 278 |
+
# os.remove(file_chunk_path)
|
| 279 |
+
# print("remove chunk path")
|
| 280 |
+
# if os.path.exists(file_all_path):
|
| 281 |
+
# os.remove(file_all_path)
|
| 282 |
+
# print("remove all path")
|
| 283 |
# Try to download if already exists on Drive
|
| 284 |
+
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 285 |
+
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 286 |
print("chunk exist: ", chunk_exists)
|
| 287 |
# first way: ncbi method
|
| 288 |
print("country.lower: ",country.lower())
|
|
|
|
| 421 |
all_output = all_output[:1*1024*1024]
|
| 422 |
print("chunk len: ", len(chunk))
|
| 423 |
print("all output len: ", len(all_output))
|
| 424 |
+
data_preprocess.save_text_to_docx(chunk, file_chunk_path)
|
| 425 |
+
data_preprocess.save_text_to_docx(all_output, file_all_path)
|
| 426 |
# Later when saving new files
|
| 427 |
+
# data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
|
| 428 |
+
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
| 429 |
|
| 430 |
# Upload to Drive
|
| 431 |
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|