Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
e306cd2
1
Parent(s):
7615b9a
Attempt to use the API
Browse files- ui/src/app/api/hf-jobs/route.ts +33 -69
ui/src/app/api/hf-jobs/route.ts
CHANGED
|
@@ -1033,81 +1033,45 @@ async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: str
|
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
async function checkHFJobsCapacity(token: string): Promise<any> {
|
| 1036 |
-
|
| 1037 |
-
console.log('Checking HF Jobs capacity for namespace: lora-training-frenzi');
|
| 1038 |
-
|
| 1039 |
-
// Create a temporary file to store the output
|
| 1040 |
-
const tempFile = path.join(tmpdir(), `hf_jobs_ps_${Date.now()}.txt`);
|
| 1041 |
-
console.log(`Writing output to temp file: ${tempFile}`);
|
| 1042 |
-
|
| 1043 |
-
// Use shell redirection to write to file
|
| 1044 |
-
const command = `hf jobs ps --namespace lora-training-frenzi --token "${token}" > "${tempFile}" 2>&1`;
|
| 1045 |
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
NO_COLOR: '1',
|
| 1052 |
-
}
|
| 1053 |
});
|
| 1054 |
|
| 1055 |
-
|
| 1056 |
-
|
| 1057 |
-
|
| 1058 |
-
try {
|
| 1059 |
-
// Read the output from the temporary file
|
| 1060 |
-
const output = await readFile(tempFile, 'utf-8');
|
| 1061 |
-
|
| 1062 |
-
console.log('=== RAW OUTPUT START ===');
|
| 1063 |
-
console.log(output);
|
| 1064 |
-
console.log('=== RAW OUTPUT END ===');
|
| 1065 |
-
|
| 1066 |
-
// Count RUNNING jobs in the output
|
| 1067 |
-
// Split by newline and filter out empty lines
|
| 1068 |
-
const lines = output.split(/\r?\n/).filter(line => line.trim().length > 0);
|
| 1069 |
-
let runningCount = 0;
|
| 1070 |
-
|
| 1071 |
-
console.log(`Total non-empty lines in output: ${lines.length}`);
|
| 1072 |
-
|
| 1073 |
-
for (let i = 0; i < lines.length; i++) {
|
| 1074 |
-
const line = lines[i];
|
| 1075 |
-
console.log(`Line ${i}: "${line}"`);
|
| 1076 |
-
|
| 1077 |
-
// Check if line contains RUNNING (case-sensitive as shown in your output)
|
| 1078 |
-
if (line.includes('RUNNING')) {
|
| 1079 |
-
runningCount++;
|
| 1080 |
-
console.log(` ✓ Line ${i} contains RUNNING (count: ${runningCount})`);
|
| 1081 |
-
}
|
| 1082 |
-
}
|
| 1083 |
|
| 1084 |
-
|
|
|
|
| 1085 |
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1089 |
|
| 1090 |
-
|
| 1091 |
-
try {
|
| 1092 |
-
await unlink(tempFile);
|
| 1093 |
-
} catch (unlinkError) {
|
| 1094 |
-
console.warn('Failed to delete temp file:', unlinkError);
|
| 1095 |
-
}
|
| 1096 |
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
capacityLimit: 32,
|
| 1101 |
-
});
|
| 1102 |
-
} catch (parseError: any) {
|
| 1103 |
-
console.error('Failed to read or parse jobs ps output:', parseError);
|
| 1104 |
-
reject(new Error('Failed to parse capacity status'));
|
| 1105 |
-
}
|
| 1106 |
-
});
|
| 1107 |
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1113 |
}
|
|
|
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
/**
 * Reports how many Hugging Face Jobs are currently RUNNING in the
 * `lora-training-frenzi` namespace and whether that count has reached the
 * capacity limit of 32.
 *
 * The Hub HTTP API is queried directly instead of shelling out to the `hf`
 * CLI, to avoid TTY issues.
 *
 * @param token - Hugging Face access token, sent as a Bearer credential.
 * @returns An object `{ runningJobs, atCapacity, capacityLimit }`.
 * @throws Error whose message starts with "Failed to check capacity: " when
 *   the request fails, the API answers with a non-2xx status, or the response
 *   body is not a JSON array.
 */
async function checkHFJobsCapacity(token: string): Promise<any> {
  // Single source of truth for values that were previously repeated inline.
  const namespace = 'lora-training-frenzi';
  const capacityLimit = 32;

  try {
    console.log(`Checking HF Jobs capacity for namespace: ${namespace} via API`);

    // Use HuggingFace API directly instead of CLI to avoid TTY issues
    const response = await fetch(`https://huggingface.co/api/jobs/${namespace}`, {
      headers: {
        Authorization: `Bearer ${token}`,
      },
    });

    if (!response.ok) {
      throw new Error(`API request failed: ${response.status} ${response.statusText}`);
    }

    // Validate the payload before touching it: an error object or any other
    // non-array body would otherwise surface as an opaque "not iterable" error.
    const payload: unknown = await response.json();
    if (!Array.isArray(payload)) {
      throw new Error('Unexpected API response: expected a JSON array of jobs');
    }
    console.log(`Fetched ${payload.length} total jobs from API`);

    // A job's status is accepted either as a plain string or as an object
    // carrying a `stage` field. Malformed entries (null, primitives) are
    // counted as not running instead of aborting the whole check.
    const runningCount = payload.filter((job) => {
      const status = job?.status?.stage ?? job?.status;
      return status === 'RUNNING';
    }).length;

    const atCapacity = runningCount >= capacityLimit;

    console.log(`\n=== FINAL COUNT ===`);
    console.log(`Found ${runningCount} RUNNING jobs. At capacity: ${atCapacity}`);
    console.log(`==================\n`);

    return {
      runningJobs: runningCount,
      atCapacity,
      capacityLimit,
    };
  } catch (error: unknown) {
    console.error('Failed to check capacity via API:', error);
    // Rejections are not guaranteed to be Error instances; avoid producing
    // "Failed to check capacity: undefined".
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to check capacity: ${reason}`);
  }
}
|