Upload folder using huggingface_hub

Files changed:
- src/openai_patch.ts  +5 -0
- src/routes/health.ts  +5 -0
- src/routes/index.ts  +1 -0
- src/routes/responses.ts  +19 -76
- src/schemas.ts  +2 -2
- src/server.ts  +3 -1
src/openai_patch.ts
CHANGED

@@ -9,6 +9,7 @@ import type {
   ResponseOutputText,
 } from "openai/resources/responses/responses";
 
+import type { ChatCompletionChunk } from "openai/resources/chat/completions";
 export interface ReasoningTextContent {
   type: "reasoning_text";
   text: string;
@@ -42,3 +43,7 @@ export type PatchedResponseStreamEvent =
   | PatchedResponseReasoningTextDoneEvent;
 
 export type PatchedResponseContentPart = ResponseOutputText | ResponseOutputRefusal;
+
+export type PatchedDeltaWithReasoning = ChatCompletionChunk.Choice.Delta & {
+  reasoning?: string;
+};
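The new PatchedDeltaWithReasoning type exists because some providers stream reasoning on a non-standard reasoning field that the stock ChatCompletionChunk delta type does not declare. A minimal sketch of how a consumer narrows a chunk with it, mirroring the cast used in src/routes/responses.ts below (the function and logging are illustrative only, not part of the commit):

import type { ChatCompletionChunk } from "openai/resources/chat/completions";
import type { PatchedDeltaWithReasoning } from "./openai_patch";

function inspectDelta(chunk: ChatCompletionChunk): void {
  // The stock Delta type has no `reasoning`, so widen it via the patched type.
  const delta = chunk.choices[0].delta as PatchedDeltaWithReasoning;
  if (delta.reasoning) {
    console.log("reasoning delta:", delta.reasoning); // provider-specific field
  } else if (delta.content) {
    console.log("content delta:", delta.content);
  }
}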
src/routes/health.ts
ADDED

@@ -0,0 +1,5 @@
+import type { Request, Response } from "express";
+
+export function getHealth(req: Request, res: Response): void {
+  res.send("OK");
+}
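The handler can be exercised without binding a port by stubbing the Express response; the stub below is an illustrative sketch, not part of the commit:

import type { Request, Response } from "express";
import { getHealth } from "./health.js";

// Minimal stand-in capturing what the handler sends.
const res = {
  send(body: unknown) {
    console.log("sent:", body); // expected: sent: OK
    return this;
  },
} as unknown as Response;

getHealth({} as Request, res);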
src/routes/index.ts
CHANGED

@@ -1,2 +1,3 @@
 export { postCreateResponse } from "./responses.js";
 export { getLandingPageHtml } from "./landingPageHtml.js";
+export { getHealth } from "./health.js";
src/routes/responses.ts
CHANGED

@@ -15,16 +15,15 @@ import type {
   PatchedResponseReasoningItem,
   PatchedResponseStreamEvent,
   ReasoningTextContent,
+  PatchedDeltaWithReasoning,
 } from "../openai_patch";
 import type {
   ChatCompletionCreateParamsStreaming,
   ChatCompletionMessageParam,
   ChatCompletionTool,
-  ChatCompletionChunk,
 } from "openai/resources/chat/completions.js";
 import type { FunctionParameters } from "openai/resources/shared.js";
 import { callMcpTool, connectMcpServer } from "../mcp.js";
-import type { Stream } from "openai/core/streaming.js";
 
 class StreamingError extends Error {
   constructor(message: string) {
@@ -36,10 +35,6 @@ class StreamingError extends Error {
 type IncompleteResponse = Omit<Response, "incomplete_details" | "output_text" | "parallel_tool_calls">;
 const SEQUENCE_NUMBER_PLACEHOLDER = -1;
 
-// TODO: this depends on the model. To be adapted.
-const REASONING_START_TOKEN = "<think>";
-const REASONING_END_TOKEN = "</think>";
-
 export const postCreateResponse = async (
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse
@@ -498,7 +493,7 @@ async function* handleOneTurnStream(
     baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
     apiKey: apiKey,
   });
-  const stream = wrapChatCompletionStream(await client.chat.completions.create(payload));
+  const stream = await client.chat.completions.create(payload);
   let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
   let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
   let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
@@ -516,22 +511,26 @@ async function* handleOneTurnStream(
     };
   }
 
-  const delta = chunk.choices[0].delta;
+  const delta = chunk.choices[0].delta as PatchedDeltaWithReasoning;
 
-  if (delta.content) {
+  if (delta.content || delta.reasoning) {
     let currentOutputItem = responseObject.output.at(-1);
-    let deltaText = delta.content;
 
     // If start or end of reasoning, skip token and update the current text mode
-    if (deltaText === REASONING_START_TOKEN) {
+    if (delta.reasoning) {
+      if (currentTextMode === "text") {
+        for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+          yield event;
+        }
+      }
       currentTextMode = "reasoning";
-
-
-
-
-
+    } else if (delta.content) {
+      if (currentTextMode === "reasoning") {
+        for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+          yield event;
+        }
       }
-
+      currentTextMode = "text";
     }
 
     // If start of a new message, create it
@@ -611,7 +610,7 @@ async function* handleOneTurnStream(
         item_id: currentOutputMessage.id,
         output_index: responseObject.output.length - 1,
         content_index: currentOutputMessage.content.length - 1,
-        delta: delta.content,
+        delta: delta.content as string,
         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
       };
     } else if (currentTextMode === "reasoning") {
@@ -636,13 +635,13 @@ async function* handleOneTurnStream(
 
       // Add text delta
       const contentPart = currentReasoningItem.content.at(-1) as ReasoningTextContent;
-      contentPart.text += delta.content;
+      contentPart.text += delta.reasoning;
       yield {
        type: "response.reasoning_text.delta",
        item_id: currentReasoningItem.id,
        output_index: responseObject.output.length - 1,
        content_index: currentReasoningItem.content.length - 1,
-       delta: delta.content,
+       delta: delta.reasoning as string,
        sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
      };
    }
@@ -992,59 +991,3 @@ async function* closeLastOutputItem(
     }
   }
 }
-
-/*
- * Wrap a chat completion stream to handle reasoning.
- *
- * The reasoning start and end tokens might be sent in a longer text chunk.
- * We want to split that text chunk so that the reasoning token is isolated in a separate chunk.
- *
- * TODO: also adapt for when reasoning token is sent in separate chunks.
- */
-async function* wrapChatCompletionStream(
-  stream: Stream<ChatCompletionChunk & { _request_id?: string | null | undefined }>
-): AsyncGenerator<ChatCompletionChunk & { _request_id?: string | null | undefined }> {
-  function cloneChunkWithContent(baseChunk: ChatCompletionChunk, content: string): ChatCompletionChunk {
-    return {
-      ...baseChunk,
-      choices: [
-        {
-          ...baseChunk.choices[0],
-          delta: {
-            ...baseChunk.choices[0].delta,
-            content,
-          },
-        },
-      ],
-    };
-  }
-
-  function* splitAndYieldChunk(chunk: ChatCompletionChunk, content: string, token: string) {
-    const [beforeContent, afterContent] = content.split(token, 2);
-
-    if (beforeContent) {
-      yield cloneChunkWithContent(chunk, beforeContent);
-    }
-    yield cloneChunkWithContent(chunk, token);
-    if (afterContent) {
-      yield cloneChunkWithContent(chunk, afterContent);
-    }
-  }
-
-  for await (const chunk of stream) {
-    const content = chunk.choices[0].delta.content;
-
-    if (!content) {
-      yield chunk;
-      continue;
-    }
-
-    if (content.includes(REASONING_START_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_START_TOKEN);
-    } else if (content.includes(REASONING_END_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_END_TOKEN);
-    } else {
-      yield chunk;
-    }
-  }
-}
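Taken together, these hunks replace the old approach, where <think> and </think> markers had to be spliced out of the content stream by wrapChatCompletionStream, with direct use of the provider's structured reasoning field. The mode switching then reduces to a small decision that can be sketched in isolation (the function name and shape here are illustrative, not the module's actual internals):

type TextMode = "text" | "reasoning";

// Mirrors the diff's branch logic: a reasoning delta forces reasoning mode,
// a content delta forces text mode, and an empty delta keeps the current mode.
// In the real handler, each mode change also closes the last open output item
// via closeLastOutputItem before switching.
function nextTextMode(current: TextMode, delta: { content?: string | null; reasoning?: string }): TextMode {
  if (delta.reasoning) return "reasoning";
  if (delta.content) return "text";
  return current;
}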
src/schemas.ts
CHANGED

@@ -101,8 +101,8 @@ export const createResponseParamsSchema = z.object({
   z.object({
     type: z.literal("output_text"),
     text: z.string(),
-    annotations: z.array(z.object({})).optional(), // TODO: incomplete
-    logprobs: z.array(z.object({})).optional(), // TODO: incomplete
+    annotations: z.array(z.object({})).nullable().optional(), // TODO: incomplete
+    logprobs: z.array(z.object({})).nullable().optional(), // TODO: incomplete
   }),
   z.object({
     type: z.literal("refusal"),
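The schema change matters because zod's .optional() accepts a missing key but still rejects an explicit null; chaining .nullable().optional() accepts absent, null, or an array. A reduced, self-contained check (this trimmed schema is a stand-in for the full one in the diff):

import { z } from "zod";

const outputTextPart = z.object({
  type: z.literal("output_text"),
  text: z.string(),
  annotations: z.array(z.object({})).nullable().optional(),
});

outputTextPart.parse({ type: "output_text", text: "hi" });                    // ok: key absent
outputTextPart.parse({ type: "output_text", text: "hi", annotations: null }); // ok now; threw with .optional() alone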
src/server.ts
CHANGED

@@ -2,7 +2,7 @@ import express, { type Express } from "express";
 import { createResponseParamsSchema } from "./schemas.js";
 import { validateBody } from "./middleware/validation.js";
 import { requestLogger } from "./middleware/logging.js";
-import { getLandingPageHtml, postCreateResponse } from "./routes/index.js";
+import { getLandingPageHtml, postCreateResponse, getHealth } from "./routes/index.js";
 
 export const createApp = (): Express => {
   const app: Express = express();
@@ -14,6 +14,8 @@ export const createApp = (): Express => {
   // Routes
   app.get("/", getLandingPageHtml);
 
+  app.get("/health", getHealth);
+
   app.post("/v1/responses", validateBody(createResponseParamsSchema), postCreateResponse);
 
   return app;
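Since createApp only builds the app, a caller still has to bind a port before the new route is reachable. A hedged boot sketch (the entry point and port 3000 are assumptions, not part of this commit; fetch assumes Node 18+):

import { createApp } from "./server.js";

const app = createApp();
app.listen(3000, async () => {
  // Probe the health route registered above.
  const res = await fetch("http://localhost:3000/health");
  console.log(res.status, await res.text()); // expected: 200 OK
});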