From 527045bfb03ec339150f67d1bb5a581a8e91ec0a Mon Sep 17 00:00:00 2001 From: Masashi Yoshimura Date: Thu, 14 May 2026 02:22:44 +0900 Subject: [PATCH 1/5] flush the gpu profile timestamp before the queryset is overflowed (#22995) --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b24101c78b0..401c75c1230 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -3148,6 +3148,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str } ctx->param_arena.reset(); commands.clear(); +#ifdef GGML_WEBGPU_GPU_PROFILE + // flush before the next batch can overflow the QuerySet + if (ctx->profile_timestamp_query_count + 2 * ctx->global_ctx->command_submit_batch_size >= + WEBGPU_MAX_PROFILE_QUERY_COUNT) { + ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names, num_inflight_batches); + // reset profile timestamp state + ctx->profile_timestamp_query_count = 0; + profile_pipeline_names.clear(); + } +#endif } node_idx += num_encoded_ops; From 1e4579fbb80454aa63417b3feafbbea66c4ef671 Mon Sep 17 00:00:00 2001 From: lhez Date: Wed, 13 May 2026 11:24:33 -0700 Subject: [PATCH 2/5] opencl: fix crash when warming up MoE on Adreno (#22876) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 61bdc62cd10..248124c2896 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -13132,7 +13132,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src, CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne02)); size_t histogram_global_size[] = {(size_t)(((ne21 + 63) / 64) * 64), static_cast(ne20), 1}; - size_t histogram_local_size[] = {64, static_cast(ne20), 1}; + size_t histogram_local_size[] = {64, 1, 1}; backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src); // Scan From 95d469a915333eb2e14eff9b02b49292716389cd Mon Sep 17 00:00:00 2001 From: Pascal Date: Wed, 13 May 2026 20:47:58 +0200 Subject: [PATCH 3/5] server, webui: accept continue_final_message flag for vLLM API compat (#23012) * server, webui: accept continue_final_message flag for vLLM API compat Add the continue_final_message body flag from the vLLM and transformers API. When set together with add_generation_prompt false, it triggers the existing prefill_assistant code path, regardless of the server side opt.prefill_assistant option. Mutual exclusion with add_generation_prompt true is enforced, matching vLLM behavior. WebUI sends continue_final_message and add_generation_prompt false on the Continue button, with the matching opt in option on the chat service. Pure API alignment, no change to the prefill logic itself. Paves the way for the upcoming per-template prefill plumbing in common/chat. * test: add coverage for continue_final_message vLLM compat flag Two cases on top of the existing assistant prefill coverage. First, continue_final_message true with add_generation_prompt false produces the same rendered prompt as the prefill_assistant heuristic, proving the new flag is a correct alias of the existing path. Second, both flags set to true is rejected with HTTP 400, matching the vLLM/transformers mutual exclusion contract. * chore: update webui build output --- tools/server/public/bundle.js | 232 +++++++++--------- tools/server/server-common.cpp | 9 +- .../server/tests/unit/test_chat_completion.py | 39 +++ .../webui/src/lib/services/chat.service.ts | 8 +- .../webui/src/lib/stores/chat.svelte.ts | 1 + tools/server/webui/src/lib/types/api.d.ts | 3 + .../server/webui/src/lib/types/settings.d.ts | 2 + 7 files changed, 176 insertions(+), 118 deletions(-) diff --git a/tools/server/public/bundle.js b/tools/server/public/bundle.js index 5a42071e5f2..7258e6bea4a 100644 --- a/tools/server/public/bundle.js +++ b/tools/server/public/bundle.js @@ -6357,20 +6357,20 @@ ChatAttachmentsPreviewCurrentItemPdf($$anchor4,{get currentItem(){return $$props ChatAttachmentsPreviewCurrentItemText($$anchor4,{get displayTextContent(){return $$props.displayTextContent},get language(){return $$props.language}})},consequent_3=$$anchor4=>{ChatAttachmentsPreviewCurrentItemAudio($$anchor4,{get currentItem(){return $$props.currentItem},get audioSrc(){return $$props.audioSrc}})},consequent_4=$$anchor4=>{ChatAttachmentsPreviewCurrentItemUnavailable($$anchor4,{get IconComponent(){return get$3(IconComponent)}})};if_block(node_2,$$render=>{$$props.isPdf?$$render(consequent): $$props.isImage?$$render(consequent_1,1):$$props.isText&&$$props.displayTextContent?$$render(consequent_2,2):$$props.isAudio?$$render(consequent_3,3):get$3(isUnavailable)&&$$render(consequent_4,4)})}append($$anchor3,fragment_2)}),append($$anchor2,fragment_1)};if_block(node2,$$render=>{$$props.currentItem&&$$render(consequent_5)})}append($$anchor,fragment),pop()}class ChatService{static async generateTitle(message,model,signal){let titleResponse="";try{await ChatService.sendMessage([message],{model:model|| void 0,stream:!0,custom:{chat_template_kwargs:{enable_thinking:!1}},onChunk:chunk=>{titleResponse+=chunk}},void 0,signal)}catch{return""}return titleResponse}static async sendMessage(messages,options={},conversationId,signal){const{stream,onChunk,onComplete,onError,onReasoningChunk,onToolCallChunk,onModel,onTimings,tools,temperature,max_tokens,dynatemp_range,dynatemp_exponent,top_k,top_p,min_p,xtc_probability,xtc_threshold,typ_p,repeat_last_n,repeat_penalty,presence_penalty,frequency_penalty,dry_multiplier, -dry_base,dry_allowed_length,dry_penalty_last_n,samplers,backend_sampling,custom:custom2,timings_per_token,disableReasoningParsing,excludeReasoningFromContext}=options,normalizedMessages=messages.map(msg=>{if("id"in msg&&"convId"in msg&&"timestamp"in msg){const dbMsg=msg;return ChatService.convertDbMessageToApiChatMessageData(dbMsg)}else return msg}).filter(msg=>msg.role===MessageRole.SYSTEM?(typeof msg.content=="string"?msg.content:"").trim().length>0:!0);options.model&&!modelsStore.modelSupportsVision( -options.model)&&normalizedMessages.forEach(msg=>{Array.isArray(msg.content)&&(msg.content=msg.content.filter(part=>part.type===ContentPartType.IMAGE_URL?(console.info(`[ChatService] Skipping image attachment in message history (model "${options.model}" does not support vision)`),!1):!0),msg.content.length===1&&msg.content[0].type===ContentPartType.TEXT&&typeof msg.content[0].text=="string"&&(msg.content=msg.content[0].text))});const requestBody={messages:normalizedMessages.map(msg=>{const mapped={ -role:msg.role,content:msg.content,tool_calls:msg.tool_calls,tool_call_id:msg.tool_call_id};return!excludeReasoningFromContext&&msg.reasoning_content&&(mapped.reasoning_content=msg.reasoning_content),mapped}),stream,return_progress:stream?!0:void 0,tools:tools&&tools.length>0?tools:void 0};if(options.model&&(requestBody.model=options.model),requestBody.reasoning_format=disableReasoningParsing?ReasoningFormat.NONE:ReasoningFormat.AUTO,temperature!==void 0&&(requestBody.temperature=temperature),max_tokens!== -void 0&&(requestBody.max_tokens=max_tokens!==null&&max_tokens!==0?max_tokens:-1),dynatemp_range!==void 0&&(requestBody.dynatemp_range=dynatemp_range),dynatemp_exponent!==void 0&&(requestBody.dynatemp_exponent=dynatemp_exponent),top_k!==void 0&&(requestBody.top_k=top_k),top_p!==void 0&&(requestBody.top_p=top_p),min_p!==void 0&&(requestBody.min_p=min_p),xtc_probability!==void 0&&(requestBody.xtc_probability=xtc_probability),xtc_threshold!==void 0&&(requestBody.xtc_threshold=xtc_threshold),typ_p!== -void 0&&(requestBody.typ_p=typ_p),repeat_last_n!==void 0&&(requestBody.repeat_last_n=repeat_last_n),repeat_penalty!==void 0&&(requestBody.repeat_penalty=repeat_penalty),presence_penalty!==void 0&&(requestBody.presence_penalty=presence_penalty),frequency_penalty!==void 0&&(requestBody.frequency_penalty=frequency_penalty),dry_multiplier!==void 0&&(requestBody.dry_multiplier=dry_multiplier),dry_base!==void 0&&(requestBody.dry_base=dry_base),dry_allowed_length!==void 0&&(requestBody.dry_allowed_length= -dry_allowed_length),dry_penalty_last_n!==void 0&&(requestBody.dry_penalty_last_n=dry_penalty_last_n),samplers!==void 0&&(requestBody.samplers=typeof samplers=="string"?samplers.split(";").filter(s2=>s2.trim()):samplers),backend_sampling!==void 0&&(requestBody.backend_sampling=backend_sampling),timings_per_token!==void 0&&(requestBody.timings_per_token=timings_per_token),custom2)try{const customParams=typeof custom2=="string"?JSON.parse(custom2):custom2;Object.assign(requestBody,customParams)}catch(error2){ -console.warn("Failed to parse custom parameters:",error2)}try{const response=await fetch("./v1/chat/completions",{method:"POST",headers:getJsonHeaders(),body:JSON.stringify(requestBody),signal});if(!response.ok){const error2=await ChatService.parseErrorResponse(response);throw onError&&onError(error2),error2}if(stream){await ChatService.handleStreamResponse(response,onChunk,onComplete,onError,onReasoningChunk,onToolCallChunk,onModel,onTimings,conversationId,signal);return}else return ChatService. -handleNonStreamResponse(response,onComplete,onError,onToolCallChunk,onModel)}catch(error2){if(isAbortError(error2)){console.log("Chat completion request was aborted");return}let userFriendlyError;throw error2 instanceof Error?error2.name==="TypeError"&&error2.message.includes("fetch")?(userFriendlyError=new Error("Unable to connect to server - please check if the server is running"),userFriendlyError.name="NetworkError"):error2.message.includes("ECONNREFUSED")?(userFriendlyError=new Error("Conne\ -ction refused - server may be offline"),userFriendlyError.name="NetworkError"):error2.message.includes("ETIMEDOUT")?(userFriendlyError=new Error("Request timed out - the server took too long to respond"),userFriendlyError.name="TimeoutError"):userFriendlyError=error2:userFriendlyError=new Error("Unknown error occurred while sending message"),console.error("Error in sendMessage:",error2),onError&&onError(userFriendlyError),userFriendlyError}}static async areAllSlotsIdle(model,signal){try{const url2=model? -`./slots?model=${encodeURIComponent(model)}`:"./slots",res=await fetch(url2,{signal});return res.ok?(await res.json()).every(s2=>!s2.is_processing):!0}catch{return!0}}static async preEncode(messages,model,excludeReasoning,signal){const requestBody={messages:messages.map(msg=>"id"in msg&&"convId"in msg&&"timestamp"in msg?ChatService.convertDbMessageToApiChatMessageData(msg):msg).filter(msg=>msg.role===MessageRole.SYSTEM?(typeof msg.content=="string"?msg.content:"").trim().length>0:!0).map(msg=>{const mapped={ -role:msg.role,content:excludeReasoning?ChatService.stripReasoningContent(msg.content):msg.content,tool_calls:msg.tool_calls,tool_call_id:msg.tool_call_id};return!excludeReasoning&&msg.reasoning_content&&(mapped.reasoning_content=msg.reasoning_content),mapped}),stream:!1,n_predict:0};model&&(requestBody.model=model);try{await fetch("./v1/chat/completions",{method:"POST",headers:getJsonHeaders(),body:JSON.stringify(requestBody),signal})}catch(error2){isAbortError(error2)||console.warn("[ChatServic\ -e] Pre-encode request failed:",error2)}}static async handleStreamResponse(response,onChunk,onComplete,onError,onReasoningChunk,onToolCallChunk,onModel,onTimings,conversationId,abortSignal){const reader=response.body?.getReader();if(!reader)throw new Error("No response body");const decoder=new TextDecoder;let aggregatedContent="",fullReasoningContent="",aggregatedToolCalls=[],lastTimings,streamFinished=!1,modelEmitted=!1,toolCallIndexOffset=0,hasOpenToolCallBatch=!1;const finalizeOpenToolCallBatch=()=>{ -hasOpenToolCallBatch&&(toolCallIndexOffset=aggregatedToolCalls.length,hasOpenToolCallBatch=!1)},processToolCallDelta=toolCalls=>{if(!toolCalls||toolCalls.length===0||(aggregatedToolCalls=ChatService.mergeToolCallDeltas(aggregatedToolCalls,toolCalls,toolCallIndexOffset),aggregatedToolCalls.length===0))return;hasOpenToolCallBatch=!0;const serializedToolCalls=JSON.stringify(aggregatedToolCalls);serializedToolCalls&&(abortSignal?.aborted||onToolCallChunk?.(serializedToolCalls))};try{let chunk="";for(;!abortSignal?. -aborted;){const{done,value}=await reader.read();if(done||abortSignal?.aborted)break;chunk+=decoder.decode(value,{stream:!0});const lines=chunk.split(` +dry_base,dry_allowed_length,dry_penalty_last_n,samplers,backend_sampling,custom:custom2,timings_per_token,disableReasoningParsing,excludeReasoningFromContext,continueFinalMessage}=options,normalizedMessages=messages.map(msg=>{if("id"in msg&&"convId"in msg&&"timestamp"in msg){const dbMsg=msg;return ChatService.convertDbMessageToApiChatMessageData(dbMsg)}else return msg}).filter(msg=>msg.role===MessageRole.SYSTEM?(typeof msg.content=="string"?msg.content:"").trim().length>0:!0);options.model&&!modelsStore. +modelSupportsVision(options.model)&&normalizedMessages.forEach(msg=>{Array.isArray(msg.content)&&(msg.content=msg.content.filter(part=>part.type===ContentPartType.IMAGE_URL?(console.info(`[ChatService] Skipping image attachment in message history (model "${options.model}" does not support vision)`),!1):!0),msg.content.length===1&&msg.content[0].type===ContentPartType.TEXT&&typeof msg.content[0].text=="string"&&(msg.content=msg.content[0].text))});const requestBody={messages:normalizedMessages.map( +msg=>{const mapped={role:msg.role,content:msg.content,tool_calls:msg.tool_calls,tool_call_id:msg.tool_call_id};return!excludeReasoningFromContext&&msg.reasoning_content&&(mapped.reasoning_content=msg.reasoning_content),mapped}),stream,return_progress:stream?!0:void 0,tools:tools&&tools.length>0?tools:void 0};if(options.model&&(requestBody.model=options.model),requestBody.reasoning_format=disableReasoningParsing?ReasoningFormat.NONE:ReasoningFormat.AUTO,continueFinalMessage&&(requestBody.continue_final_message= +!0,requestBody.add_generation_prompt=!1),temperature!==void 0&&(requestBody.temperature=temperature),max_tokens!==void 0&&(requestBody.max_tokens=max_tokens!==null&&max_tokens!==0?max_tokens:-1),dynatemp_range!==void 0&&(requestBody.dynatemp_range=dynatemp_range),dynatemp_exponent!==void 0&&(requestBody.dynatemp_exponent=dynatemp_exponent),top_k!==void 0&&(requestBody.top_k=top_k),top_p!==void 0&&(requestBody.top_p=top_p),min_p!==void 0&&(requestBody.min_p=min_p),xtc_probability!==void 0&&(requestBody. +xtc_probability=xtc_probability),xtc_threshold!==void 0&&(requestBody.xtc_threshold=xtc_threshold),typ_p!==void 0&&(requestBody.typ_p=typ_p),repeat_last_n!==void 0&&(requestBody.repeat_last_n=repeat_last_n),repeat_penalty!==void 0&&(requestBody.repeat_penalty=repeat_penalty),presence_penalty!==void 0&&(requestBody.presence_penalty=presence_penalty),frequency_penalty!==void 0&&(requestBody.frequency_penalty=frequency_penalty),dry_multiplier!==void 0&&(requestBody.dry_multiplier=dry_multiplier),dry_base!== +void 0&&(requestBody.dry_base=dry_base),dry_allowed_length!==void 0&&(requestBody.dry_allowed_length=dry_allowed_length),dry_penalty_last_n!==void 0&&(requestBody.dry_penalty_last_n=dry_penalty_last_n),samplers!==void 0&&(requestBody.samplers=typeof samplers=="string"?samplers.split(";").filter(s2=>s2.trim()):samplers),backend_sampling!==void 0&&(requestBody.backend_sampling=backend_sampling),timings_per_token!==void 0&&(requestBody.timings_per_token=timings_per_token),custom2)try{const customParams=typeof custom2== +"string"?JSON.parse(custom2):custom2;Object.assign(requestBody,customParams)}catch(error2){console.warn("Failed to parse custom parameters:",error2)}try{const response=await fetch("./v1/chat/completions",{method:"POST",headers:getJsonHeaders(),body:JSON.stringify(requestBody),signal});if(!response.ok){const error2=await ChatService.parseErrorResponse(response);throw onError&&onError(error2),error2}if(stream){await ChatService.handleStreamResponse(response,onChunk,onComplete,onError,onReasoningChunk, +onToolCallChunk,onModel,onTimings,conversationId,signal);return}else return ChatService.handleNonStreamResponse(response,onComplete,onError,onToolCallChunk,onModel)}catch(error2){if(isAbortError(error2)){console.log("Chat completion request was aborted");return}let userFriendlyError;throw error2 instanceof Error?error2.name==="TypeError"&&error2.message.includes("fetch")?(userFriendlyError=new Error("Unable to connect to server - please check if the server is running"),userFriendlyError.name="Ne\ +tworkError"):error2.message.includes("ECONNREFUSED")?(userFriendlyError=new Error("Connection refused - server may be offline"),userFriendlyError.name="NetworkError"):error2.message.includes("ETIMEDOUT")?(userFriendlyError=new Error("Request timed out - the server took too long to respond"),userFriendlyError.name="TimeoutError"):userFriendlyError=error2:userFriendlyError=new Error("Unknown error occurred while sending message"),console.error("Error in sendMessage:",error2),onError&&onError(userFriendlyError), +userFriendlyError}}static async areAllSlotsIdle(model,signal){try{const url2=model?`./slots?model=${encodeURIComponent(model)}`:"./slots",res=await fetch(url2,{signal});return res.ok?(await res.json()).every(s2=>!s2.is_processing):!0}catch{return!0}}static async preEncode(messages,model,excludeReasoning,signal){const requestBody={messages:messages.map(msg=>"id"in msg&&"convId"in msg&&"timestamp"in msg?ChatService.convertDbMessageToApiChatMessageData(msg):msg).filter(msg=>msg.role===MessageRole.SYSTEM? +(typeof msg.content=="string"?msg.content:"").trim().length>0:!0).map(msg=>{const mapped={role:msg.role,content:excludeReasoning?ChatService.stripReasoningContent(msg.content):msg.content,tool_calls:msg.tool_calls,tool_call_id:msg.tool_call_id};return!excludeReasoning&&msg.reasoning_content&&(mapped.reasoning_content=msg.reasoning_content),mapped}),stream:!1,n_predict:0};model&&(requestBody.model=model);try{await fetch("./v1/chat/completions",{method:"POST",headers:getJsonHeaders(),body:JSON.stringify( +requestBody),signal})}catch(error2){isAbortError(error2)||console.warn("[ChatService] Pre-encode request failed:",error2)}}static async handleStreamResponse(response,onChunk,onComplete,onError,onReasoningChunk,onToolCallChunk,onModel,onTimings,conversationId,abortSignal){const reader=response.body?.getReader();if(!reader)throw new Error("No response body");const decoder=new TextDecoder;let aggregatedContent="",fullReasoningContent="",aggregatedToolCalls=[],lastTimings,streamFinished=!1,modelEmitted=!1, +toolCallIndexOffset=0,hasOpenToolCallBatch=!1;const finalizeOpenToolCallBatch=()=>{hasOpenToolCallBatch&&(toolCallIndexOffset=aggregatedToolCalls.length,hasOpenToolCallBatch=!1)},processToolCallDelta=toolCalls=>{if(!toolCalls||toolCalls.length===0||(aggregatedToolCalls=ChatService.mergeToolCallDeltas(aggregatedToolCalls,toolCalls,toolCallIndexOffset),aggregatedToolCalls.length===0))return;hasOpenToolCallBatch=!0;const serializedToolCalls=JSON.stringify(aggregatedToolCalls);serializedToolCalls&&(abortSignal?. +aborted||onToolCallChunk?.(serializedToolCalls))};try{let chunk="";for(;!abortSignal?.aborted;){const{done,value}=await reader.read();if(done||abortSignal?.aborted)break;chunk+=decoder.decode(value,{stream:!0});const lines=chunk.split(` `);chunk=lines.pop()||"";for(const line of lines){if(abortSignal?.aborted)break;if(line.startsWith(UrlProtocol.DATA)){const data=line.slice(6);if(data==="[DONE]"){streamFinished=!0;continue}try{const parsed=JSON.parse(data),choice=parsed.choices?.[0],content2=choice?.delta?.content,reasoningContent=choice?.delta?.reasoning_content,toolCalls=choice?.delta?.tool_calls,timings=parsed.timings,promptProgress=parsed.prompt_progress,chunkModel=ChatService.extractModelName(parsed);chunkModel&&!modelEmitted&& (modelEmitted=!0,onModel?.(chunkModel)),promptProgress&&ChatService.notifyTimings(void 0,promptProgress,onTimings),timings&&(ChatService.notifyTimings(timings,promptProgress,onTimings),lastTimings=timings),content2&&(finalizeOpenToolCallBatch(),aggregatedContent+=content2,abortSignal?.aborted||onChunk?.(content2)),reasoningContent&&(finalizeOpenToolCallBatch(),fullReasoningContent+=reasoningContent,abortSignal?.aborted||onReasoningChunk?.(reasoningContent)),processToolCallDelta(toolCalls)}catch(e){ console.error("Error parsing JSON chunk:",e)}}}if(abortSignal?.aborted)break}if(abortSignal?.aborted)return;if(streamFinished){finalizeOpenToolCallBatch();const finalToolCalls=aggregatedToolCalls.length>0?JSON.stringify(aggregatedToolCalls):void 0;onComplete?.(aggregatedContent,fullReasoningContent||void 0,lastTimings,finalToolCalls)}}catch(error2){const err=error2 instanceof Error?error2:new Error("Stream error");throw onError?.(err),err}finally{reader.releaseLock()}}static async handleNonStreamResponse(response,onComplete,onError,onToolCallChunk,onModel){ @@ -6506,108 +6506,108 @@ push("user message")):msg.role===MessageRole.ASSISTANT&&(assistantMessages++,mes return;if(filterByLeafNodeId(allMessages,activeConv.currNode||"",!1).some(m=>m.id===messageId)&&messageToDelete.parent){const siblings2=allMessages.filter(m=>m.parent===messageToDelete.parent&&m.id!==messageId);if(siblings2.length>0){const latestSibling=siblings2.reduce((latest,sibling2)=>sibling2.timestamp>latest.timestamp?sibling2:latest);await conversationsStore.updateCurrentNode(findLeafNode(allMessages,latestSibling.id))}else messageToDelete.parent&&await conversationsStore.updateCurrentNode( findLeafNode(allMessages,messageToDelete.parent))}await DatabaseService.deleteMessageCascading(activeConv.id,messageId),await conversationsStore.refreshActiveMessages(),conversationsStore.updateConversationTimestamp()}catch(error2){console.error("Failed to delete message:",error2)}}async continueAssistantMessage(messageId){const activeConv=conversationsStore.activeConversation;if(!activeConv||this.isChatLoadingInternal(activeConv.id))return;const result=this.getMessageByIdWithRole(messageId,MessageRole. ASSISTANT);if(!result)return;const{message:msg,index:idx}=result;try{this.showErrorDialog(null),this.setChatLoading(activeConv.id,!0),this.clearChatStreaming(activeConv.id);const allMessages=await conversationsStore.getConversationMessages(activeConv.id),dbMessage=findMessageById(allMessages,messageId);if(!dbMessage){this.setChatLoading(activeConv.id,!1);return}const originalContent=dbMessage.content,originalReasoning=dbMessage.reasoningContent||"",contextWithContinue=[...conversationsStore.activeMessages. -slice(0,idx),{role:MessageRole.ASSISTANT,content:originalContent,reasoning_content:originalReasoning||void 0}];let appendedContent="",appendedReasoning="",hasReceivedContent=!1;const updateStreamingContent=fullContent=>{this.setChatStreaming(msg.convId,fullContent,msg.id),conversationsStore.updateMessageAtIndex(idx,{content:fullContent})},abortController=this.getOrCreateAbortController(msg.convId);await ChatService.sendMessage(contextWithContinue,{...this.getApiOptions(),onChunk:chunk=>{appendedContent+= -chunk,hasReceivedContent=!0,updateStreamingContent(originalContent+appendedContent)},onReasoningChunk:chunk=>{appendedReasoning+=chunk,hasReceivedContent=!0,this.setChatStreaming(msg.convId,originalContent+appendedContent,msg.id),conversationsStore.updateMessageAtIndex(idx,{reasoningContent:originalReasoning+appendedReasoning})},onTimings:(timings,promptProgress)=>{const tokensPerSecond=timings?.predicted_ms&&timings?.predicted_n?timings.predicted_n/timings.predicted_ms*1e3:0;this.updateProcessingStateFromTimings( -{prompt_n:timings?.prompt_n||0,prompt_ms:timings?.prompt_ms,predicted_n:timings?.predicted_n||0,predicted_per_second:tokensPerSecond,cache_n:timings?.cache_n||0,prompt_progress:promptProgress},msg.convId)},onComplete:async(finalContent,reasoningContent,timings)=>{const finalAppendedContent=hasReceivedContent?appendedContent:finalContent||"",finalAppendedReasoning=hasReceivedContent?appendedReasoning:reasoningContent||"",fullContent=originalContent+finalAppendedContent,fullReasoning=originalReasoning+ -finalAppendedReasoning||void 0;await DatabaseService.updateMessage(msg.id,{content:fullContent,reasoningContent:fullReasoning,timestamp:Date.now(),timings}),conversationsStore.updateMessageAtIndex(idx,{content:fullContent,reasoningContent:fullReasoning,timestamp:Date.now(),timings}),conversationsStore.updateConversationTimestamp(),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg.convId,null)},onError:async error2=>{if(isAbortError(error2)){hasReceivedContent&& -appendedContent&&(await DatabaseService.updateMessage(msg.id,{content:originalContent+appendedContent,reasoningContent:originalReasoning+appendedReasoning||void 0,timestamp:Date.now()}),conversationsStore.updateMessageAtIndex(idx,{content:originalContent+appendedContent,reasoningContent:originalReasoning+appendedReasoning||void 0,timestamp:Date.now()})),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg.convId,null);return}console.error("Continue g\ -eneration error:",error2),conversationsStore.updateMessageAtIndex(idx,{content:originalContent}),await DatabaseService.updateMessage(msg.id,{content:originalContent}),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg.convId,null),this.showErrorDialog({type:error2.name==="TimeoutError"?ErrorDialogType.TIMEOUT:ErrorDialogType.SERVER,message:error2.message})}},msg.convId,abortController.signal)}catch(error2){isAbortError(error2)||console.error("Faile\ -d to continue message:",error2),activeConv&&this.setChatLoading(activeConv.id,!1)}}async editAssistantMessage(messageId,newContent,shouldBranch){const activeConv=conversationsStore.activeConversation;if(!activeConv||this.isChatLoadingInternal(activeConv.id))return;const result=this.getMessageByIdWithRole(messageId,MessageRole.ASSISTANT);if(!result)return;const{message:msg,index:idx}=result;try{if(shouldBranch){const newMessage=await DatabaseService.createMessageBranch({convId:msg.convId,type:msg. -type,timestamp:Date.now(),role:msg.role,content:newContent,toolCalls:msg.toolCalls||"",children:[],model:msg.model},msg.parent);await conversationsStore.updateCurrentNode(newMessage.id)}else await DatabaseService.updateMessage(msg.id,{content:newContent}),conversationsStore.updateMessageAtIndex(idx,{content:newContent});conversationsStore.updateConversationTimestamp(),await conversationsStore.refreshActiveMessages()}catch(error2){console.error("Failed to edit assistant message:",error2)}}async editUserMessagePreserveResponses(messageId,newContent,newExtras){ -const activeConv=conversationsStore.activeConversation;if(!activeConv)return;const result=this.getMessageByIdWithRole(messageId,MessageRole.USER);if(!result)return;const{message:msg,index:idx}=result;try{const updateData={content:newContent};newExtras!==void 0&&(updateData.extra=JSON.parse(JSON.stringify(newExtras))),await DatabaseService.updateMessage(messageId,updateData),conversationsStore.updateMessageAtIndex(idx,updateData);const rootMessage=(await conversationsStore.getConversationMessages( -activeConv.id)).find(m=>m.type==="root"&&m.parent===null);rootMessage&&msg.parent===rootMessage.id&&newContent.trim()&&await conversationsStore.updateConversationTitleWithConfirmation(activeConv.id,generateConversationTitle(newContent,!!config$1().titleGenerationUseFirstLine)),conversationsStore.updateConversationTimestamp()}catch(error2){console.error("Failed to edit user message:",error2)}}async editMessageWithBranching(messageId,newContent,newExtras){const activeConv=conversationsStore.activeConversation; -if(!activeConv||this.isChatLoadingInternal(activeConv.id))return;let result=this.getMessageByIdWithRole(messageId,MessageRole.USER);if(result||(result=this.getMessageByIdWithRole(messageId,MessageRole.SYSTEM)),!result)return;const{message:msg,index:idx}=result;try{const allMessages=await conversationsStore.getConversationMessages(activeConv.id),rootMessage=allMessages.find(m=>m.type==="root"&&m.parent===null),isFirstUserMessage=msg.role===MessageRole.USER&&rootMessage&&msg.parent===rootMessage.id, -extrasToUse=newExtras!==void 0?JSON.parse(JSON.stringify(newExtras)):msg.extra?JSON.parse(JSON.stringify(msg.extra)):void 0;let messageIdForResponse;const dbMsg=findMessageById(allMessages,msg.id);if(dbMsg?dbMsg.children.length>0:msg.children.length>0){const parentId=msg.parent||rootMessage?.id;if(!parentId)return;const newMessage=await DatabaseService.createMessageBranch({convId:msg.convId,type:msg.type,timestamp:Date.now(),role:msg.role,content:newContent,toolCalls:msg.toolCalls||"",children:[], -extra:extrasToUse,model:msg.model},parentId);await conversationsStore.updateCurrentNode(newMessage.id),messageIdForResponse=newMessage.id}else{const updates={content:newContent,timestamp:Date.now(),extra:extrasToUse};await DatabaseService.updateMessage(msg.id,updates),conversationsStore.updateMessageAtIndex(idx,updates),messageIdForResponse=msg.id}conversationsStore.updateConversationTimestamp(),isFirstUserMessage&&newContent.trim()&&await conversationsStore.updateConversationTitleWithConfirmation( -activeConv.id,generateConversationTitle(newContent,!!config$1().titleGenerationUseFirstLine)),await conversationsStore.refreshActiveMessages(),msg.role===MessageRole.USER&&await this.generateResponseForMessage(messageIdForResponse)}catch(error2){console.error("Failed to edit message with branching:",error2)}}async generateResponseForMessage(userMessageId){const activeConv=conversationsStore.activeConversation;if(activeConv){this.showErrorDialog(null),this.setChatLoading(activeConv.id,!0),this.clearChatStreaming( -activeConv.id);try{const allMessages=await conversationsStore.getConversationMessages(activeConv.id),conversationPath=filterByLeafNodeId(allMessages,userMessageId,!1),assistantMessage=await DatabaseService.createMessageBranch({convId:activeConv.id,type:MessageType.TEXT,timestamp:Date.now(),role:MessageRole.ASSISTANT,content:"",toolCalls:"",children:[],model:null},userMessageId);conversationsStore.addMessageToActive(assistantMessage),await this.streamChatCompletion(conversationPath,assistantMessage)}catch(error2){ -console.error("Failed to generate response:",error2),this.setChatLoading(activeConv.id,!1)}}}getContextTotal(){const activeConvId=this.activeConversationId,activeState=activeConvId?this.getProcessingState(activeConvId):null;if(activeState&&typeof activeState.contextTotal=="number"&&activeState.contextTotal>0)return activeState.contextTotal;if(isRouterMode()){const modelContextSize=selectedModelContextSize();if(typeof modelContextSize=="number"&&modelContextSize>0)return modelContextSize}else{const propsContextSize=contextSize(); -if(typeof propsContextSize=="number"&&propsContextSize>0)return propsContextSize}return null}updateProcessingStateFromTimings(timingData,conversationId){const processingState=this.parseTimingData(timingData);if(processingState===null){console.warn("Failed to parse timing data - skipping update");return}const targetId=conversationId||this.activeConversationId;targetId&&this.setProcessingState(targetId,processingState)}parseTimingData(timingData){const promptTokens=timingData.prompt_n||0,promptMs=timingData. -prompt_ms||void 0,predictedTokens=timingData.predicted_n||0,tokensPerSecond=timingData.predicted_per_second||0,cacheTokens=timingData.cache_n||0,promptProgress=timingData.prompt_progress,contextTotal=this.getContextTotal(),currentConfig=config$1(),outputTokensMax=currentConfig.max_tokens||-1,contextUsed=promptTokens+cacheTokens+predictedTokens,outputTokensUsed=predictedTokens,progressCache=promptProgress?.cache||0,progressActualDone=(promptProgress?.processed??0)-progressCache,progressActualTotal=(promptProgress?. -total??0)-progressCache,progressPercent=promptProgress?Math.round(progressActualDone/progressActualTotal*100):void 0;return{status:predictedTokens>0?"generating":promptProgress?"preparing":"idle",tokensDecoded:predictedTokens,tokensRemaining:outputTokensMax-predictedTokens,contextUsed,contextTotal,outputTokensUsed,outputTokensMax,hasNextToken:predictedTokens>0,tokensPerSecond,temperature:currentConfig.temperature??.8,topP:currentConfig.top_p??.95,speculative:!1,progressPercent,promptProgress,promptTokens, -promptMs,cacheTokens}}restoreProcessingStateFromMessages(messages,conversationId){for(let i=messages.length-1;i>=0;i--){const message=messages[i];if(message.role===MessageRole.ASSISTANT&&message.timings){const restoredState=this.parseTimingData({prompt_n:message.timings.prompt_n||0,prompt_ms:message.timings.prompt_ms,predicted_n:message.timings.predicted_n||0,predicted_per_second:message.timings.predicted_n&&message.timings.predicted_ms?message.timings.predicted_n/message.timings.predicted_ms*1e3: -0,cache_n:message.timings.cache_n||0});if(restoredState){this.setProcessingState(conversationId,restoredState);return}}}}getConversationModel(messages){for(let i=messages.length-1;i>=0;i--){const message=messages[i];if(message.role===MessageRole.ASSISTANT&&message.model)return message.model}return null}getApiOptions(){const currentConfig=config$1(),hasValue=value=>value!=null&&value!=="",apiOptions={stream:!0,timings_per_token:!0};if(isRouterMode()){const modelName=selectedModelName();modelName&& -(apiOptions.model=modelName)}return currentConfig.systemMessage&&(apiOptions.systemMessage=currentConfig.systemMessage),currentConfig.disableReasoningParsing&&(apiOptions.disableReasoningParsing=!0),currentConfig.excludeReasoningFromContext&&(apiOptions.excludeReasoningFromContext=!0),hasValue(currentConfig.temperature)&&(apiOptions.temperature=Number(currentConfig.temperature)),hasValue(currentConfig.max_tokens)&&(apiOptions.max_tokens=Number(currentConfig.max_tokens)),hasValue(currentConfig.dynatemp_range)&& -(apiOptions.dynatemp_range=Number(currentConfig.dynatemp_range)),hasValue(currentConfig.dynatemp_exponent)&&(apiOptions.dynatemp_exponent=Number(currentConfig.dynatemp_exponent)),hasValue(currentConfig.top_k)&&(apiOptions.top_k=Number(currentConfig.top_k)),hasValue(currentConfig.top_p)&&(apiOptions.top_p=Number(currentConfig.top_p)),hasValue(currentConfig.min_p)&&(apiOptions.min_p=Number(currentConfig.min_p)),hasValue(currentConfig.xtc_probability)&&(apiOptions.xtc_probability=Number(currentConfig. -xtc_probability)),hasValue(currentConfig.xtc_threshold)&&(apiOptions.xtc_threshold=Number(currentConfig.xtc_threshold)),hasValue(currentConfig.typ_p)&&(apiOptions.typ_p=Number(currentConfig.typ_p)),hasValue(currentConfig.repeat_last_n)&&(apiOptions.repeat_last_n=Number(currentConfig.repeat_last_n)),hasValue(currentConfig.repeat_penalty)&&(apiOptions.repeat_penalty=Number(currentConfig.repeat_penalty)),hasValue(currentConfig.presence_penalty)&&(apiOptions.presence_penalty=Number(currentConfig.presence_penalty)), -hasValue(currentConfig.frequency_penalty)&&(apiOptions.frequency_penalty=Number(currentConfig.frequency_penalty)),hasValue(currentConfig.dry_multiplier)&&(apiOptions.dry_multiplier=Number(currentConfig.dry_multiplier)),hasValue(currentConfig.dry_base)&&(apiOptions.dry_base=Number(currentConfig.dry_base)),hasValue(currentConfig.dry_allowed_length)&&(apiOptions.dry_allowed_length=Number(currentConfig.dry_allowed_length)),hasValue(currentConfig.dry_penalty_last_n)&&(apiOptions.dry_penalty_last_n=Number( -currentConfig.dry_penalty_last_n)),currentConfig.samplers&&(apiOptions.samplers=currentConfig.samplers),apiOptions.backend_sampling=currentConfig.backend_sampling,currentConfig.custom&&(apiOptions.custom=currentConfig.custom),apiOptions}cancelPreEncode(){this.preEncodeAbortController&&(this.preEncodeAbortController.abort(),this.preEncodeAbortController=null)}async triggerPreEncode(allMessages,assistantMessage,assistantContent,model,excludeReasoning){this.cancelPreEncode(),this.preEncodeAbortController= -new AbortController;const signal=this.preEncodeAbortController.signal;try{if(!await ChatService.areAllSlotsIdle(model,signal)||signal.aborted)return;const messagesWithAssistant=[...allMessages,{...assistantMessage,content:assistantContent}];await ChatService.preEncode(messagesWithAssistant,model,excludeReasoning,signal)}catch(err){isAbortError(err)||console.warn("[ChatStore] Pre-encode failed:",err)}}}const chatStore=new ChatStore,activeProcessingState=()=>chatStore.activeProcessingState,errorDialog=()=>chatStore. -errorDialogState,getAddFilesHandler=()=>chatStore.getAddFilesHandler(),getAllLoadingChats=()=>chatStore.getAllLoadingChats(),isChatStreaming=()=>chatStore.isStreaming(),isEditing=()=>chatStore.isEditing(),isLoading=()=>chatStore.isLoading,pendingEditMessageId=()=>chatStore.pendingEditMessageId,chatPendingMessageContent=convId=>chatStore.pendingMessageContent(convId),chatPendingMessageExtras=convId=>chatStore.pendingMessageExtras(convId),chatClearPendingMessage=convId=>chatStore.clearPendingMessage( -convId),chatInjectPendingMessage=(convId,content2,extras)=>chatStore.injectPendingMessage(convId,content2,extras);var root$1u=from_html('
',1);function ChatForm($$anchor,$$props){push$1($$props,!0);let attachments=prop($$props,"attachments",19,()=>[]),className=prop($$props,"cla\ -ss",3,""),disabled=prop($$props,"disabled",3,!1),isLoading2=prop($$props,"isLoading",3,!1),placeholder=prop($$props,"placeholder",3,"Type a message..."),showMcpPromptButton=prop($$props,"showMcpPromptButton",3,!1),showAddButton=prop($$props,"showAddButton",3,!0),showModelSelector=prop($$props,"showModelSelector",3,!0),uploadedFiles=prop($$props,"uploadedFiles",31,()=>proxy([])),value=prop($$props,"value",15,""),audioRecorder,chatFormActionsRef=state$1(void 0),fileInputRef=state$1(void 0),pickersRef=state$1( -void 0),textareaRef=state$1(void 0),isRecording=state$1(!1),recordingSupported=state$1(!1),isPromptPickerOpen=state$1(!1),promptSearchQuery=state$1(""),isInlineResourcePickerOpen=state$1(!1),resourceSearchQuery=state$1(""),isResourceDialogOpen=state$1(!1),preSelectedResourceUri=state$1(void 0),currentConfig=user_derived(config$1),pasteLongTextToFileLength=user_derived(()=>{const n=Number(get$3(currentConfig).pasteLongTextToFileLen);return Number.isNaN(n)?Number(SETTING_CONFIG_DEFAULT.pasteLongTextToFileLen): -n}),isRouter=user_derived(isRouterMode),conversationModel=user_derived(()=>chatStore.getConversationModel(activeMessages())),activeModelId=user_derived(()=>{const options=modelOptions();if(!get$3(isRouter))return options.length>0?options[0].model:null;const selectedId=selectedModelId();if(selectedId){const model=options.find(m=>m.id===selectedId);if(model)return model.model}if(get$3(conversationModel)){const model=options.find(m=>m.model===get$3(conversationModel));if(model)return model.model}return null}), -hasModelSelected=user_derived(()=>!get$3(isRouter)||!!get$3(conversationModel)||!!selectedModelId()),hasLoadingAttachments=user_derived(()=>uploadedFiles().some(f=>f.isLoading)),hasAttachments=user_derived(()=>attachments()&&attachments().length>0||uploadedFiles()&&uploadedFiles().length>0),canSubmit=user_derived(()=>value().trim().length>0||get$3(hasAttachments));onMount$1(()=>{set$1(recordingSupported,isAudioRecordingSupported(),!0),audioRecorder=new AudioRecorder});function focus2(){get$3(textareaRef)?. -focus()}function resetTextareaHeight(){get$3(textareaRef)?.resetHeight()}function openModelSelector(){get$3(chatFormActionsRef)?.openModelSelector()}function checkModelSelected(){return get$3(hasModelSelected)?!0:(get$3(chatFormActionsRef)?.openModelSelector(),!1)}function handleFileSelect(files){$$props.onFilesAdd?.(files)}function handleFileUpload(){get$3(fileInputRef)?.click()}function handleFileRemove(fileId){if(fileId.startsWith("attachment-")){const index2=parseInt(fileId.replace("attachme\ -nt-",""),10);!isNaN(index2)&&index2>=0&&index2item.kind==="file").map(item=>item.getAsFile()).filter(file=>file!==null);if(files.length>0){event2.preventDefault(),$$props.onFilesAdd?.(files);return}const text2=event2.clipboardData.getData(MimeTypeText.PLAIN);if(text2.startsWith(CLIPBOARD_CONTENT_QUOTE_PREFIX)){const parsed=parseClipboardContent(text2);if(parsed.textAttachments.length>0||parsed.mcpPromptAttachments.length> -0){if(event2.preventDefault(),value(parsed.message),$$props.onValueChange?.(parsed.message),parsed.textAttachments.length>0){const attachmentFiles=parsed.textAttachments.map(att=>new File([att.content],att.name,{type:MimeTypeText.PLAIN}));$$props.onFilesAdd?.(attachmentFiles)}if(parsed.mcpPromptAttachments.length>0){const mcpPromptFiles=parsed.mcpPromptAttachments.map(att=>({id:uuid$1(),name:att.name,size:att.content.length,type:SpecialFileType.MCP_PROMPT,file:new File([att.content],`${att.name}${FileExtensionText. -TXT}`,{type:MimeTypeText.PLAIN}),isLoading:!1,textContent:att.content,mcpPrompt:{serverName:att.serverName,promptName:att.promptName,arguments:att.arguments}}));uploadedFiles([...uploadedFiles(),...mcpPromptFiles]),$$props.onUploadedFilesChange?.(uploadedFiles())}setTimeout(()=>{get$3(textareaRef)?.focus()},10);return}}if(text2.length>0&&get$3(pasteLongTextToFileLength)>0&&text2.length>get$3(pasteLongTextToFileLength)){event2.preventDefault();const textFile=new File([text2],"Pasted",{type:MimeTypeText. -PLAIN});$$props.onFilesAdd?.([textFile])}}function handlePromptLoadStart(placeholderId,promptInfo,args){value().startsWith(PROMPT_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1(isPromptPickerOpen,!1),set$1(promptSearchQuery,"");const promptName=promptInfo.title||promptInfo.name,placeholder2={id:placeholderId,name:promptName,size:INITIAL_FILE_SIZE,type:SpecialFileType.MCP_PROMPT,file:new File([],"loading"),isLoading:!0,mcpPrompt:{serverName:promptInfo.serverName,promptName:promptInfo. -name,arguments:args?{...args}:void 0}};uploadedFiles([...uploadedFiles(),placeholder2]),$$props.onUploadedFilesChange?.(uploadedFiles()),get$3(textareaRef)?.focus()}function handlePromptLoadComplete(placeholderId,result){const promptText=result.messages?.map(msg=>typeof msg.content=="string"?msg.content:msg.content.type===ContentPartType.TEXT?msg.content.text:"").filter(Boolean).join(PROMPT_CONTENT_SEPARATOR);uploadedFiles(uploadedFiles().map(f=>f.id===placeholderId?{...f,isLoading:!1,textContent:promptText, -size:promptText.length,file:new File([promptText],`${f.name}${FileExtensionText.TXT}`,{type:MimeTypeText.PLAIN})}:f)),$$props.onUploadedFilesChange?.(uploadedFiles())}function handlePromptLoadError(placeholderId,error2){uploadedFiles(uploadedFiles().map(f=>f.id===placeholderId?{...f,isLoading:!1,loadError:error2}:f)),$$props.onUploadedFilesChange?.(uploadedFiles())}function handlePromptPickerClose(){set$1(isPromptPickerOpen,!1),set$1(promptSearchQuery,""),get$3(textareaRef)?.focus()}function handleInlineResourcePickerClose(){ -set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),get$3(textareaRef)?.focus()}function handleInlineResourceSelect(){value().startsWith(RESOURCE_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),get$3(textareaRef)?.focus()}function handleBrowseResources(){set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),value().startsWith(RESOURCE_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1( -isResourceDialogOpen,!0)}async function handleMicClick(){if(!audioRecorder||!get$3(recordingSupported)){console.warn("Audio recording not supported");return}if(get$3(isRecording)){set$1(isRecording,!1);try{const audioBlob=await audioRecorder.stopRecording(),wavBlob=await convertToWav(audioBlob),audioFile=createAudioFile(wavBlob);$$props.onFilesAdd?.([audioFile])}catch(error2){console.error("Failed to stop recording:",error2)}}else try{await audioRecorder.startRecording(),set$1(isRecording,!0)}catch(error2){ -console.error("Failed to start recording:",error2)}}var $$exports={focus:focus2,resetTextareaHeight,openModelSelector,checkModelSelected},fragment=root$1u(),node2=first_child(fragment);bind_this(ChatFormFileInputInvisible(node2,{onFileSelect:handleFileSelect}),$$value=>set$1(fileInputRef,$$value,!0),()=>get$3(fileInputRef));var form=sibling(node2,2),node_1=child(form);bind_this(ChatFormPickers(node_1,{get isPromptPickerOpen(){return get$3(isPromptPickerOpen)},get promptSearchQuery(){return get$3( -promptSearchQuery)},get isInlineResourcePickerOpen(){return get$3(isInlineResourcePickerOpen)},get resourceSearchQuery(){return get$3(resourceSearchQuery)},onPromptPickerClose:handlePromptPickerClose,onInlineResourcePickerClose:handleInlineResourcePickerClose,onInlineResourceSelect:handleInlineResourceSelect,onPromptLoadStart:handlePromptLoadStart,onPromptLoadComplete:handlePromptLoadComplete,onPromptLoadError:handlePromptLoadError,onInlineResourceBrowse:handleBrowseResources}),$$value=>set$1(pickersRef, -$$value,!0),()=>get$3(pickersRef));var div=sibling(node_1,2),node_2=child(div);{let $0=user_derived(()=>get$3(activeModelId)??void 0);ChatAttachmentsList(node_2,{get attachments(){return attachments()},onFileRemove:handleFileRemove,limitToSingleRow:!0,class:"py-5",style:"scroll-padding: 1rem;",get activeModelId(){return get$3($0)},get uploadedFiles(){return uploadedFiles()},set uploadedFiles($$value){uploadedFiles($$value)}})}var div_1=sibling(node_2,2),node_3=child(div_1);bind_this(ChatFormTextarea( -node_3,{class:"px-5 py-1.5 md:pt-0",onKeydown:handleKeydown,onInput:()=>{handleInput(),$$props.onValueChange?.(value())},get disabled(){return disabled()},get placeholder(){return placeholder()},get value(){return value()},set value($$value){value($$value)}}),$$value=>set$1(textareaRef,$$value,!0),()=>get$3(textareaRef));var node_4=sibling(node_3,2);{var consequent=$$anchor2=>{ChatFormMcpResourcesList($$anchor2,{class:"mb-3",onResourceClick:uri2=>{set$1(preSelectedResourceUri,uri2,!0),set$1(isResourceDialogOpen, -!0)}})},d2=user_derived(()=>mcpHasResourceAttachments());if_block(node_4,$$render=>{get$3(d2)&&$$render(consequent)})}var node_5=sibling(node_4,2);{let $0=user_derived(()=>showMcpPromptButton()?()=>set$1(isPromptPickerOpen,!0):void 0);bind_this(ChatFormActions(node_5,{class:"px-3",get canSend(){return get$3(canSubmit)},get disabled(){return disabled()},get isLoading(){return isLoading2()},get isRecording(){return get$3(isRecording)},get showAddButton(){return showAddButton()},get showModelSelector(){ -return showModelSelector()},get uploadedFiles(){return uploadedFiles()},onFileUpload:handleFileUpload,onMicClick:handleMicClick,get onStop(){return $$props.onStop},onSystemPromptClick:()=>$$props.onSystemPromptClick?.({message:value(),files:uploadedFiles()}),get onMcpPromptClick(){return get$3($0)},onMcpResourcesClick:()=>set$1(isResourceDialogOpen,!0)}),$$value=>set$1(chatFormActionsRef,$$value,!0),()=>get$3(chatFormActionsRef))}reset(div_1),reset(div),reset(form);var node_6=sibling(form,2);return DialogMcpResourcesBrowser( -node_6,{get preSelectedUri(){return get$3(preSelectedResourceUri)},onAttach:resource=>{mcpStore.attachResource(resource.uri)},onOpenChange:newOpen=>{newOpen||set$1(preSelectedResourceUri,void 0)},get open(){return get$3(isResourceDialogOpen)},set open($$value){set$1(isResourceDialogOpen,$$value,!0)}}),template_effect(()=>{set_class(form,1,`relative ${className()??""}`),set_class(div,1,`${INPUT_CLASSES??""} overflow-hidden rounded-3xl backdrop-blur-md ${disabled()?"cursor-not-allowed opacity-60": -""}`)}),event("submit",form,event2=>{event2.preventDefault(),!(!get$3(canSubmit)||disabled()||get$3(hasLoadingAttachments))&&$$props.onSubmit?.()}),event("paste",div_1,handlePaste),append($$anchor,fragment),pop($$exports)}function Dropdown_menu_content($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),sideOffset=prop($$props,"sideOffset",3,4),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","sideOffset","portalProps","class"]);var fragment=comment$2(),node2=first_child( -fragment);component(node2,()=>Portal$2,($$anchor2,DropdownMenuPrimitive_Portal)=>{DropdownMenuPrimitive_Portal($$anchor2,spread_props(()=>$$props.portalProps,{children:($$anchor3,$$slotProps)=>{var fragment_1=comment$2(),node_1=first_child(fragment_1);{let $0=user_derived(()=>cn$1("z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1.5 text-po\ -pover-foreground shadow-md outline-none data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20",$$props.class));component(node_1,()=>Dropdown_menu_content$1, -($$anchor4,DropdownMenuPrimitive_Content)=>{DropdownMenuPrimitive_Content($$anchor4,spread_props({"data-slot":"dropdown-menu-content",get sideOffset(){return sideOffset()},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor3,fragment_1)},$$slots:{default:!0}}))}),append($$anchor,fragment),pop()}function Dropdown_menu_item($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),variant=prop($$props,"variant", -3,"default"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","inset","variant"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("relative flex cursor-pointer items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-highlighted:bg-accent data-highlighted:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 data-[variant=destructive]:text-destructive data-[var\ -iant=destructive]:data-highlighted:bg-destructive/10 data-[variant=destructive]:data-highlighted:text-destructive dark:data-[variant=destructive]:data-highlighted:bg-destructive/20 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground data-[variant=destructive]:*:[svg]:!text-destructive",$$props.class));component(node2,()=>Menu_item,($$anchor2,DropdownMenuPrimitive_Item)=>{DropdownMenuPrimitive_Item($$anchor2,spread_props( -{"data-slot":"dropdown-menu-item",get"data-inset"(){return $$props.inset},get"data-variant"(){return variant()},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Dropdown_menu_separator($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived( -()=>cn$1("-mx-1 my-1 h-px bg-border/20",$$props.class));component(node2,()=>Menu_separator,($$anchor2,DropdownMenuPrimitive_Separator)=>{DropdownMenuPrimitive_Separator($$anchor2,spread_props({"data-slot":"dropdown-menu-separator",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Dropdown_menu_trigger($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props, -["$$slots","$$events","$$legacy","ref"]);var fragment=comment$2(),node2=first_child(fragment);component(node2,()=>Menu_trigger,($$anchor2,DropdownMenuPrimitive_Trigger)=>{DropdownMenuPrimitive_Trigger($$anchor2,spread_props({"data-slot":"dropdown-menu-trigger"},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))}),append($$anchor,fragment),pop()}function Dropdown_menu_sub_content($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props( -$$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1.5 text-popover-foreground shadow-md outline-none data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-\ -in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20",$$props.class));component(node2,()=>Menu_sub_content,($$anchor2,DropdownMenuPrimitive_SubContent)=>{DropdownMenuPrimitive_SubContent($$anchor2,spread_props({"data-slot":"dropdown-menu-sub-content",get class(){ -return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}var root_1$P=from_html(" ",1);function Dropdown_menu_sub_trigger($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","inset","children"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("flex cursor-default items-center gap-2 rounded\ --sm px-2 py-1.5 text-sm outline-hidden select-none data-highlighted:bg-accent data-highlighted:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 data-[state=open]:bg-accent data-[state=open]:text-accent-foreground [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground",$$props.class));component(node2,()=>Menu_sub_trigger,($$anchor2,DropdownMenuPrimitive_SubTrigger)=>{ -DropdownMenuPrimitive_SubTrigger($$anchor2,spread_props({"data-slot":"dropdown-menu-sub-trigger",get"data-inset"(){return $$props.inset},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)},children:($$anchor3,$$slotProps)=>{var fragment_1=root_1$P(),node_1=first_child(fragment_1);snippet(node_1,()=>$$props.children??noop$3);var node_2=sibling(node_1,2);Chevron_right(node_2,{class:"ml-auto size-4"}),append($$anchor3,fragment_1)},$$slots:{default:!0}}))})} -append($$anchor,fragment),pop()}const Sub=Menu_sub,Root$4=Menu;function useAttachmentMenu(getFlags,getCallbacks,close2){const modalityFlags=user_derived(getFlags),callbacks=user_derived(()=>{const cbs=getCallbacks(),wrap2=fn=>()=>{close2(),fn?.()};return{[AttachmentAction.FILE_UPLOAD]:wrap2(cbs.onFileUpload),[AttachmentAction.SYSTEM_PROMPT_CLICK]:wrap2(cbs.onSystemPromptClick),[AttachmentAction.MCP_PROMPT_CLICK]:wrap2(cbs.onMcpPromptClick),[AttachmentAction.MCP_RESOURCES_CLICK]:wrap2(cbs.onMcpResourcesClick)}}); -function isItemEnabled(enabledWhen){return!enabledWhen||enabledWhen==="always"?!0:!!get$3(modalityFlags)[enabledWhen]}function isItemVisible(visibleWhen){return visibleWhen?!!get$3(modalityFlags)[visibleWhen]:!0}function getSystemMessageTooltip(){return page$1.params.id?"Inject custom system message at the beginning of the conversation":"Add custom system message for a new conversation"}return{get callbacks(){return get$3(callbacks)},isItemEnabled,isItemVisible,getSystemMessageTooltip}}var root_6$s=from_html( -" ",1),root_10$f=from_html(" ",1),root_11$f=from_html("

"),root_8$r=from_html(" ",1),root_16$6=from_html(" ",1),root_17$8=from_html("

PDFs will be converted to text. Image-based PDFs may not work properly.

"),root_13$d=from_html(" ",1),root_22$2=from_html(" ",1),root_23$5=from_html("

"),root_20$5=from_html(" ",1),root_26$2=from_html(" ",1),root_3$S=from_html(" \ - ",1),root_1$O=from_html(" ",1),root$1t=from_html("
");function ChatFormActionAddDropdown($$anchor,$$props){push$1($$props,!0);let className=prop($$props,"class",3,""),disabled=prop($$props,"disabled",3,!1),hasAudioModality=prop($$props,"hasAudioModality",3,!1),hasVisionModality=prop($$props,"hasVisionModality",3,!1),hasMcpPromptsSupport=prop($$props,"hasMcpPromptsSupport",3,!1),hasMcpResourcesSupport=prop($$props,"hasMcpResourcesSupport",3,!1),dropdownOpen=state$1(!1); -function handleMcpSettingsClick(){set$1(dropdownOpen,!1),$$props.onMcpSettingsClick?.()}const attachmentMenu=useAttachmentMenu(()=>({hasVisionModality:hasVisionModality(),hasAudioModality:hasAudioModality(),hasMcpPromptsSupport:hasMcpPromptsSupport(),hasMcpResourcesSupport:hasMcpResourcesSupport()}),()=>({onFileUpload:$$props.onFileUpload,onSystemPromptClick:$$props.onSystemPromptClick,onMcpPromptClick:$$props.onMcpPromptClick,onMcpResourcesClick:$$props.onMcpResourcesClick}),()=>{set$1(dropdownOpen, -!1)});var div=root$1t(),node2=child(div);component(node2,()=>Root$4,($$anchor2,DropdownMenu_Root)=>{DropdownMenu_Root($$anchor2,{get open(){return get$3(dropdownOpen)},set open($$value){set$1(dropdownOpen,$$value,!0)},children:($$anchor3,$$slotProps)=>{var fragment=root_1$O(),node_1=first_child(fragment);component(node_1,()=>Dropdown_menu_trigger,($$anchor4,DropdownMenu_Trigger)=>{DropdownMenu_Trigger($$anchor4,{name:"Attach files",get disabled(){return disabled()},children:($$anchor5,$$slotProps2)=>{ -var fragment_1=comment$2(),node_2=first_child(fragment_1);snippet(node_2,()=>$$props.trigger,()=>({disabled:disabled()})),append($$anchor5,fragment_1)},$$slots:{default:!0}})});var node_3=sibling(node_1,2);component(node_3,()=>Dropdown_menu_content,($$anchor4,DropdownMenu_Content)=>{DropdownMenu_Content($$anchor4,{align:"start",class:"w-48",children:($$anchor5,$$slotProps2)=>{var fragment_2=root_3$S(),node_4=first_child(fragment_2);each(node_4,17,()=>ATTACHMENT_FILE_ITEMS,item=>item.id,($$anchor6,item)=>{ -const enabled=user_derived(()=>attachmentMenu.isItemEnabled(get$3(item).enabledWhen));var fragment_3=comment$2(),node_5=first_child(fragment_3);{var consequent=$$anchor7=>{var fragment_4=comment$2(),node_6=first_child(fragment_4);{let $0=user_derived(()=>get$3(item).class??"");component(node_6,()=>Dropdown_menu_item,($$anchor8,DropdownMenu_Item)=>{DropdownMenu_Item($$anchor8,{get class(){return`${get$3($0)??""} flex cursor-pointer items-center gap-2`},onclick:()=>attachmentMenu.callbacks[get$3(item). -action](),children:($$anchor9,$$slotProps3)=>{var fragment_5=root_6$s(),node_7=first_child(fragment_5);component(node_7,()=>get$3(item).icon,($$anchor10,item_icon)=>{item_icon($$anchor10,{class:"h-4 w-4"})});var span=sibling(node_7,2),text2=child(span,!0);reset(span),template_effect(()=>set_text(text2,get$3(item).label)),append($$anchor9,fragment_5)},$$slots:{default:!0}})})}append($$anchor7,fragment_4)},consequent_1=$$anchor7=>{var fragment_6=comment$2(),node_8=first_child(fragment_6);component( -node_8,()=>Root$5,($$anchor8,Tooltip_Root)=>{Tooltip_Root($$anchor8,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor9,$$slotProps3)=>{var fragment_7=root_8$r(),node_9=first_child(fragment_7);component(node_9,()=>Tooltip_trigger,($$anchor10,Tooltip_Trigger)=>{Tooltip_Trigger($$anchor10,{class:"w-full",children:($$anchor11,$$slotProps4)=>{var fragment_8=comment$2(),node_10=first_child(fragment_8);{let $0=user_derived(()=>get$3(item).class??"");component(node_10,()=>Dropdown_menu_item, -($$anchor12,DropdownMenu_Item_1)=>{DropdownMenu_Item_1($$anchor12,{get class(){return`${get$3($0)??""} flex cursor-pointer items-center gap-2`},disabled:!0,children:($$anchor13,$$slotProps5)=>{var fragment_9=root_10$f(),node_11=first_child(fragment_9);component(node_11,()=>get$3(item).icon,($$anchor14,item_icon_1)=>{item_icon_1($$anchor14,{class:"h-4 w-4"})});var span_1=sibling(node_11,2),text_1=child(span_1,!0);reset(span_1),template_effect(()=>set_text(text_1,get$3(item).label)),append($$anchor13, -fragment_9)},$$slots:{default:!0}})})}append($$anchor11,fragment_8)},$$slots:{default:!0}})});var node_12=sibling(node_9,2);component(node_12,()=>Tooltip_content,($$anchor10,Tooltip_Content)=>{Tooltip_Content($$anchor10,{side:"right",children:($$anchor11,$$slotProps4)=>{var p2=root_11$f(),text_2=child(p2,!0);reset(p2),template_effect(()=>set_text(text_2,get$3(item).disabledTooltip)),append($$anchor11,p2)},$$slots:{default:!0}})}),append($$anchor9,fragment_7)},$$slots:{default:!0}})}),append($$anchor7, -fragment_6)};if_block(node_5,$$render=>{get$3(enabled)?$$render(consequent):get$3(item).disabledTooltip&&$$render(consequent_1,1)})}append($$anchor6,fragment_3)});var node_13=sibling(node_4,2);{var consequent_3=$$anchor6=>{var fragment_10=comment$2(),node_14=first_child(fragment_10);component(node_14,()=>Root$5,($$anchor7,Tooltip_Root_1)=>{Tooltip_Root_1($$anchor7,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor8,$$slotProps3)=>{var fragment_11=root_13$d(),node_15=first_child( -fragment_11);component(node_15,()=>Tooltip_trigger,($$anchor9,Tooltip_Trigger_1)=>{Tooltip_Trigger_1($$anchor9,{class:"w-full",children:($$anchor10,$$slotProps4)=>{var fragment_12=comment$2(),node_16=first_child(fragment_12);component(node_16,()=>Dropdown_menu_item,($$anchor11,DropdownMenu_Item_2)=>{DropdownMenu_Item_2($$anchor11,{class:"flex cursor-pointer items-center gap-2",get onclick(){return attachmentMenu.callbacks.onFileUpload},children:($$anchor12,$$slotProps5)=>{const pdfItem=user_derived( -()=>ATTACHMENT_FILE_ITEMS.find(i=>i.id===AttachmentMenuItemId.PDF));var fragment_13=comment$2(),node_17=first_child(fragment_13);{var consequent_2=$$anchor13=>{var fragment_14=root_16$6(),node_18=first_child(fragment_14);component(node_18,()=>get$3(pdfItem).icon,($$anchor14,pdfItem_icon)=>{pdfItem_icon($$anchor14,{class:"h-4 w-4"})});var span_2=sibling(node_18,2),text_3=child(span_2,!0);reset(span_2),template_effect(()=>set_text(text_3,get$3(pdfItem).label)),append($$anchor13,fragment_14)};if_block( -node_17,$$render=>{get$3(pdfItem)&&$$render(consequent_2)})}append($$anchor12,fragment_13)},$$slots:{default:!0}})}),append($$anchor10,fragment_12)},$$slots:{default:!0}})});var node_19=sibling(node_15,2);component(node_19,()=>Tooltip_content,($$anchor9,Tooltip_Content_1)=>{Tooltip_Content_1($$anchor9,{side:"right",children:($$anchor10,$$slotProps4)=>{var p_1=root_17$8();append($$anchor10,p_1)},$$slots:{default:!0}})}),append($$anchor8,fragment_11)},$$slots:{default:!0}})}),append($$anchor6,fragment_10)}, -d2=user_derived(()=>!attachmentMenu.isItemEnabled("hasVisionModality"));if_block(node_13,$$render=>{get$3(d2)&&$$render(consequent_3)})}var node_20=sibling(node_13,2);component(node_20,()=>Dropdown_menu_separator,($$anchor6,DropdownMenu_Separator)=>{DropdownMenu_Separator($$anchor6,{})});var node_21=sibling(node_20,2);each(node_21,17,()=>ATTACHMENT_EXTRA_ITEMS,item=>item.id,($$anchor6,item)=>{var fragment_15=comment$2(),node_22=first_child(fragment_15);{var consequent_4=$$anchor7=>{var fragment_16=comment$2(), -node_23=first_child(fragment_16);component(node_23,()=>Root$5,($$anchor8,Tooltip_Root_2)=>{Tooltip_Root_2($$anchor8,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor9,$$slotProps3)=>{var fragment_17=root_20$5(),node_24=first_child(fragment_17);component(node_24,()=>Tooltip_trigger,($$anchor10,Tooltip_Trigger_2)=>{Tooltip_Trigger_2($$anchor10,{class:"w-full",children:($$anchor11,$$slotProps4)=>{var fragment_18=comment$2(),node_25=first_child(fragment_18);component(node_25,()=>Dropdown_menu_item, -($$anchor12,DropdownMenu_Item_3)=>{DropdownMenu_Item_3($$anchor12,{class:"flex cursor-pointer items-center gap-2",onclick:()=>attachmentMenu.callbacks[get$3(item).action](),children:($$anchor13,$$slotProps5)=>{var fragment_19=root_22$2(),node_26=first_child(fragment_19);component(node_26,()=>get$3(item).icon,($$anchor14,item_icon_2)=>{item_icon_2($$anchor14,{class:"h-4 w-4"})});var span_3=sibling(node_26,2),text_4=child(span_3,!0);reset(span_3),template_effect(()=>set_text(text_4,get$3(item).label)), -append($$anchor13,fragment_19)},$$slots:{default:!0}})}),append($$anchor11,fragment_18)},$$slots:{default:!0}})});var node_27=sibling(node_24,2);component(node_27,()=>Tooltip_content,($$anchor10,Tooltip_Content_2)=>{Tooltip_Content_2($$anchor10,{side:"right",children:($$anchor11,$$slotProps4)=>{var p_2=root_23$5(),text_5=child(p_2,!0);reset(p_2),template_effect($0=>set_text(text_5,$0),[()=>attachmentMenu.getSystemMessageTooltip()]),append($$anchor11,p_2)},$$slots:{default:!0}})}),append($$anchor9, -fragment_17)},$$slots:{default:!0}})}),append($$anchor7,fragment_16)};if_block(node_22,$$render=>{get$3(item).id===AttachmentMenuItemId.SYSTEM_MESSAGE&&$$render(consequent_4)})}append($$anchor6,fragment_15)});var node_28=sibling(node_21,2);ChatFormActionAddToolsSubmenu(node_28,{});var node_29=sibling(node_28,2);ChatFormActionAddMcpServersSubmenu(node_29,{onMcpSettingsClick:handleMcpSettingsClick});var node_30=sibling(node_29,2);each(node_30,17,()=>ATTACHMENT_MCP_ITEMS,item=>item.id,($$anchor6,item)=>{ -var fragment_20=comment$2(),node_31=first_child(fragment_20);{var consequent_5=$$anchor7=>{var fragment_21=comment$2(),node_32=first_child(fragment_21);component(node_32,()=>Dropdown_menu_item,($$anchor8,DropdownMenu_Item_4)=>{DropdownMenu_Item_4($$anchor8,{class:"flex cursor-pointer items-center gap-2",onclick:()=>attachmentMenu.callbacks[get$3(item).action](),children:($$anchor9,$$slotProps3)=>{var fragment_22=root_26$2(),node_33=first_child(fragment_22);component(node_33,()=>get$3(item).icon, -($$anchor10,item_icon_3)=>{item_icon_3($$anchor10,{class:"h-4 w-4"})});var span_4=sibling(node_33,2),text_6=child(span_4,!0);reset(span_4),template_effect(()=>set_text(text_6,get$3(item).label)),append($$anchor9,fragment_22)},$$slots:{default:!0}})}),append($$anchor7,fragment_21)},d_12=user_derived(()=>attachmentMenu.isItemVisible(get$3(item).visibleWhen));if_block(node_31,$$render=>{get$3(d_12)&&$$render(consequent_5)})}append($$anchor6,fragment_20)}),append($$anchor5,fragment_2)},$$slots:{default:!0}})}), -append($$anchor3,fragment)},$$slots:{default:!0}})}),reset(div),template_effect(()=>set_class(div,1,`flex items-center gap-1 ${className()??""}`)),append($$anchor,div),pop()}function Sheet_overlay($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("fixed inset-0 z-50 bg-black/50 data-[state=closed]:animate-out data\ --[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=open]:animate-in data-[state=open]:fade-in-0",$$props.class));component(node2,()=>Dialog_overlay$1,($$anchor2,SheetPrimitive_Overlay)=>{SheetPrimitive_Overlay($$anchor2,spread_props({"data-slot":"sheet-overlay",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}const sheetVariants=tv({base:`border-border/30 dark:border-border/20 da\ -ta-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fill-mode-forwards fixed z-50 flex flex-col gap-4 shadow-sm transition ease-in-out data-[state=closed]:duration-300 data-[state=open]:duration-500 ${PANEL_CLASSES}`,variants:{side:{top:"data-[state=closed]:slide-out-to-top data-[state=open]:slide-in-from-top inset-x-0 top-0 h-auto border-b",bottom:"data-[state=closed]:slide-out-to-bottom data-[state=open]:slide-in-from-bottom inset-x-0 bottom-0 h-auto border-t",left:"\ -data-[state=closed]:slide-out-to-left data-[state=open]:slide-in-from-left inset-y-0 left-0 h-full w-3/4 border-r sm:max-w-sm",right:"data-[state=closed]:slide-out-to-right data-[state=open]:slide-in-from-right inset-y-0 right-0 h-full w-3/4 border-l sm:max-w-sm"}},defaultVariants:{side:"right"}});var root_3$R=from_html(' Close',1),root_2$12=from_html(" ",1),root_1$N=from_html(" ",1);function Sheet_content($$anchor,$$props){push$1($$props,!0);let ref2=prop( -$$props,"ref",15,null),side=prop($$props,"side",3,"right"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","side","portalProps","children"]);var fragment=comment$2(),node2=first_child(fragment);component(node2,()=>Portal$2,($$anchor2,SheetPrimitive_Portal)=>{SheetPrimitive_Portal($$anchor2,spread_props(()=>$$props.portalProps,{children:($$anchor3,$$slotProps)=>{var fragment_1=root_1$N(),node_1=first_child(fragment_1);Sheet_overlay(node_1,{});var node_2=sibling(node_1, -2);{let $0=user_derived(()=>cn$1(sheetVariants({side:side()}),$$props.class));component(node_2,()=>Dialog_content$1,($$anchor4,SheetPrimitive_Content)=>{SheetPrimitive_Content($$anchor4,spread_props({"data-slot":"sheet-content",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)},children:($$anchor5,$$slotProps2)=>{var fragment_2=root_2$12(),node_3=first_child(fragment_2);snippet(node_3,()=>$$props.children??noop$3);var node_4=sibling(node_3,2);component( -node_4,()=>Dialog_close,($$anchor6,SheetPrimitive_Close)=>{SheetPrimitive_Close($$anchor6,{class:"absolute top-4 right-4 rounded-xs opacity-70 ring-offset-background transition-opacity hover:opacity-100 focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 focus-visible:outline-hidden disabled:pointer-events-none",children:($$anchor7,$$slotProps3)=>{var fragment_3=root_3$R(),node_5=first_child(fragment_3);X(node_5,{class:"size-4"}),next$1(2),append($$anchor7,fragment_3)},$$slots:{ -default:!0}})}),append($$anchor5,fragment_2)},$$slots:{default:!0}}))})}append($$anchor3,fragment_1)},$$slots:{default:!0}}))}),append($$anchor,fragment),pop()}var root$1s=from_html("
");function Sheet_header($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","children"]);var div=root$1s();attribute_effect(div,$0=>({"data-slot":"sheet-header",class:$0,...restProps}),[()=>cn$1("flex fle\ -x-col gap-1.5 p-4",$$props.class)]);var node2=child(div);snippet(node2,()=>$$props.children??noop$3),reset(div),bind_this(div,$$value=>ref2($$value),()=>ref2()),append($$anchor,div),pop()}function Sheet_title($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("font-semibold text-foreground",$$props.class));component( -node2,()=>Dialog_title$1,($$anchor2,SheetPrimitive_Title)=>{SheetPrimitive_Title($$anchor2,spread_props({"data-slot":"sheet-title",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Sheet_description($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment); -{let $0=user_derived(()=>cn$1("text-sm text-muted-foreground",$$props.class));component(node2,()=>Dialog_description$1,($$anchor2,SheetPrimitive_Description)=>{SheetPrimitive_Description($$anchor2,spread_props({"data-slot":"sheet-description",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}const Root$3=Dialog;var root_3$Q=from_html(" ",1),root_7$r=from_html(''),root_11$e=from_html("

"),root_9$k=from_html(" ",1),root_15$9=from_html(''),root_16$5=from_html("

PDFs will be converted to text. Image-based PDFs may not work properly.

"),root_14$5=from_html(" ",1),root_20$4=from_html(''),root_21$3=from_html("

"),root_19$9=from_html(" ",1),root_23$4=from_html(''),root_2$11=from_html(' ',1),root_1$M=from_html(" ",1),root$1r=from_html("
");function ChatFormActionAddSheet($$anchor,$$props){ +slice(0,idx),{role:MessageRole.ASSISTANT,content:originalContent,reasoning_content:originalReasoning||void 0}];let appendedContent="",appendedReasoning="",hasReceivedContent=!1;const updateStreamingContent=fullContent=>{this.setChatStreaming(msg.convId,fullContent,msg.id),conversationsStore.updateMessageAtIndex(idx,{content:fullContent})},abortController=this.getOrCreateAbortController(msg.convId);await ChatService.sendMessage(contextWithContinue,{...this.getApiOptions(),continueFinalMessage:!0, +onChunk:chunk=>{appendedContent+=chunk,hasReceivedContent=!0,updateStreamingContent(originalContent+appendedContent)},onReasoningChunk:chunk=>{appendedReasoning+=chunk,hasReceivedContent=!0,this.setChatStreaming(msg.convId,originalContent+appendedContent,msg.id),conversationsStore.updateMessageAtIndex(idx,{reasoningContent:originalReasoning+appendedReasoning})},onTimings:(timings,promptProgress)=>{const tokensPerSecond=timings?.predicted_ms&&timings?.predicted_n?timings.predicted_n/timings.predicted_ms* +1e3:0;this.updateProcessingStateFromTimings({prompt_n:timings?.prompt_n||0,prompt_ms:timings?.prompt_ms,predicted_n:timings?.predicted_n||0,predicted_per_second:tokensPerSecond,cache_n:timings?.cache_n||0,prompt_progress:promptProgress},msg.convId)},onComplete:async(finalContent,reasoningContent,timings)=>{const finalAppendedContent=hasReceivedContent?appendedContent:finalContent||"",finalAppendedReasoning=hasReceivedContent?appendedReasoning:reasoningContent||"",fullContent=originalContent+finalAppendedContent, +fullReasoning=originalReasoning+finalAppendedReasoning||void 0;await DatabaseService.updateMessage(msg.id,{content:fullContent,reasoningContent:fullReasoning,timestamp:Date.now(),timings}),conversationsStore.updateMessageAtIndex(idx,{content:fullContent,reasoningContent:fullReasoning,timestamp:Date.now(),timings}),conversationsStore.updateConversationTimestamp(),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg.convId,null)},onError:async error2=>{ +if(isAbortError(error2)){hasReceivedContent&&appendedContent&&(await DatabaseService.updateMessage(msg.id,{content:originalContent+appendedContent,reasoningContent:originalReasoning+appendedReasoning||void 0,timestamp:Date.now()}),conversationsStore.updateMessageAtIndex(idx,{content:originalContent+appendedContent,reasoningContent:originalReasoning+appendedReasoning||void 0,timestamp:Date.now()})),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg. +convId,null);return}console.error("Continue generation error:",error2),conversationsStore.updateMessageAtIndex(idx,{content:originalContent}),await DatabaseService.updateMessage(msg.id,{content:originalContent}),this.setChatLoading(msg.convId,!1),this.clearChatStreaming(msg.convId),this.setProcessingState(msg.convId,null),this.showErrorDialog({type:error2.name==="TimeoutError"?ErrorDialogType.TIMEOUT:ErrorDialogType.SERVER,message:error2.message})}},msg.convId,abortController.signal)}catch(error2){ +isAbortError(error2)||console.error("Failed to continue message:",error2),activeConv&&this.setChatLoading(activeConv.id,!1)}}async editAssistantMessage(messageId,newContent,shouldBranch){const activeConv=conversationsStore.activeConversation;if(!activeConv||this.isChatLoadingInternal(activeConv.id))return;const result=this.getMessageByIdWithRole(messageId,MessageRole.ASSISTANT);if(!result)return;const{message:msg,index:idx}=result;try{if(shouldBranch){const newMessage=await DatabaseService.createMessageBranch( +{convId:msg.convId,type:msg.type,timestamp:Date.now(),role:msg.role,content:newContent,toolCalls:msg.toolCalls||"",children:[],model:msg.model},msg.parent);await conversationsStore.updateCurrentNode(newMessage.id)}else await DatabaseService.updateMessage(msg.id,{content:newContent}),conversationsStore.updateMessageAtIndex(idx,{content:newContent});conversationsStore.updateConversationTimestamp(),await conversationsStore.refreshActiveMessages()}catch(error2){console.error("Failed to edit assistan\ +t message:",error2)}}async editUserMessagePreserveResponses(messageId,newContent,newExtras){const activeConv=conversationsStore.activeConversation;if(!activeConv)return;const result=this.getMessageByIdWithRole(messageId,MessageRole.USER);if(!result)return;const{message:msg,index:idx}=result;try{const updateData={content:newContent};newExtras!==void 0&&(updateData.extra=JSON.parse(JSON.stringify(newExtras))),await DatabaseService.updateMessage(messageId,updateData),conversationsStore.updateMessageAtIndex( +idx,updateData);const rootMessage=(await conversationsStore.getConversationMessages(activeConv.id)).find(m=>m.type==="root"&&m.parent===null);rootMessage&&msg.parent===rootMessage.id&&newContent.trim()&&await conversationsStore.updateConversationTitleWithConfirmation(activeConv.id,generateConversationTitle(newContent,!!config$1().titleGenerationUseFirstLine)),conversationsStore.updateConversationTimestamp()}catch(error2){console.error("Failed to edit user message:",error2)}}async editMessageWithBranching(messageId,newContent,newExtras){ +const activeConv=conversationsStore.activeConversation;if(!activeConv||this.isChatLoadingInternal(activeConv.id))return;let result=this.getMessageByIdWithRole(messageId,MessageRole.USER);if(result||(result=this.getMessageByIdWithRole(messageId,MessageRole.SYSTEM)),!result)return;const{message:msg,index:idx}=result;try{const allMessages=await conversationsStore.getConversationMessages(activeConv.id),rootMessage=allMessages.find(m=>m.type==="root"&&m.parent===null),isFirstUserMessage=msg.role===MessageRole. +USER&&rootMessage&&msg.parent===rootMessage.id,extrasToUse=newExtras!==void 0?JSON.parse(JSON.stringify(newExtras)):msg.extra?JSON.parse(JSON.stringify(msg.extra)):void 0;let messageIdForResponse;const dbMsg=findMessageById(allMessages,msg.id);if(dbMsg?dbMsg.children.length>0:msg.children.length>0){const parentId=msg.parent||rootMessage?.id;if(!parentId)return;const newMessage=await DatabaseService.createMessageBranch({convId:msg.convId,type:msg.type,timestamp:Date.now(),role:msg.role,content:newContent, +toolCalls:msg.toolCalls||"",children:[],extra:extrasToUse,model:msg.model},parentId);await conversationsStore.updateCurrentNode(newMessage.id),messageIdForResponse=newMessage.id}else{const updates={content:newContent,timestamp:Date.now(),extra:extrasToUse};await DatabaseService.updateMessage(msg.id,updates),conversationsStore.updateMessageAtIndex(idx,updates),messageIdForResponse=msg.id}conversationsStore.updateConversationTimestamp(),isFirstUserMessage&&newContent.trim()&&await conversationsStore. +updateConversationTitleWithConfirmation(activeConv.id,generateConversationTitle(newContent,!!config$1().titleGenerationUseFirstLine)),await conversationsStore.refreshActiveMessages(),msg.role===MessageRole.USER&&await this.generateResponseForMessage(messageIdForResponse)}catch(error2){console.error("Failed to edit message with branching:",error2)}}async generateResponseForMessage(userMessageId){const activeConv=conversationsStore.activeConversation;if(activeConv){this.showErrorDialog(null),this. +setChatLoading(activeConv.id,!0),this.clearChatStreaming(activeConv.id);try{const allMessages=await conversationsStore.getConversationMessages(activeConv.id),conversationPath=filterByLeafNodeId(allMessages,userMessageId,!1),assistantMessage=await DatabaseService.createMessageBranch({convId:activeConv.id,type:MessageType.TEXT,timestamp:Date.now(),role:MessageRole.ASSISTANT,content:"",toolCalls:"",children:[],model:null},userMessageId);conversationsStore.addMessageToActive(assistantMessage),await this. +streamChatCompletion(conversationPath,assistantMessage)}catch(error2){console.error("Failed to generate response:",error2),this.setChatLoading(activeConv.id,!1)}}}getContextTotal(){const activeConvId=this.activeConversationId,activeState=activeConvId?this.getProcessingState(activeConvId):null;if(activeState&&typeof activeState.contextTotal=="number"&&activeState.contextTotal>0)return activeState.contextTotal;if(isRouterMode()){const modelContextSize=selectedModelContextSize();if(typeof modelContextSize== +"number"&&modelContextSize>0)return modelContextSize}else{const propsContextSize=contextSize();if(typeof propsContextSize=="number"&&propsContextSize>0)return propsContextSize}return null}updateProcessingStateFromTimings(timingData,conversationId){const processingState=this.parseTimingData(timingData);if(processingState===null){console.warn("Failed to parse timing data - skipping update");return}const targetId=conversationId||this.activeConversationId;targetId&&this.setProcessingState(targetId,processingState)}parseTimingData(timingData){ +const promptTokens=timingData.prompt_n||0,promptMs=timingData.prompt_ms||void 0,predictedTokens=timingData.predicted_n||0,tokensPerSecond=timingData.predicted_per_second||0,cacheTokens=timingData.cache_n||0,promptProgress=timingData.prompt_progress,contextTotal=this.getContextTotal(),currentConfig=config$1(),outputTokensMax=currentConfig.max_tokens||-1,contextUsed=promptTokens+cacheTokens+predictedTokens,outputTokensUsed=predictedTokens,progressCache=promptProgress?.cache||0,progressActualDone=(promptProgress?. +processed??0)-progressCache,progressActualTotal=(promptProgress?.total??0)-progressCache,progressPercent=promptProgress?Math.round(progressActualDone/progressActualTotal*100):void 0;return{status:predictedTokens>0?"generating":promptProgress?"preparing":"idle",tokensDecoded:predictedTokens,tokensRemaining:outputTokensMax-predictedTokens,contextUsed,contextTotal,outputTokensUsed,outputTokensMax,hasNextToken:predictedTokens>0,tokensPerSecond,temperature:currentConfig.temperature??.8,topP:currentConfig. +top_p??.95,speculative:!1,progressPercent,promptProgress,promptTokens,promptMs,cacheTokens}}restoreProcessingStateFromMessages(messages,conversationId){for(let i=messages.length-1;i>=0;i--){const message=messages[i];if(message.role===MessageRole.ASSISTANT&&message.timings){const restoredState=this.parseTimingData({prompt_n:message.timings.prompt_n||0,prompt_ms:message.timings.prompt_ms,predicted_n:message.timings.predicted_n||0,predicted_per_second:message.timings.predicted_n&&message.timings.predicted_ms? +message.timings.predicted_n/message.timings.predicted_ms*1e3:0,cache_n:message.timings.cache_n||0});if(restoredState){this.setProcessingState(conversationId,restoredState);return}}}}getConversationModel(messages){for(let i=messages.length-1;i>=0;i--){const message=messages[i];if(message.role===MessageRole.ASSISTANT&&message.model)return message.model}return null}getApiOptions(){const currentConfig=config$1(),hasValue=value=>value!=null&&value!=="",apiOptions={stream:!0,timings_per_token:!0};if(isRouterMode()){ +const modelName=selectedModelName();modelName&&(apiOptions.model=modelName)}return currentConfig.systemMessage&&(apiOptions.systemMessage=currentConfig.systemMessage),currentConfig.disableReasoningParsing&&(apiOptions.disableReasoningParsing=!0),currentConfig.excludeReasoningFromContext&&(apiOptions.excludeReasoningFromContext=!0),hasValue(currentConfig.temperature)&&(apiOptions.temperature=Number(currentConfig.temperature)),hasValue(currentConfig.max_tokens)&&(apiOptions.max_tokens=Number(currentConfig. +max_tokens)),hasValue(currentConfig.dynatemp_range)&&(apiOptions.dynatemp_range=Number(currentConfig.dynatemp_range)),hasValue(currentConfig.dynatemp_exponent)&&(apiOptions.dynatemp_exponent=Number(currentConfig.dynatemp_exponent)),hasValue(currentConfig.top_k)&&(apiOptions.top_k=Number(currentConfig.top_k)),hasValue(currentConfig.top_p)&&(apiOptions.top_p=Number(currentConfig.top_p)),hasValue(currentConfig.min_p)&&(apiOptions.min_p=Number(currentConfig.min_p)),hasValue(currentConfig.xtc_probability)&& +(apiOptions.xtc_probability=Number(currentConfig.xtc_probability)),hasValue(currentConfig.xtc_threshold)&&(apiOptions.xtc_threshold=Number(currentConfig.xtc_threshold)),hasValue(currentConfig.typ_p)&&(apiOptions.typ_p=Number(currentConfig.typ_p)),hasValue(currentConfig.repeat_last_n)&&(apiOptions.repeat_last_n=Number(currentConfig.repeat_last_n)),hasValue(currentConfig.repeat_penalty)&&(apiOptions.repeat_penalty=Number(currentConfig.repeat_penalty)),hasValue(currentConfig.presence_penalty)&&(apiOptions. +presence_penalty=Number(currentConfig.presence_penalty)),hasValue(currentConfig.frequency_penalty)&&(apiOptions.frequency_penalty=Number(currentConfig.frequency_penalty)),hasValue(currentConfig.dry_multiplier)&&(apiOptions.dry_multiplier=Number(currentConfig.dry_multiplier)),hasValue(currentConfig.dry_base)&&(apiOptions.dry_base=Number(currentConfig.dry_base)),hasValue(currentConfig.dry_allowed_length)&&(apiOptions.dry_allowed_length=Number(currentConfig.dry_allowed_length)),hasValue(currentConfig. +dry_penalty_last_n)&&(apiOptions.dry_penalty_last_n=Number(currentConfig.dry_penalty_last_n)),currentConfig.samplers&&(apiOptions.samplers=currentConfig.samplers),apiOptions.backend_sampling=currentConfig.backend_sampling,currentConfig.custom&&(apiOptions.custom=currentConfig.custom),apiOptions}cancelPreEncode(){this.preEncodeAbortController&&(this.preEncodeAbortController.abort(),this.preEncodeAbortController=null)}async triggerPreEncode(allMessages,assistantMessage,assistantContent,model,excludeReasoning){ +this.cancelPreEncode(),this.preEncodeAbortController=new AbortController;const signal=this.preEncodeAbortController.signal;try{if(!await ChatService.areAllSlotsIdle(model,signal)||signal.aborted)return;const messagesWithAssistant=[...allMessages,{...assistantMessage,content:assistantContent}];await ChatService.preEncode(messagesWithAssistant,model,excludeReasoning,signal)}catch(err){isAbortError(err)||console.warn("[ChatStore] Pre-encode failed:",err)}}}const chatStore=new ChatStore,activeProcessingState=()=>chatStore. +activeProcessingState,errorDialog=()=>chatStore.errorDialogState,getAddFilesHandler=()=>chatStore.getAddFilesHandler(),getAllLoadingChats=()=>chatStore.getAllLoadingChats(),isChatStreaming=()=>chatStore.isStreaming(),isEditing=()=>chatStore.isEditing(),isLoading=()=>chatStore.isLoading,pendingEditMessageId=()=>chatStore.pendingEditMessageId,chatPendingMessageContent=convId=>chatStore.pendingMessageContent(convId),chatPendingMessageExtras=convId=>chatStore.pendingMessageExtras(convId),chatClearPendingMessage=convId=>chatStore. +clearPendingMessage(convId),chatInjectPendingMessage=(convId,content2,extras)=>chatStore.injectPendingMessage(convId,content2,extras);var root$1u=from_html('
',1);function ChatForm($$anchor,$$props){push$1($$props,!0);let attachments=prop($$props,"attachments",19,()=>[]),className=prop( +$$props,"class",3,""),disabled=prop($$props,"disabled",3,!1),isLoading2=prop($$props,"isLoading",3,!1),placeholder=prop($$props,"placeholder",3,"Type a message..."),showMcpPromptButton=prop($$props,"showMcpPromptButton",3,!1),showAddButton=prop($$props,"showAddButton",3,!0),showModelSelector=prop($$props,"showModelSelector",3,!0),uploadedFiles=prop($$props,"uploadedFiles",31,()=>proxy([])),value=prop($$props,"value",15,""),audioRecorder,chatFormActionsRef=state$1(void 0),fileInputRef=state$1(void 0), +pickersRef=state$1(void 0),textareaRef=state$1(void 0),isRecording=state$1(!1),recordingSupported=state$1(!1),isPromptPickerOpen=state$1(!1),promptSearchQuery=state$1(""),isInlineResourcePickerOpen=state$1(!1),resourceSearchQuery=state$1(""),isResourceDialogOpen=state$1(!1),preSelectedResourceUri=state$1(void 0),currentConfig=user_derived(config$1),pasteLongTextToFileLength=user_derived(()=>{const n=Number(get$3(currentConfig).pasteLongTextToFileLen);return Number.isNaN(n)?Number(SETTING_CONFIG_DEFAULT. +pasteLongTextToFileLen):n}),isRouter=user_derived(isRouterMode),conversationModel=user_derived(()=>chatStore.getConversationModel(activeMessages())),activeModelId=user_derived(()=>{const options=modelOptions();if(!get$3(isRouter))return options.length>0?options[0].model:null;const selectedId=selectedModelId();if(selectedId){const model=options.find(m=>m.id===selectedId);if(model)return model.model}if(get$3(conversationModel)){const model=options.find(m=>m.model===get$3(conversationModel));if(model) +return model.model}return null}),hasModelSelected=user_derived(()=>!get$3(isRouter)||!!get$3(conversationModel)||!!selectedModelId()),hasLoadingAttachments=user_derived(()=>uploadedFiles().some(f=>f.isLoading)),hasAttachments=user_derived(()=>attachments()&&attachments().length>0||uploadedFiles()&&uploadedFiles().length>0),canSubmit=user_derived(()=>value().trim().length>0||get$3(hasAttachments));onMount$1(()=>{set$1(recordingSupported,isAudioRecordingSupported(),!0),audioRecorder=new AudioRecorder}); +function focus2(){get$3(textareaRef)?.focus()}function resetTextareaHeight(){get$3(textareaRef)?.resetHeight()}function openModelSelector(){get$3(chatFormActionsRef)?.openModelSelector()}function checkModelSelected(){return get$3(hasModelSelected)?!0:(get$3(chatFormActionsRef)?.openModelSelector(),!1)}function handleFileSelect(files){$$props.onFilesAdd?.(files)}function handleFileUpload(){get$3(fileInputRef)?.click()}function handleFileRemove(fileId){if(fileId.startsWith("attachment-")){const index2=parseInt( +fileId.replace("attachment-",""),10);!isNaN(index2)&&index2>=0&&index2item.kind==="file").map(item=>item.getAsFile()).filter(file=>file!==null);if(files.length>0){event2.preventDefault(),$$props.onFilesAdd?.(files);return}const text2=event2.clipboardData.getData(MimeTypeText.PLAIN);if(text2.startsWith(CLIPBOARD_CONTENT_QUOTE_PREFIX)){const parsed=parseClipboardContent(text2);if(parsed.textAttachments.length>0||parsed. +mcpPromptAttachments.length>0){if(event2.preventDefault(),value(parsed.message),$$props.onValueChange?.(parsed.message),parsed.textAttachments.length>0){const attachmentFiles=parsed.textAttachments.map(att=>new File([att.content],att.name,{type:MimeTypeText.PLAIN}));$$props.onFilesAdd?.(attachmentFiles)}if(parsed.mcpPromptAttachments.length>0){const mcpPromptFiles=parsed.mcpPromptAttachments.map(att=>({id:uuid$1(),name:att.name,size:att.content.length,type:SpecialFileType.MCP_PROMPT,file:new File( +[att.content],`${att.name}${FileExtensionText.TXT}`,{type:MimeTypeText.PLAIN}),isLoading:!1,textContent:att.content,mcpPrompt:{serverName:att.serverName,promptName:att.promptName,arguments:att.arguments}}));uploadedFiles([...uploadedFiles(),...mcpPromptFiles]),$$props.onUploadedFilesChange?.(uploadedFiles())}setTimeout(()=>{get$3(textareaRef)?.focus()},10);return}}if(text2.length>0&&get$3(pasteLongTextToFileLength)>0&&text2.length>get$3(pasteLongTextToFileLength)){event2.preventDefault();const textFile=new File( +[text2],"Pasted",{type:MimeTypeText.PLAIN});$$props.onFilesAdd?.([textFile])}}function handlePromptLoadStart(placeholderId,promptInfo,args){value().startsWith(PROMPT_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1(isPromptPickerOpen,!1),set$1(promptSearchQuery,"");const promptName=promptInfo.title||promptInfo.name,placeholder2={id:placeholderId,name:promptName,size:INITIAL_FILE_SIZE,type:SpecialFileType.MCP_PROMPT,file:new File([],"loading"),isLoading:!0,mcpPrompt:{serverName:promptInfo. +serverName,promptName:promptInfo.name,arguments:args?{...args}:void 0}};uploadedFiles([...uploadedFiles(),placeholder2]),$$props.onUploadedFilesChange?.(uploadedFiles()),get$3(textareaRef)?.focus()}function handlePromptLoadComplete(placeholderId,result){const promptText=result.messages?.map(msg=>typeof msg.content=="string"?msg.content:msg.content.type===ContentPartType.TEXT?msg.content.text:"").filter(Boolean).join(PROMPT_CONTENT_SEPARATOR);uploadedFiles(uploadedFiles().map(f=>f.id===placeholderId? +{...f,isLoading:!1,textContent:promptText,size:promptText.length,file:new File([promptText],`${f.name}${FileExtensionText.TXT}`,{type:MimeTypeText.PLAIN})}:f)),$$props.onUploadedFilesChange?.(uploadedFiles())}function handlePromptLoadError(placeholderId,error2){uploadedFiles(uploadedFiles().map(f=>f.id===placeholderId?{...f,isLoading:!1,loadError:error2}:f)),$$props.onUploadedFilesChange?.(uploadedFiles())}function handlePromptPickerClose(){set$1(isPromptPickerOpen,!1),set$1(promptSearchQuery,""), +get$3(textareaRef)?.focus()}function handleInlineResourcePickerClose(){set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),get$3(textareaRef)?.focus()}function handleInlineResourceSelect(){value().startsWith(RESOURCE_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),get$3(textareaRef)?.focus()}function handleBrowseResources(){set$1(isInlineResourcePickerOpen,!1),set$1(resourceSearchQuery,""),value().startsWith( +RESOURCE_TRIGGER_PREFIX)&&(value(""),$$props.onValueChange?.("")),set$1(isResourceDialogOpen,!0)}async function handleMicClick(){if(!audioRecorder||!get$3(recordingSupported)){console.warn("Audio recording not supported");return}if(get$3(isRecording)){set$1(isRecording,!1);try{const audioBlob=await audioRecorder.stopRecording(),wavBlob=await convertToWav(audioBlob),audioFile=createAudioFile(wavBlob);$$props.onFilesAdd?.([audioFile])}catch(error2){console.error("Failed to stop recording:",error2)}}else +try{await audioRecorder.startRecording(),set$1(isRecording,!0)}catch(error2){console.error("Failed to start recording:",error2)}}var $$exports={focus:focus2,resetTextareaHeight,openModelSelector,checkModelSelected},fragment=root$1u(),node2=first_child(fragment);bind_this(ChatFormFileInputInvisible(node2,{onFileSelect:handleFileSelect}),$$value=>set$1(fileInputRef,$$value,!0),()=>get$3(fileInputRef));var form=sibling(node2,2),node_1=child(form);bind_this(ChatFormPickers(node_1,{get isPromptPickerOpen(){ +return get$3(isPromptPickerOpen)},get promptSearchQuery(){return get$3(promptSearchQuery)},get isInlineResourcePickerOpen(){return get$3(isInlineResourcePickerOpen)},get resourceSearchQuery(){return get$3(resourceSearchQuery)},onPromptPickerClose:handlePromptPickerClose,onInlineResourcePickerClose:handleInlineResourcePickerClose,onInlineResourceSelect:handleInlineResourceSelect,onPromptLoadStart:handlePromptLoadStart,onPromptLoadComplete:handlePromptLoadComplete,onPromptLoadError:handlePromptLoadError, +onInlineResourceBrowse:handleBrowseResources}),$$value=>set$1(pickersRef,$$value,!0),()=>get$3(pickersRef));var div=sibling(node_1,2),node_2=child(div);{let $0=user_derived(()=>get$3(activeModelId)??void 0);ChatAttachmentsList(node_2,{get attachments(){return attachments()},onFileRemove:handleFileRemove,limitToSingleRow:!0,class:"py-5",style:"scroll-padding: 1rem;",get activeModelId(){return get$3($0)},get uploadedFiles(){return uploadedFiles()},set uploadedFiles($$value){uploadedFiles($$value)}})} +var div_1=sibling(node_2,2),node_3=child(div_1);bind_this(ChatFormTextarea(node_3,{class:"px-5 py-1.5 md:pt-0",onKeydown:handleKeydown,onInput:()=>{handleInput(),$$props.onValueChange?.(value())},get disabled(){return disabled()},get placeholder(){return placeholder()},get value(){return value()},set value($$value){value($$value)}}),$$value=>set$1(textareaRef,$$value,!0),()=>get$3(textareaRef));var node_4=sibling(node_3,2);{var consequent=$$anchor2=>{ChatFormMcpResourcesList($$anchor2,{class:"mb\ +-3",onResourceClick:uri2=>{set$1(preSelectedResourceUri,uri2,!0),set$1(isResourceDialogOpen,!0)}})},d2=user_derived(()=>mcpHasResourceAttachments());if_block(node_4,$$render=>{get$3(d2)&&$$render(consequent)})}var node_5=sibling(node_4,2);{let $0=user_derived(()=>showMcpPromptButton()?()=>set$1(isPromptPickerOpen,!0):void 0);bind_this(ChatFormActions(node_5,{class:"px-3",get canSend(){return get$3(canSubmit)},get disabled(){return disabled()},get isLoading(){return isLoading2()},get isRecording(){ +return get$3(isRecording)},get showAddButton(){return showAddButton()},get showModelSelector(){return showModelSelector()},get uploadedFiles(){return uploadedFiles()},onFileUpload:handleFileUpload,onMicClick:handleMicClick,get onStop(){return $$props.onStop},onSystemPromptClick:()=>$$props.onSystemPromptClick?.({message:value(),files:uploadedFiles()}),get onMcpPromptClick(){return get$3($0)},onMcpResourcesClick:()=>set$1(isResourceDialogOpen,!0)}),$$value=>set$1(chatFormActionsRef,$$value,!0),()=>get$3( +chatFormActionsRef))}reset(div_1),reset(div),reset(form);var node_6=sibling(form,2);return DialogMcpResourcesBrowser(node_6,{get preSelectedUri(){return get$3(preSelectedResourceUri)},onAttach:resource=>{mcpStore.attachResource(resource.uri)},onOpenChange:newOpen=>{newOpen||set$1(preSelectedResourceUri,void 0)},get open(){return get$3(isResourceDialogOpen)},set open($$value){set$1(isResourceDialogOpen,$$value,!0)}}),template_effect(()=>{set_class(form,1,`relative ${className()??""}`),set_class(div, +1,`${INPUT_CLASSES??""} overflow-hidden rounded-3xl backdrop-blur-md ${disabled()?"cursor-not-allowed opacity-60":""}`)}),event("submit",form,event2=>{event2.preventDefault(),!(!get$3(canSubmit)||disabled()||get$3(hasLoadingAttachments))&&$$props.onSubmit?.()}),event("paste",div_1,handlePaste),append($$anchor,fragment),pop($$exports)}function Dropdown_menu_content($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),sideOffset=prop($$props,"sideOffset",3,4),restProps=rest_props( +$$props,["$$slots","$$events","$$legacy","ref","sideOffset","portalProps","class"]);var fragment=comment$2(),node2=first_child(fragment);component(node2,()=>Portal$2,($$anchor2,DropdownMenuPrimitive_Portal)=>{DropdownMenuPrimitive_Portal($$anchor2,spread_props(()=>$$props.portalProps,{children:($$anchor3,$$slotProps)=>{var fragment_1=comment$2(),node_1=first_child(fragment_1);{let $0=user_derived(()=>cn$1("z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dro\ +pdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1.5 text-popover-foreground shadow-md outline-none data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=op\ +en]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20",$$props.class));component(node_1,()=>Dropdown_menu_content$1,($$anchor4,DropdownMenuPrimitive_Content)=>{DropdownMenuPrimitive_Content($$anchor4,spread_props({"data-slot":"dropdown-menu-content",get sideOffset(){return sideOffset()},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor3,fragment_1)},$$slots:{default:!0}}))}),append($$anchor,fragment),pop()}function Dropdown_menu_item($$anchor,$$props){ +push$1($$props,!0);let ref2=prop($$props,"ref",15,null),variant=prop($$props,"variant",3,"default"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","inset","variant"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("relative flex cursor-pointer items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-highlighted:bg-accent data-highlighted:text-accent-foreground data-[disabled]:pointer-events-none data-[disab\ +led]:opacity-50 data-[inset]:pl-8 data-[variant=destructive]:text-destructive data-[variant=destructive]:data-highlighted:bg-destructive/10 data-[variant=destructive]:data-highlighted:text-destructive dark:data-[variant=destructive]:data-highlighted:bg-destructive/20 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground data-[variant=destructive]:*:[svg]:!text-destructive",$$props.class));component(node2,()=>Menu_item, +($$anchor2,DropdownMenuPrimitive_Item)=>{DropdownMenuPrimitive_Item($$anchor2,spread_props({"data-slot":"dropdown-menu-item",get"data-inset"(){return $$props.inset},get"data-variant"(){return variant()},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Dropdown_menu_separator($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events", +"$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("-mx-1 my-1 h-px bg-border/20",$$props.class));component(node2,()=>Menu_separator,($$anchor2,DropdownMenuPrimitive_Separator)=>{DropdownMenuPrimitive_Separator($$anchor2,spread_props({"data-slot":"dropdown-menu-separator",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Dropdown_menu_trigger($$anchor,$$props){ +push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref"]);var fragment=comment$2(),node2=first_child(fragment);component(node2,()=>Menu_trigger,($$anchor2,DropdownMenuPrimitive_Trigger)=>{DropdownMenuPrimitive_Trigger($$anchor2,spread_props({"data-slot":"dropdown-menu-trigger"},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))}),append($$anchor,fragment),pop()}function Dropdown_menu_sub_content($$anchor,$$props){ +push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1.5 text-popover-foreground shadow-md outline-none data-[side=bottom]:slide-\ +in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20",$$props.class));component(node2,()=>Menu_sub_content,($$anchor2,DropdownMenuPrimitive_SubContent)=>{DropdownMenuPrimitive_SubContent( +$$anchor2,spread_props({"data-slot":"dropdown-menu-sub-content",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}var root_1$P=from_html(" ",1);function Dropdown_menu_sub_trigger($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","inset","children"]);var fragment=comment$2(),node2=first_child(fragment);{ +let $0=user_derived(()=>cn$1("flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-highlighted:bg-accent data-highlighted:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 data-[state=open]:bg-accent data-[state=open]:text-accent-foreground [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground",$$props.class));component( +node2,()=>Menu_sub_trigger,($$anchor2,DropdownMenuPrimitive_SubTrigger)=>{DropdownMenuPrimitive_SubTrigger($$anchor2,spread_props({"data-slot":"dropdown-menu-sub-trigger",get"data-inset"(){return $$props.inset},get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)},children:($$anchor3,$$slotProps)=>{var fragment_1=root_1$P(),node_1=first_child(fragment_1);snippet(node_1,()=>$$props.children??noop$3);var node_2=sibling(node_1,2);Chevron_right(node_2, +{class:"ml-auto size-4"}),append($$anchor3,fragment_1)},$$slots:{default:!0}}))})}append($$anchor,fragment),pop()}const Sub=Menu_sub,Root$4=Menu;function useAttachmentMenu(getFlags,getCallbacks,close2){const modalityFlags=user_derived(getFlags),callbacks=user_derived(()=>{const cbs=getCallbacks(),wrap2=fn=>()=>{close2(),fn?.()};return{[AttachmentAction.FILE_UPLOAD]:wrap2(cbs.onFileUpload),[AttachmentAction.SYSTEM_PROMPT_CLICK]:wrap2(cbs.onSystemPromptClick),[AttachmentAction.MCP_PROMPT_CLICK]:wrap2( +cbs.onMcpPromptClick),[AttachmentAction.MCP_RESOURCES_CLICK]:wrap2(cbs.onMcpResourcesClick)}});function isItemEnabled(enabledWhen){return!enabledWhen||enabledWhen==="always"?!0:!!get$3(modalityFlags)[enabledWhen]}function isItemVisible(visibleWhen){return visibleWhen?!!get$3(modalityFlags)[visibleWhen]:!0}function getSystemMessageTooltip(){return page$1.params.id?"Inject custom system message at the beginning of the conversation":"Add custom system message for a new conversation"}return{get callbacks(){ +return get$3(callbacks)},isItemEnabled,isItemVisible,getSystemMessageTooltip}}var root_6$s=from_html(" ",1),root_10$f=from_html(" ",1),root_11$f=from_html("

"),root_8$r=from_html(" ",1),root_16$6=from_html(" ",1),root_17$8=from_html("

PDFs will be converted to text. Image-based PDFs may not work properly.

"),root_13$d=from_html(" ",1),root_22$2=from_html(" ",1),root_23$5=from_html("

"),root_20$5=from_html( +" ",1),root_26$2=from_html(" ",1),root_3$S=from_html(" ",1),root_1$O=from_html(" ",1),root$1t=from_html("
");function ChatFormActionAddDropdown($$anchor,$$props){push$1($$props,!0);let className=prop($$props,"class",3,""),disabled=prop($$props,"disabled",3,!1),hasAudioModality=prop($$props,"hasAudioModality",3,!1),hasVisionModality=prop($$props,"hasVisionModality",3,!1),hasMcpPromptsSupport=prop($$props,"hasMcpPromptsSupport",3, +!1),hasMcpResourcesSupport=prop($$props,"hasMcpResourcesSupport",3,!1),dropdownOpen=state$1(!1);function handleMcpSettingsClick(){set$1(dropdownOpen,!1),$$props.onMcpSettingsClick?.()}const attachmentMenu=useAttachmentMenu(()=>({hasVisionModality:hasVisionModality(),hasAudioModality:hasAudioModality(),hasMcpPromptsSupport:hasMcpPromptsSupport(),hasMcpResourcesSupport:hasMcpResourcesSupport()}),()=>({onFileUpload:$$props.onFileUpload,onSystemPromptClick:$$props.onSystemPromptClick,onMcpPromptClick:$$props. +onMcpPromptClick,onMcpResourcesClick:$$props.onMcpResourcesClick}),()=>{set$1(dropdownOpen,!1)});var div=root$1t(),node2=child(div);component(node2,()=>Root$4,($$anchor2,DropdownMenu_Root)=>{DropdownMenu_Root($$anchor2,{get open(){return get$3(dropdownOpen)},set open($$value){set$1(dropdownOpen,$$value,!0)},children:($$anchor3,$$slotProps)=>{var fragment=root_1$O(),node_1=first_child(fragment);component(node_1,()=>Dropdown_menu_trigger,($$anchor4,DropdownMenu_Trigger)=>{DropdownMenu_Trigger($$anchor4, +{name:"Attach files",get disabled(){return disabled()},children:($$anchor5,$$slotProps2)=>{var fragment_1=comment$2(),node_2=first_child(fragment_1);snippet(node_2,()=>$$props.trigger,()=>({disabled:disabled()})),append($$anchor5,fragment_1)},$$slots:{default:!0}})});var node_3=sibling(node_1,2);component(node_3,()=>Dropdown_menu_content,($$anchor4,DropdownMenu_Content)=>{DropdownMenu_Content($$anchor4,{align:"start",class:"w-48",children:($$anchor5,$$slotProps2)=>{var fragment_2=root_3$S(),node_4=first_child( +fragment_2);each(node_4,17,()=>ATTACHMENT_FILE_ITEMS,item=>item.id,($$anchor6,item)=>{const enabled=user_derived(()=>attachmentMenu.isItemEnabled(get$3(item).enabledWhen));var fragment_3=comment$2(),node_5=first_child(fragment_3);{var consequent=$$anchor7=>{var fragment_4=comment$2(),node_6=first_child(fragment_4);{let $0=user_derived(()=>get$3(item).class??"");component(node_6,()=>Dropdown_menu_item,($$anchor8,DropdownMenu_Item)=>{DropdownMenu_Item($$anchor8,{get class(){return`${get$3($0)??""}\ + flex cursor-pointer items-center gap-2`},onclick:()=>attachmentMenu.callbacks[get$3(item).action](),children:($$anchor9,$$slotProps3)=>{var fragment_5=root_6$s(),node_7=first_child(fragment_5);component(node_7,()=>get$3(item).icon,($$anchor10,item_icon)=>{item_icon($$anchor10,{class:"h-4 w-4"})});var span=sibling(node_7,2),text2=child(span,!0);reset(span),template_effect(()=>set_text(text2,get$3(item).label)),append($$anchor9,fragment_5)},$$slots:{default:!0}})})}append($$anchor7,fragment_4)},consequent_1=$$anchor7=>{ +var fragment_6=comment$2(),node_8=first_child(fragment_6);component(node_8,()=>Root$5,($$anchor8,Tooltip_Root)=>{Tooltip_Root($$anchor8,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor9,$$slotProps3)=>{var fragment_7=root_8$r(),node_9=first_child(fragment_7);component(node_9,()=>Tooltip_trigger,($$anchor10,Tooltip_Trigger)=>{Tooltip_Trigger($$anchor10,{class:"w-full",children:($$anchor11,$$slotProps4)=>{var fragment_8=comment$2(),node_10=first_child(fragment_8);{let $0=user_derived( +()=>get$3(item).class??"");component(node_10,()=>Dropdown_menu_item,($$anchor12,DropdownMenu_Item_1)=>{DropdownMenu_Item_1($$anchor12,{get class(){return`${get$3($0)??""} flex cursor-pointer items-center gap-2`},disabled:!0,children:($$anchor13,$$slotProps5)=>{var fragment_9=root_10$f(),node_11=first_child(fragment_9);component(node_11,()=>get$3(item).icon,($$anchor14,item_icon_1)=>{item_icon_1($$anchor14,{class:"h-4 w-4"})});var span_1=sibling(node_11,2),text_1=child(span_1,!0);reset(span_1),template_effect( +()=>set_text(text_1,get$3(item).label)),append($$anchor13,fragment_9)},$$slots:{default:!0}})})}append($$anchor11,fragment_8)},$$slots:{default:!0}})});var node_12=sibling(node_9,2);component(node_12,()=>Tooltip_content,($$anchor10,Tooltip_Content)=>{Tooltip_Content($$anchor10,{side:"right",children:($$anchor11,$$slotProps4)=>{var p2=root_11$f(),text_2=child(p2,!0);reset(p2),template_effect(()=>set_text(text_2,get$3(item).disabledTooltip)),append($$anchor11,p2)},$$slots:{default:!0}})}),append($$anchor9, +fragment_7)},$$slots:{default:!0}})}),append($$anchor7,fragment_6)};if_block(node_5,$$render=>{get$3(enabled)?$$render(consequent):get$3(item).disabledTooltip&&$$render(consequent_1,1)})}append($$anchor6,fragment_3)});var node_13=sibling(node_4,2);{var consequent_3=$$anchor6=>{var fragment_10=comment$2(),node_14=first_child(fragment_10);component(node_14,()=>Root$5,($$anchor7,Tooltip_Root_1)=>{Tooltip_Root_1($$anchor7,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor8,$$slotProps3)=>{ +var fragment_11=root_13$d(),node_15=first_child(fragment_11);component(node_15,()=>Tooltip_trigger,($$anchor9,Tooltip_Trigger_1)=>{Tooltip_Trigger_1($$anchor9,{class:"w-full",children:($$anchor10,$$slotProps4)=>{var fragment_12=comment$2(),node_16=first_child(fragment_12);component(node_16,()=>Dropdown_menu_item,($$anchor11,DropdownMenu_Item_2)=>{DropdownMenu_Item_2($$anchor11,{class:"flex cursor-pointer items-center gap-2",get onclick(){return attachmentMenu.callbacks.onFileUpload},children:($$anchor12,$$slotProps5)=>{ +const pdfItem=user_derived(()=>ATTACHMENT_FILE_ITEMS.find(i=>i.id===AttachmentMenuItemId.PDF));var fragment_13=comment$2(),node_17=first_child(fragment_13);{var consequent_2=$$anchor13=>{var fragment_14=root_16$6(),node_18=first_child(fragment_14);component(node_18,()=>get$3(pdfItem).icon,($$anchor14,pdfItem_icon)=>{pdfItem_icon($$anchor14,{class:"h-4 w-4"})});var span_2=sibling(node_18,2),text_3=child(span_2,!0);reset(span_2),template_effect(()=>set_text(text_3,get$3(pdfItem).label)),append($$anchor13, +fragment_14)};if_block(node_17,$$render=>{get$3(pdfItem)&&$$render(consequent_2)})}append($$anchor12,fragment_13)},$$slots:{default:!0}})}),append($$anchor10,fragment_12)},$$slots:{default:!0}})});var node_19=sibling(node_15,2);component(node_19,()=>Tooltip_content,($$anchor9,Tooltip_Content_1)=>{Tooltip_Content_1($$anchor9,{side:"right",children:($$anchor10,$$slotProps4)=>{var p_1=root_17$8();append($$anchor10,p_1)},$$slots:{default:!0}})}),append($$anchor8,fragment_11)},$$slots:{default:!0}})}), +append($$anchor6,fragment_10)},d2=user_derived(()=>!attachmentMenu.isItemEnabled("hasVisionModality"));if_block(node_13,$$render=>{get$3(d2)&&$$render(consequent_3)})}var node_20=sibling(node_13,2);component(node_20,()=>Dropdown_menu_separator,($$anchor6,DropdownMenu_Separator)=>{DropdownMenu_Separator($$anchor6,{})});var node_21=sibling(node_20,2);each(node_21,17,()=>ATTACHMENT_EXTRA_ITEMS,item=>item.id,($$anchor6,item)=>{var fragment_15=comment$2(),node_22=first_child(fragment_15);{var consequent_4=$$anchor7=>{ +var fragment_16=comment$2(),node_23=first_child(fragment_16);component(node_23,()=>Root$5,($$anchor8,Tooltip_Root_2)=>{Tooltip_Root_2($$anchor8,{get delayDuration(){return TOOLTIP_DELAY_DURATION},children:($$anchor9,$$slotProps3)=>{var fragment_17=root_20$5(),node_24=first_child(fragment_17);component(node_24,()=>Tooltip_trigger,($$anchor10,Tooltip_Trigger_2)=>{Tooltip_Trigger_2($$anchor10,{class:"w-full",children:($$anchor11,$$slotProps4)=>{var fragment_18=comment$2(),node_25=first_child(fragment_18); +component(node_25,()=>Dropdown_menu_item,($$anchor12,DropdownMenu_Item_3)=>{DropdownMenu_Item_3($$anchor12,{class:"flex cursor-pointer items-center gap-2",onclick:()=>attachmentMenu.callbacks[get$3(item).action](),children:($$anchor13,$$slotProps5)=>{var fragment_19=root_22$2(),node_26=first_child(fragment_19);component(node_26,()=>get$3(item).icon,($$anchor14,item_icon_2)=>{item_icon_2($$anchor14,{class:"h-4 w-4"})});var span_3=sibling(node_26,2),text_4=child(span_3,!0);reset(span_3),template_effect( +()=>set_text(text_4,get$3(item).label)),append($$anchor13,fragment_19)},$$slots:{default:!0}})}),append($$anchor11,fragment_18)},$$slots:{default:!0}})});var node_27=sibling(node_24,2);component(node_27,()=>Tooltip_content,($$anchor10,Tooltip_Content_2)=>{Tooltip_Content_2($$anchor10,{side:"right",children:($$anchor11,$$slotProps4)=>{var p_2=root_23$5(),text_5=child(p_2,!0);reset(p_2),template_effect($0=>set_text(text_5,$0),[()=>attachmentMenu.getSystemMessageTooltip()]),append($$anchor11,p_2)}, +$$slots:{default:!0}})}),append($$anchor9,fragment_17)},$$slots:{default:!0}})}),append($$anchor7,fragment_16)};if_block(node_22,$$render=>{get$3(item).id===AttachmentMenuItemId.SYSTEM_MESSAGE&&$$render(consequent_4)})}append($$anchor6,fragment_15)});var node_28=sibling(node_21,2);ChatFormActionAddToolsSubmenu(node_28,{});var node_29=sibling(node_28,2);ChatFormActionAddMcpServersSubmenu(node_29,{onMcpSettingsClick:handleMcpSettingsClick});var node_30=sibling(node_29,2);each(node_30,17,()=>ATTACHMENT_MCP_ITEMS, +item=>item.id,($$anchor6,item)=>{var fragment_20=comment$2(),node_31=first_child(fragment_20);{var consequent_5=$$anchor7=>{var fragment_21=comment$2(),node_32=first_child(fragment_21);component(node_32,()=>Dropdown_menu_item,($$anchor8,DropdownMenu_Item_4)=>{DropdownMenu_Item_4($$anchor8,{class:"flex cursor-pointer items-center gap-2",onclick:()=>attachmentMenu.callbacks[get$3(item).action](),children:($$anchor9,$$slotProps3)=>{var fragment_22=root_26$2(),node_33=first_child(fragment_22);component( +node_33,()=>get$3(item).icon,($$anchor10,item_icon_3)=>{item_icon_3($$anchor10,{class:"h-4 w-4"})});var span_4=sibling(node_33,2),text_6=child(span_4,!0);reset(span_4),template_effect(()=>set_text(text_6,get$3(item).label)),append($$anchor9,fragment_22)},$$slots:{default:!0}})}),append($$anchor7,fragment_21)},d_12=user_derived(()=>attachmentMenu.isItemVisible(get$3(item).visibleWhen));if_block(node_31,$$render=>{get$3(d_12)&&$$render(consequent_5)})}append($$anchor6,fragment_20)}),append($$anchor5, +fragment_2)},$$slots:{default:!0}})}),append($$anchor3,fragment)},$$slots:{default:!0}})}),reset(div),template_effect(()=>set_class(div,1,`flex items-center gap-1 ${className()??""}`)),append($$anchor,div),pop()}function Sheet_overlay($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("fixed inset-0 z-50 bg-black/5\ +0 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:fill-mode-forwards data-[state=open]:animate-in data-[state=open]:fade-in-0",$$props.class));component(node2,()=>Dialog_overlay$1,($$anchor2,SheetPrimitive_Overlay)=>{SheetPrimitive_Overlay($$anchor2,spread_props({"data-slot":"sheet-overlay",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}const sheetVariants=tv({base:`bor\ +der-border/30 dark:border-border/20 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fill-mode-forwards fixed z-50 flex flex-col gap-4 shadow-sm transition ease-in-out data-[state=closed]:duration-300 data-[state=open]:duration-500 ${PANEL_CLASSES}`,variants:{side:{top:"data-[state=closed]:slide-out-to-top data-[state=open]:slide-in-from-top inset-x-0 top-0 h-auto border-b",bottom:"data-[state=closed]:slide-out-to-bottom data-[state=open]:slide-in-from-bottom inse\ +t-x-0 bottom-0 h-auto border-t",left:"data-[state=closed]:slide-out-to-left data-[state=open]:slide-in-from-left inset-y-0 left-0 h-full w-3/4 border-r sm:max-w-sm",right:"data-[state=closed]:slide-out-to-right data-[state=open]:slide-in-from-right inset-y-0 right-0 h-full w-3/4 border-l sm:max-w-sm"}},defaultVariants:{side:"right"}});var root_3$R=from_html(' Close',1),root_2$12=from_html(" ",1),root_1$N=from_html(" ",1);function Sheet_content($$anchor,$$props){ +push$1($$props,!0);let ref2=prop($$props,"ref",15,null),side=prop($$props,"side",3,"right"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","side","portalProps","children"]);var fragment=comment$2(),node2=first_child(fragment);component(node2,()=>Portal$2,($$anchor2,SheetPrimitive_Portal)=>{SheetPrimitive_Portal($$anchor2,spread_props(()=>$$props.portalProps,{children:($$anchor3,$$slotProps)=>{var fragment_1=root_1$N(),node_1=first_child(fragment_1);Sheet_overlay(node_1, +{});var node_2=sibling(node_1,2);{let $0=user_derived(()=>cn$1(sheetVariants({side:side()}),$$props.class));component(node_2,()=>Dialog_content$1,($$anchor4,SheetPrimitive_Content)=>{SheetPrimitive_Content($$anchor4,spread_props({"data-slot":"sheet-content",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)},children:($$anchor5,$$slotProps2)=>{var fragment_2=root_2$12(),node_3=first_child(fragment_2);snippet(node_3,()=>$$props.children??noop$3); +var node_4=sibling(node_3,2);component(node_4,()=>Dialog_close,($$anchor6,SheetPrimitive_Close)=>{SheetPrimitive_Close($$anchor6,{class:"absolute top-4 right-4 rounded-xs opacity-70 ring-offset-background transition-opacity hover:opacity-100 focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 focus-visible:outline-hidden disabled:pointer-events-none",children:($$anchor7,$$slotProps3)=>{var fragment_3=root_3$R(),node_5=first_child(fragment_3);X(node_5,{class:"size-4"}),next$1( +2),append($$anchor7,fragment_3)},$$slots:{default:!0}})}),append($$anchor5,fragment_2)},$$slots:{default:!0}}))})}append($$anchor3,fragment_1)},$$slots:{default:!0}}))}),append($$anchor,fragment),pop()}var root$1s=from_html("
");function Sheet_header($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class","children"]);var div=root$1s();attribute_effect(div,$0=>({"data-slot":"sheet-header",class:$0, +...restProps}),[()=>cn$1("flex flex-col gap-1.5 p-4",$$props.class)]);var node2=child(div);snippet(node2,()=>$$props.children??noop$3),reset(div),bind_this(div,$$value=>ref2($$value),()=>ref2()),append($$anchor,div),pop()}function Sheet_title($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(),node2=first_child(fragment);{let $0=user_derived(()=>cn$1("font-semibold text-fo\ +reground",$$props.class));component(node2,()=>Dialog_title$1,($$anchor2,SheetPrimitive_Title)=>{SheetPrimitive_Title($$anchor2,spread_props({"data-slot":"sheet-title",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}function Sheet_description($$anchor,$$props){push$1($$props,!0);let ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","ref","class"]);var fragment=comment$2(), +node2=first_child(fragment);{let $0=user_derived(()=>cn$1("text-sm text-muted-foreground",$$props.class));component(node2,()=>Dialog_description$1,($$anchor2,SheetPrimitive_Description)=>{SheetPrimitive_Description($$anchor2,spread_props({"data-slot":"sheet-description",get class(){return get$3($0)}},()=>restProps,{get ref(){return ref2()},set ref($$value){ref2($$value)}}))})}append($$anchor,fragment),pop()}const Root$3=Dialog;var root_3$Q=from_html(" ",1),root_7$r=from_html(''),root_10$e=from_html(''),root_11$e=from_html("

"),root_9$k=from_html(" ",1),root_15$9=from_html(''),root_16$5=from_html("

PDFs will be converted to text. Image-based PDFs may not work properly.

"),root_14$5=from_html(" ",1),root_20$4=from_html(''),root_21$3=from_html("

"), +root_19$9=from_html(" ",1),root_23$4=from_html(''),root_2$11=from_html(' ',1),root_1$M=from_html(" ",1),root$1r=from_html("
");function ChatFormActionAddSheet($$anchor,$$props){ push$1($$props,!0);let className=prop($$props,"class",3,""),disabled=prop($$props,"disabled",3,!1),hasAudioModality=prop($$props,"hasAudioModality",3,!1),hasVisionModality=prop($$props,"hasVisionModality",3,!1),hasMcpPromptsSupport=prop($$props,"hasMcpPromptsSupport",3,!1),hasMcpResourcesSupport=prop($$props,"hasMcpResourcesSupport",3,!1),sheetOpen=state$1(!1);const attachmentMenu=useAttachmentMenu(()=>({hasVisionModality:hasVisionModality(),hasAudioModality:hasAudioModality(),hasMcpPromptsSupport:hasMcpPromptsSupport(), hasMcpResourcesSupport:hasMcpResourcesSupport()}),()=>({onFileUpload:$$props.onFileUpload,onSystemPromptClick:$$props.onSystemPromptClick,onMcpPromptClick:$$props.onMcpPromptClick,onMcpResourcesClick:$$props.onMcpResourcesClick}),()=>{set$1(sheetOpen,!1)}),sheetItemClass="flex w-full items-center gap-3 rounded-md px-3 py-2.5 text-left text-sm transition-colors hover:bg-accent active:bg-accent disabled:cursor-not-allowed disabled:opacity-50";var div=root$1r(),node2=child(div);component(node2,()=>Root$3, ($$anchor2,Sheet_Root)=>{Sheet_Root($$anchor2,{get open(){return get$3(sheetOpen)},set open($$value){set$1(sheetOpen,$$value,!0)},children:($$anchor3,$$slotProps)=>{var fragment=root_1$M(),node_1=first_child(fragment);snippet(node_1,()=>$$props.trigger,()=>({disabled:disabled(),onclick:()=>set$1(sheetOpen,!0)}));var node_2=sibling(node_1,2);component(node_2,()=>Sheet_content,($$anchor4,Sheet_Content)=>{Sheet_Content($$anchor4,{side:"bottom",class:"max-h-[85vh] gap-0 overflow-y-auto",children:($$anchor5,$$slotProps2)=>{ diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 0675ce31d06..73de0d3bba1 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse( inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]); inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); + const bool continue_final_message = json_value(body, "continue_final_message", false); + if (continue_final_message && inputs.add_generation_prompt) { + throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true."); + } inputs.reasoning_format = opt.reasoning_format; if (body.contains("reasoning_format")) { inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get()); @@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse( // if the assistant message appears at the end of list, we do not add end-of-turn token // for ex. this can be useful to modify the reasoning process in reasoning models - bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant; + // continue_final_message is the explicit opt in alias from the vLLM/transformers API, + // equivalent to the prefill_assistant heuristic + bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" + && (continue_final_message || opt.prefill_assistant); common_chat_msg last_message; if (prefill_assistant_message) { last_message = inputs.messages.back(); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index edef0a93b49..243e4160578 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill): assert res.body["__verbose"]["prompt"] == f" <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}" +def test_chat_template_continue_final_message_vllm_compat(): + """continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic. + Both must produce the same prompt.""" + global server + server.chat_template = "llama3" + server.debug = True + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "add_generation_prompt": False, + "continue_final_message": True, + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + {"role": "assistant", "content": "Whill"}, + ] + }) + assert res.status_code == 200 + assert "__verbose" in res.body + assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill" + + +def test_chat_template_continue_final_message_mutual_exclusion(): + """add_generation_prompt and continue_final_message both set to true must be rejected""" + global server + server.chat_template = "llama3" + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "add_generation_prompt": True, + "continue_final_message": True, + "messages": [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello"}, + ] + }) + assert res.status_code == 400 + + def test_apply_chat_template(): global server server.chat_template = "command-r" diff --git a/tools/server/webui/src/lib/services/chat.service.ts b/tools/server/webui/src/lib/services/chat.service.ts index a26de0d5d6e..587b48b6f19 100644 --- a/tools/server/webui/src/lib/services/chat.service.ts +++ b/tools/server/webui/src/lib/services/chat.service.ts @@ -130,7 +130,8 @@ export class ChatService { timings_per_token, // Config options disableReasoningParsing, - excludeReasoningFromContext + excludeReasoningFromContext, + continueFinalMessage } = options; const normalizedMessages: ApiChatMessageData[] = messages @@ -209,6 +210,11 @@ export class ChatService { ? ReasoningFormat.NONE : ReasoningFormat.AUTO; + if (continueFinalMessage) { + requestBody.continue_final_message = true; + requestBody.add_generation_prompt = false; + } + if (temperature !== undefined) requestBody.temperature = temperature; if (max_tokens !== undefined) { // Set max_tokens to -1 (infinite) when explicitly configured as 0 or null diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts index 7b4a4e04293..7c34579ca56 100644 --- a/tools/server/webui/src/lib/stores/chat.svelte.ts +++ b/tools/server/webui/src/lib/stores/chat.svelte.ts @@ -1301,6 +1301,7 @@ class ChatStore { contextWithContinue, { ...this.getApiOptions(), + continueFinalMessage: true, onChunk: (chunk: string) => { appendedContent += chunk; hasReceivedContent = true; diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts index c1a02342357..63a464cf194 100644 --- a/tools/server/webui/src/lib/types/api.d.ts +++ b/tools/server/webui/src/lib/types/api.d.ts @@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest { // Custom parameters (JSON string) custom?: Record; timings_per_token?: boolean; + // Continuation control (vLLM compat) + add_generation_prompt?: boolean; + continue_final_message?: boolean; } export interface ApiChatCompletionToolCallFunctionDelta { diff --git a/tools/server/webui/src/lib/types/settings.d.ts b/tools/server/webui/src/lib/types/settings.d.ts index 8f5f164bd72..1ab7a7e5d54 100644 --- a/tools/server/webui/src/lib/types/settings.d.ts +++ b/tools/server/webui/src/lib/types/settings.d.ts @@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions { // Custom parameters custom?: string; timings_per_token?: boolean; + // Continuation control (vLLM compat), opt in to the explicit continue final message flag + continueFinalMessage?: boolean; // Callbacks onChunk?: (chunk: string) => void; onReasoningChunk?: (chunk: string) => void; From ec562eb673ab6fef464d0cf976bcb57925b5eff0 Mon Sep 17 00:00:00 2001 From: shaofeiqi Date: Wed, 13 May 2026 11:57:31 -0700 Subject: [PATCH 4/5] opencl: add q5_0 and q5_1 MoE for Adreno (#22985) * opencl: add q5_0 moe support * opencl: add q5_1 moe support * opencl: avoid potential leak * opencl: suppress unused var warning when building for non-Adreno --------- Co-authored-by: Li He --- ggml/src/ggml-opencl/CMakeLists.txt | 4 + ggml/src/ggml-opencl/ggml-opencl.cpp | 1019 +++++++++++++++-- ggml/src/ggml-opencl/kernels/cvt.cl | 204 ++++ .../kernels/gemm_moe_q5_0_f32_ns.cl | 256 +++++ .../kernels/gemm_moe_q5_1_f32_ns.cl | 258 +++++ .../kernels/gemv_moe_q5_0_f32_ns.cl | 119 ++ .../kernels/gemv_moe_q5_1_f32_ns.cl | 121 ++ 7 files changed, 1914 insertions(+), 67 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 0b39c011371..c6aba608736 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -106,6 +106,10 @@ set(GGML_OPENCL_KERNELS gemv_moe_q4_0_f32_ns gemm_moe_q4_1_f32_ns gemv_moe_q4_1_f32_ns + gemm_moe_q5_0_f32_ns + gemv_moe_q5_0_f32_ns + gemm_moe_q5_1_f32_ns + gemv_moe_q5_1_f32_ns gemm_moe_mxfp4_f32 gemv_moe_mxfp4_f32 gemm_moe_mxfp4_f32_ns diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 248124c2896..0e511592d53 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -556,6 +556,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns; cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1; cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns; + cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns; + cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns; cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; @@ -615,6 +617,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_timestep_embedding; cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns; cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns; + cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns; + cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns; cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32; cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns; cl_kernel kernel_moe_reorder_b; @@ -973,6 +977,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err)); @@ -2995,6 +3003,74 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gemv_moe_q5_0_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q5_0_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q5_0_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q5_0_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_0_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q5_0_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q5_0_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q5_0_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q5_0_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_0_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q5_1_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q5_1_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q5_1_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q5_1_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_1_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q5_1_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q5_1_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q5_1_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q5_1_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_1_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // gemv_moe_mxfp4_f32_ns { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -3852,6 +3928,122 @@ struct ggml_tensor_extra_cl_q4_1 { } }; +struct ggml_tensor_extra_cl_q5_0 { + // Quantized values. + cl_mem qs = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem qs_img = nullptr; + // 5-th bit values. + cl_mem qh = nullptr; + // 5-th bit values in image1d_buffer_t. + cl_mem qh_img = nullptr; + // Scales. + cl_mem d = nullptr; + // Scales in image1d_buffer_t. + cl_mem d_img = nullptr; + // Size of quantized values. + size_t size_qs = 0; + // Size of 5-th bit values. + size_t size_qh = 0; + // Size of scales. + size_t size_d = 0; + + ~ggml_tensor_extra_cl_q5_0() { + reset(); + } + + void reset() { + if (qs != nullptr) { + CL_CHECK(clReleaseMemObject(qs)); + qs = nullptr; + } + if (qh != nullptr) { + CL_CHECK(clReleaseMemObject(qh)); + qh = nullptr; + } + if (d != nullptr) { + CL_CHECK(clReleaseMemObject(d)); + d = nullptr; + } + if (qs_img != nullptr) { + CL_CHECK(clReleaseMemObject(qs_img)); + qs_img = nullptr; + } + + qh_img = nullptr; + d_img = nullptr; + size_qs = 0; + size_qh = 0; + size_d = 0; + } +}; + +struct ggml_tensor_extra_cl_q5_1 { + // Quantized values. + cl_mem qs = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem qs_img = nullptr; + // 5-th bit values. + cl_mem qh = nullptr; + // 5-th bit values in image1d_buffer_t. + cl_mem qh_img = nullptr; + // Scales. + cl_mem d = nullptr; + // Scales in image1d_buffer_t. + cl_mem d_img = nullptr; + // Min + cl_mem m = nullptr; + // Min in image1d_buffer_t. + cl_mem m_img = nullptr; + // Size of quantized values. + size_t size_qs = 0; + // Size of 5-th bit values. + size_t size_qh = 0; + // Size of scales. + size_t size_d = 0; + // Size of min values. + size_t size_m = 0; + + ~ggml_tensor_extra_cl_q5_1() { + reset(); + } + + void reset() { + // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer. + // They must be properly released so that the original buffer can be + // properly released to avoid memory leak. + if (qs != nullptr) { + CL_CHECK(clReleaseMemObject(qs)); + qs = nullptr; + } + if (qh != nullptr) { + CL_CHECK(clReleaseMemObject(qh)); + qh = nullptr; + } + if (d != nullptr) { + CL_CHECK(clReleaseMemObject(d)); + d = nullptr; + } + if (m != nullptr) { + CL_CHECK(clReleaseMemObject(m)); + m = nullptr; + } + if (qs_img != nullptr) { + CL_CHECK(clReleaseMemObject(qs_img)); + qs_img = nullptr; + } + // qh_img, d_img, and m_img are not currently allocated separately. + // TODO: initialize them for non SMALL_PATH path, or remove them. + qh_img = nullptr; + d_img = nullptr; + m_img = nullptr; + size_qs = 0; + size_qh = 0; + size_d = 0; + size_m = 0; + } +}; + struct ggml_tensor_extra_cl_mxfp4 { // Quantized values. cl_mem q = nullptr; @@ -4506,7 +4698,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te } // q4_0, q8_0 and mxfp4 have general MUL_MAT_ID support, // the quantizations here currently do not - they are only supported by Adreno with certain shapes - if (op->src[0]->type == GGML_TYPE_Q4_1) { + if (op->src[0]->type == GGML_TYPE_Q4_1 || + op->src[0]->type == GGML_TYPE_Q5_0 || + op->src[0]->type == GGML_TYPE_Q5_1) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (op->src[1]->type == GGML_TYPE_F32) { return use_adreno_moe_kernels(backend_ctx, op->src[0]) @@ -4692,6 +4886,18 @@ struct ggml_backend_opencl_buffer_context { for (ggml_tensor_extra_cl_q4_1 * e : temp_tensor_extras_q4_1_in_use) { delete e; } + for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0) { + delete e; + } + for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0_in_use) { + delete e; + } + for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1) { + delete e; + } + for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1_in_use) { + delete e; + } for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) { delete e; } @@ -4775,6 +4981,36 @@ struct ggml_backend_opencl_buffer_context { return extra; } + ggml_tensor_extra_cl_q5_0 * ggml_opencl_alloc_temp_tensor_extra_q5_0() { + ggml_tensor_extra_cl_q5_0 * extra; + if (temp_tensor_extras_q5_0.empty()) { + extra = new ggml_tensor_extra_cl_q5_0(); + } else { + extra = temp_tensor_extras_q5_0.back(); + temp_tensor_extras_q5_0.pop_back(); + } + + temp_tensor_extras_q5_0_in_use.push_back(extra); + + extra->reset(); + return extra; + } + + ggml_tensor_extra_cl_q5_1 * ggml_opencl_alloc_temp_tensor_extra_q5_1() { + ggml_tensor_extra_cl_q5_1 * extra; + if (temp_tensor_extras_q5_1.empty()) { + extra = new ggml_tensor_extra_cl_q5_1(); + } else { + extra = temp_tensor_extras_q5_1.back(); + temp_tensor_extras_q5_1.pop_back(); + } + + temp_tensor_extras_q5_1_in_use.push_back(extra); + + extra->reset(); + return extra; + } + ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() { ggml_tensor_extra_cl_mxfp4 * extra; if (temp_tensor_extras_mxfp4.empty()) { @@ -4881,6 +5117,16 @@ struct ggml_backend_opencl_buffer_context { } temp_tensor_extras_q4_1_in_use.clear(); + for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0_in_use) { + temp_tensor_extras_q5_0.push_back(e); + } + temp_tensor_extras_q5_0_in_use.clear(); + + for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1_in_use) { + temp_tensor_extras_q5_1.push_back(e); + } + temp_tensor_extras_q5_1_in_use.clear(); + for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) { temp_tensor_extras_mxfp4.push_back(e); } @@ -4923,6 +5169,10 @@ struct ggml_backend_opencl_buffer_context { std::vector temp_tensor_extras_q4_0_in_use; std::vector temp_tensor_extras_q4_1; std::vector temp_tensor_extras_q4_1_in_use; + std::vector temp_tensor_extras_q5_0; + std::vector temp_tensor_extras_q5_0_in_use; + std::vector temp_tensor_extras_q5_1; + std::vector temp_tensor_extras_q5_1_in_use; std::vector temp_tensor_extras_mxfp4; std::vector temp_tensor_extras_mxfp4_in_use; std::vector temp_tensor_extras_q8_0; @@ -5286,17 +5536,18 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, #endif // GGML_OPENCL_USE_ADRENO_KERNELS return; } - if (tensor->type == GGML_TYPE_MXFP4) { + if (tensor->type == GGML_TYPE_Q5_0) { ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); // Allocate the new extra and create aliases from the original. ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4(); + ggml_tensor_extra_cl_q5_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q5_0(); - size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char); - size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; - GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); + size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_qs = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(int32_t); + GGML_ASSERT(size_d + size_qs + size_qh == ggml_nbytes(tensor) && "Incorrect tensor size"); cl_int err; cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, @@ -5306,40 +5557,48 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL)); - // The original tensor memory is divided into scales and quants, i.e., - // we first store scales, then quants. cl_buffer_region region; // Create subbuffer for scales. region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); - region.size = size_e; - extra->e = clCreateSubBuffer( + region.size = size_d; + extra->d = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); auto previous_origin = region.origin; - // Create subbuffer for quants. - region.origin = align_to(previous_origin + size_e, backend_ctx->alignment); - region.size = size_q; - extra->q = clCreateSubBuffer( + // Create subbuffer for qh. + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); + region.size = size_qh; + extra->qh = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Create subbuffer for qs. + region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment); + region.size = size_qs; + extra->qs = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); #ifdef GGML_OPENCL_USE_ADRENO_KERNELS - // Adreno moe mxfp4 kernel needs special transpose and unshuffling + // Adreno moe q5_0 kernel needs special transpose and unshuffling if (use_adreno_moe_kernels(backend_ctx, tensor)) { - cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans4_ns; + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_trans4_ns; int ne00 = tensor->ne[0]; int ne01 = tensor->ne[1]; int ne02 = tensor->ne[2]; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 32), static_cast(ne02)}; size_t local_work_size[3] = {64, 2, 1}; @@ -5348,61 +5607,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); CL_CHECK(clWaitForEvents(1, &evt)); CL_CHECK(clReleaseMemObject(data_device)); - tensor->extra = extra; // Create image for Q - cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; - cl_image_desc img_desc_q = { + cl_image_format img_format_qs = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_qs = { CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ggml_nelements(tensor) / 8), 0, 0, 0, 0, 0, 0, 0, - { extra->q } + { extra->qs } }; - extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + extra->qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_qs, &img_desc_qs, NULL, &err); tensor->extra = extra; return; } - #endif // GGML_OPENCL_USE_ADRENO_KERNELS - cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4; - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e)); - - size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; - size_t local_work_size[3] = {64, 1, 1}; - - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - CL_CHECK(clWaitForEvents(1, &evt)); - CL_CHECK(clReleaseMemObject(data_device)); - - // Create image for Q - cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32}; - cl_image_desc img_desc_q = { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - static_cast(ggml_nelements(tensor)/32*2), - 0, 0, 0, 0, 0, 0, 0, - { extra->q } - }; - extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); - tensor->extra = extra; - return; } - if (tensor->type == GGML_TYPE_Q8_0) { + if (tensor->type == GGML_TYPE_Q5_1) { ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); // Allocate the new extra and create aliases from the original. ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0(); + ggml_tensor_extra_cl_q5_1 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q5_1(); size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); - size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char)); - GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); + size_t size_m = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_qs = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(int32_t); + GGML_ASSERT(size_d + size_m + size_qs + size_qh == ggml_nbytes(tensor) && "Incorrect tensor size"); cl_int err; cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, @@ -5412,10 +5646,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, queue, data_device, CL_TRUE, 0, ggml_nbytes(tensor), data, 0, NULL, NULL)); - // The original tensor memory is divided into scales and quants, i.e., - // we first store scales, then quants. cl_buffer_region region; + // The original tensor memory is divided into scales and quants, i.e., + // we first store scales, mins, then quants. // Create subbuffer for scales. region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_d; @@ -5425,22 +5659,227 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK(err); auto previous_origin = region.origin; - // Create subbuffer for quants. + // Create subbuffer for mins. region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); - region.size = size_q; - extra->q = clCreateSubBuffer( + region.size = size_m; + extra->m = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); + previous_origin = region.origin; - cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0; + // Create subbuffer for qh. + region.origin = align_to(previous_origin + size_m, backend_ctx->alignment); + region.size = size_qh; + extra->qh = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + // Create subbuffer for qs. + region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment); + region.size = size_qs; + extra->qs = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); - size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; - size_t local_work_size[] = {64, 1, 1}; +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // Adreno moe q5_1 kernel needs special transpose and unshuffling + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 32), static_cast(ne02)}; + size_t local_work_size[3] = {64, 2, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + // Create image for Q + cl_image_format img_format_qs = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_qs = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->qs } + }; + extra->qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_qs, &img_desc_qs, NULL, &err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + return; + } + if (tensor->type == GGML_TYPE_MXFP4) { + ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; + GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); + + // Allocate the new extra and create aliases from the original. + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4(); + + size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char); + size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + CL_CHECK(clEnqueueWriteBuffer( + queue, data_device, CL_TRUE, 0, + ggml_nbytes(tensor), data, 0, NULL, NULL)); + + // The original tensor memory is divided into scales and quants, i.e., + // we first store scales, then quants. + cl_buffer_region region; + + // Create subbuffer for scales. + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = size_e; + extra->e = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + auto previous_origin = region.origin; + + // Create subbuffer for quants. + region.origin = align_to(previous_origin + size_e, backend_ctx->alignment); + region.size = size_q; + extra->q = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // Adreno moe mxfp4 kernel needs special transpose and unshuffling + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 32), static_cast(ne02)}; + size_t local_work_size[3] = {64, 2, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + tensor->extra = extra; + + // Create image for Q + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + tensor->extra = extra; + + return; + } + +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e)); + + size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[3] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + // Create image for Q + cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor)/32*2), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + tensor->extra = extra; + + return; + } + if (tensor->type == GGML_TYPE_Q8_0) { + ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; + GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); + + // Allocate the new extra and create aliases from the original. + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0(); + + size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char)); + GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + CL_CHECK(clEnqueueWriteBuffer( + queue, data_device, CL_TRUE, 0, + ggml_nbytes(tensor), data, 0, NULL, NULL)); + + // The original tensor memory is divided into scales and quants, i.e., + // we first store scales, then quants. + cl_buffer_region region; + + // Create subbuffer for scales. + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = size_d; + extra->d = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + auto previous_origin = region.origin; + + // Create subbuffer for quants. + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); + region.size = size_q; + extra->q = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {64, 1, 1}; cl_event evt; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); @@ -6109,6 +6548,89 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); return; } + if (tensor->type == GGML_TYPE_Q5_0) { + ggml_tensor_extra_cl_q5_0 * extra = (ggml_tensor_extra_cl_q5_0 *)tensor->extra; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + // TODO: use ggml_cl_buffer to manage this temporary buffer + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 32), static_cast(ne02)}; + size_t local_work_size[3] = {64, 2, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // TODO: normal q5_0 + (void) extra; + return; + } + if (tensor->type == GGML_TYPE_Q5_1) { + ggml_tensor_extra_cl_q5_1 * extra = (ggml_tensor_extra_cl_q5_1 *)tensor->extra; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + // TODO: use ggml_cl_buffer to manage this temporary buffer + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 32), static_cast(ne02)}; + size_t local_work_size[3] = {64, 2, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // TODO: normal q5_1 + (void) extra; + return; + } if (tensor->type == GGML_TYPE_MXFP4) { ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra; @@ -13209,10 +13731,17 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, #ifdef GGML_OPENCL_SOA_Q ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; + ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; + ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; #endif + // TODO: general MoE for the following types + (void)extra0_q4_1; + (void)extra0_q5_0; + (void)extra0_q5_1; + const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; @@ -13540,8 +14069,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, } else { // for gemm kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns; - if (strstr(src0->name, "as") != NULL) { + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; } cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; @@ -13649,6 +14181,359 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, } return; } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_0: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_0_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q5_0_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1; + cl_image_desc image_desc_buf_src1; + image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_1: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_1_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q5_1_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1; + cl_image_desc image_desc_buf_src1; + image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } #endif //GGML_OPENCL_USE_ADRENO_KERNELS } case GGML_TYPE_Q8_0: { diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 5bbf09710f9..8f06d570587 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -56,6 +56,25 @@ struct block_q4_1 { uchar qs[QK4_1 / 2]; // nibbles / quants }; +//------------------------------------------------------------------------------ +// block_q5_0 +//------------------------------------------------------------------------------ +struct block_q5_0 { + half d; // delta + uchar qh[4]; // 5-th bit of quants + uchar qs[QK5_0 / 2]; // nibbles / quants +}; + +//------------------------------------------------------------------------------ +// block_q5_1 +//------------------------------------------------------------------------------ +struct block_q5_1 { + half d; // delta + half m; // min + uchar qh[4]; // 5-th bit of quants + uchar qs[QK5_1 / 2]; // nibbles / quants +}; + //------------------------------------------------------------------------------ // block_q4_k //------------------------------------------------------------------------------ @@ -460,6 +479,191 @@ kernel void kernel_restore_block_q4_1_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +kernel void kernel_convert_block_q5_0_trans4_ns( + __global struct block_q5_0 * src0, + __global uint * dst_qs, + __global uint * dst_qh, + __global half * dst_d, + uint ne00, + uint ne01 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK5_0; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + global struct block_q5_0 * b = src0 + src_blk_offset; + dst_d[dst_blk_offset] = b->d; + + dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0]; + + // extract quantization and unshuffle + ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0]; + ushort8 post_block = (ushort8)(0); + + uchar * pre_block_ptr = (uchar *)(&pre_block); + uchar * post_block_ptr = (uchar *)(&post_block); + + for (int i = 0; i < QK5_0 / 4; ++i) { + uchar x0 = pre_block_ptr[2*i + 0]; + uchar x1 = pre_block_ptr[2*i + 1]; + + post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + } + + uint4 q_block = as_uint4(post_block); + + uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; + dst_qs[offset] = q_block.x; + dst_qs[offset + ne01] = q_block.y; + dst_qs[offset + ne01 * 2] = q_block.z; + dst_qs[offset + ne01 * 3] = q_block.w; +} + +kernel void kernel_restore_block_q5_0_trans4_ns( + __global uint * src_qs, + __global uint * src_qh, + __global half * src_d, + __global struct block_q5_0 * dst0, + uint ne00, + uint ne01 +) { + int i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK5_0; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q5_0 * b = dst0 + dst_blk_offset; + b->d = src_d[src_blk_offset]; + + ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset]; + + // collect transposed quantization parts for a block + uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; + uint4 q_block; + q_block.x = src_qs[src_q_offset]; + q_block.y = src_qs[src_q_offset + ne01]; + q_block.z = src_qs[src_q_offset + ne01 * 2]; + q_block.w = src_qs[src_q_offset + ne01 * 3]; + + ushort8 post_block = as_ushort8(q_block); + ushort8 pre_block = (ushort8)(0); + + uchar * pre_block_ptr = (uchar *)(&pre_block); + uchar * post_block_ptr = (uchar *)(&post_block); + + for (int i = 0; i < QK5_0 / 4; ++i) { + uchar x0 = post_block_ptr[i + 0]; + uchar x1 = post_block_ptr[i + QK5_0 / 4]; + + pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + } + + ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; +} + +kernel void kernel_convert_block_q5_1_trans4_ns( + __global struct block_q5_1 * src0, + __global uint * dst_qs, + __global uint * dst_qh, + __global half * dst_d, + __global half * dst_m, + uint ne00, + uint ne01 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK5_1; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + global struct block_q5_1 * b = src0 + src_blk_offset; + dst_d[dst_blk_offset] = b->d; + dst_m[dst_blk_offset] = b->m; + + dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0]; + + // extract quantization and unshuffle + ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0]; + ushort8 post_block = (ushort8)(0); + + uchar * pre_block_ptr = (uchar *)(&pre_block); + uchar * post_block_ptr = (uchar *)(&post_block); + + for (int i = 0; i < QK5_1 / 4; ++i) { + uchar x0 = pre_block_ptr[2*i + 0]; + uchar x1 = pre_block_ptr[2*i + 1]; + + post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + post_block_ptr[i + QK5_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + } + + uint4 q_block = as_uint4(post_block); + + uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; + dst_qs[offset] = q_block.x; + dst_qs[offset + ne01] = q_block.y; + dst_qs[offset + ne01 * 2] = q_block.z; + dst_qs[offset + ne01 * 3] = q_block.w; +} + +kernel void kernel_restore_block_q5_1_trans4_ns( + __global uint * src_qs, + __global uint * src_qh, + __global half * src_d, + __global half * src_m, + __global struct block_q5_1 * dst0, + uint ne00, + uint ne01 +) { + int i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK5_1; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q5_1 * b = dst0 + dst_blk_offset; + b->d = src_d[src_blk_offset]; + b->m = src_m[src_blk_offset]; + + ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset]; + + // collect transposed quantization parts for a block + uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; + uint4 q_block; + q_block.x = src_qs[src_q_offset]; + q_block.y = src_qs[src_q_offset + ne01]; + q_block.z = src_qs[src_q_offset + ne01 * 2]; + q_block.w = src_qs[src_q_offset + ne01 * 3]; + + ushort8 post_block = as_ushort8(q_block); + ushort8 pre_block = (ushort8)(0); + + uchar * pre_block_ptr = (uchar *)(&pre_block); + uchar * post_block_ptr = (uchar *)(&post_block); + + for (int i = 0; i < QK5_1 / 4; ++i) { + uchar x0 = post_block_ptr[i + 0]; + uchar x1 = post_block_ptr[i + QK5_1 / 4]; + + pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + } + ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; +} + //------------------------------------------------------------------------------ // block_mxfp4 //------------------------------------------------------------------------------ diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl new file mode 100644 index 00000000000..3524cb1bdbd --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl @@ -0,0 +1,256 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 + + +#define dequantize_q5_0(qs5x16, qh5x16, a_f16, scale) \ + a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) - 16) * scale; \ + a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) - 16) * scale; \ + a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) - 16) * scale; \ + a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) - 16) * scale; \ + a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) - 16) * scale; \ + a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) - 16) * scale; \ + a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) - 16) * scale; \ + a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) - 16) * scale; \ + a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) - 16) * scale; \ + a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) - 16) * scale; \ + a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) - 16) * scale; \ + a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) - 16) * scale; \ + a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) - 16) * scale; \ + a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) - 16) * scale; \ + a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) - 16) * scale; \ + a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) - 16) * scale; \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair +kernel void kernel_gemm_moe_q5_0_f32_ns( + __read_only image1d_buffer_t src0_qs, + __global uint * src0_qh, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + // First sub-block + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5); + uint b_sub_offset = col * ne00 + step; + + // Load scale for current Q5_0 block + uint blk_offset = s_sub_offset + get_global_id(0); + half s = src0_d[blk_offset]; + + // Load 32 qh (5-th bit of each Q5) for the entire block + uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]); + + // Load 16 qs (half block) in transposed layout + uint2 qsx16; + qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x; + qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + // Convert to half and store to LM to share within the subgroup + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_0(as_ushort4(qsx16), qhx32.lo, reg_a, s); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + // 32 16x16 fp16 dot product with 8 elements reduction for better precision + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Repeat for second sub-block + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + // Load next 16 qs in transposed layout + qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x; + qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + // Convert to half and store to LM to share within the subgroup + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_0(as_ushort4(qsx16), qhx32.hi, reg_a, s); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + // 32 16x16 fp16 dot product with 3-levels reduction for better precision + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load poster router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile, override correct result in the end + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl new file mode 100644 index 00000000000..5fc2a523234 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl @@ -0,0 +1,258 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 + + +#define dequantize_q5_1(qs5x16, qh5x16, a_f16, scale, m) \ + a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \ + a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m); \ + a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m); \ + a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m); \ + a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m); \ + a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m); \ + a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \ + a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m); \ + a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m); \ + a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m); \ + a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m); \ + a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m); \ + a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m); \ + a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m); \ + a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m); \ + a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair +kernel void kernel_gemm_moe_q5_1_f32_ns( + __read_only image1d_buffer_t src0_qs, + __global uint * src0_qh, + __global half * src0_d, + __global half * src0_m, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + // First sub-block + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5); + uint b_sub_offset = col * ne00 + step; + + // Load scale and m for current Q5_1 block + uint blk_offset = s_sub_offset + get_global_id(0); + half s = src0_d[blk_offset]; + half m = src0_m[blk_offset]; + + // Load 32 qh (5-th bit of each Q5) for the entire block + uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]); + + // Load 16 qs (half block) in transposed layout + uint2 qsx16; + qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x; + qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + // Convert to half and store to LM to share within the subgroup + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_1(as_ushort4(qsx16), qhx32.lo, reg_a, s, m); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + // 32 16x16 fp16 dot product with 8 elements reduction for better precision + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Repeat for second sub-block + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + // Load next 16 qs in transposed layout + qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x; + qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + // Convert to half and store to LM to share within the subgroup + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_1(as_ushort4(qsx16), qhx32.hi, reg_a, s, m); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + // 32 16x16 fp16 dot product with 3-levels reduction for better precision + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load poster router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile, override correct result in the end + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl new file mode 100644 index 00000000000..938054cf982 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl @@ -0,0 +1,119 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_Q5_0 32 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +static inline float8 q5_0_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8) { + float8 fp32x8; + fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) - 16); + fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) - 16); + fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) - 16); + fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) - 16); + fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) - 16); + fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) - 16); + fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) - 16); + fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) - 16); + return fp32x8; +} + + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q5_0_f32_ns( + __global uint * src0_qs, + __global uint * src0_qh, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + uint ne00, + uint ne01, + uint ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + uint expert_offset = expert_id * ne00 * ne01 / 32; + + __private float sum = 0.0f; // each thread calculate partial sum of one output + + // loop along ne00 in block granularity, skip 4 blocks every iter + for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_0); ib00 += N_SIMDGROUP) { + + // load one block of q + uint4 regQ; + uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01; + + regQ.s0 = src0_qs[block_offset]; + regQ.s1 = src0_qs[block_offset + ne01]; + regQ.s2 = src0_qs[block_offset + ne01 * 2]; + regQ.s3 = src0_qs[block_offset + ne01 * 3]; + + uint offset = i11 * ne00 / 4 + ib00 * 8; + + uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]); + half regS = src0_d[ib00 * ne01 + i01 + expert_offset]; + + float8 fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1); + + shared_y4 = read_imagef(src1, (offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 3)); + acc += shared_y4 * fp32x8.hi; + + + fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2); + + shared_y4 = read_imagef(src1, (offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 5)); + acc += shared_y4 * fp32x8.hi; + + + fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3); + + shared_y4 = read_imagef(src1, (offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += (float)(regS) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 outputs per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } + +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl new file mode 100644 index 00000000000..f33a4ef2757 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl @@ -0,0 +1,121 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_Q5_1 32 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +static inline float8 q5_1_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) { + float8 fp32x8; + fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) * s + m); + fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m); + fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m); + fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m); + fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m); + fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m); + fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m); + fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m); + return fp32x8; +} + + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q5_1_f32_ns( + __global uint * src0_qs, + __global uint * src0_qh, + __global half * src0_d, + __global half * src0_m, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + uint ne00, + uint ne01, + uint ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + uint expert_offset = expert_id * ne00 * ne01 / 32; + + __private float sum = 0.0f; // each thread calculate partial sum of one output + + // loop along ne00 in block granularity, skip 4 blocks every iter + for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_1); ib00 += N_SIMDGROUP) { + + // load one block of q + uint4 regQ; + uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01; + + regQ.s0 = src0_qs[block_offset]; + regQ.s1 = src0_qs[block_offset + ne01]; + regQ.s2 = src0_qs[block_offset + ne01 * 2]; + regQ.s3 = src0_qs[block_offset + ne01 * 3]; + + uint offset = i11 * ne00 / 4 + ib00 * 8; + + uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]); + half regM = src0_m[ib00 * ne01 + i01 + expert_offset]; + half regS = src0_d[ib00 * ne01 + i01 + expert_offset]; + + float8 fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, regS, regM); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, regS, regM); + + shared_y4 = read_imagef(src1, (offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 3)); + acc += shared_y4 * fp32x8.hi; + + + fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, regS, regM); + + shared_y4 = read_imagef(src1, (offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 5)); + acc += shared_y4 * fp32x8.hi; + + + fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, regS, regM); + + shared_y4 = read_imagef(src1, (offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 outputs per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } + +} From 7f3f843c31cd32dc4adc10b393342dfee071c332 Mon Sep 17 00:00:00 2001 From: scutler-nv Date: Wed, 13 May 2026 13:36:14 -0700 Subject: [PATCH 5/5] Fix for issue #22974. Cast intermediate results to float before adding and casting the result to the destination type. Avoids half+half operator ambiguity. (#22994) --- ggml/src/ggml-cuda/allreduce.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu index 434689abd95..d56129a227e 100644 --- a/ggml/src/ggml-cuda/allreduce.cu +++ b/ggml/src/ggml-cuda/allreduce.cu @@ -184,13 +184,15 @@ static __global__ void ggml_cuda_ar_kernel( #pragma unroll for (int k = 0; k < ELEMS_PER_VEC; ++k) { const T_wire d_low = ggml_cuda_cast(sendbuf[off + k]); - recvbuf[off + k] = ggml_cuda_cast(d_low) + ggml_cuda_cast(wire[k]); + recvbuf[off + k] = ggml_cuda_cast( + ggml_cuda_cast(d_low) + ggml_cuda_cast(wire[k])); } } if (bid == 0 && tid < count - tail) { const T_wire d_low = ggml_cuda_cast(sendbuf[tail + tid]); - recvbuf[tail + tid] = - ggml_cuda_cast(d_low) + ggml_cuda_cast(host_other[tail + tid]); + recvbuf[tail + tid] = ggml_cuda_cast( + ggml_cuda_cast(d_low) + + ggml_cuda_cast(host_other[tail + tid])); } } } @@ -210,7 +212,8 @@ static __global__ void ggml_cuda_ar_add_kernel( const int nt = gridDim.x * blockDim.x; for (int i = tid; i < count; i += nt) { const T_src d_low = ggml_cuda_cast(dst[i]); - dst[i] = ggml_cuda_cast(d_low) + ggml_cuda_cast(src[i]); + dst[i] = ggml_cuda_cast( + ggml_cuda_cast(d_low) + ggml_cuda_cast(src[i])); } }