From e07ca378d425390b53ba63128be5aed14b0d1e9a Mon Sep 17 00:00:00 2001 From: PaulKoudelka Date: Fri, 10 Apr 2026 18:34:10 +0200 Subject: [PATCH] added functionality to add tokenizer to LLM and embedding models --- .../Assistants/I18N/allTexts.lua | 21 ++++++ .../Components/AttachDocuments.razor.cs | 3 + .../Settings/SettingsPanelEmbeddings.razor.cs | 1 + .../Settings/SettingsPanelProviders.razor.cs | 1 + .../Dialogs/EmbeddingProviderDialog.razor.cs | 10 +++ .../Dialogs/ProviderDialog.razor | 19 +++++ .../Dialogs/ProviderDialog.razor.cs | 75 +++++++++++++++++++ .../Provider/BaseProvider.cs | 3 + app/MindWork AI Studio/Provider/IProvider.cs | 7 +- .../Provider/LLMProvidersExtensions.cs | 40 +++++----- app/MindWork AI Studio/Provider/NoProvider.cs | 5 +- .../Settings/EmbeddingProvider.cs | 11 ++- app/MindWork AI Studio/Settings/Provider.cs | 11 ++- runtime/Cargo.toml | 1 + runtime/src/tokenizer.rs | 36 +++++---- 15 files changed, 201 insertions(+), 43 deletions(-) diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua index 5ef3bb98..791f11b8 100644 --- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua +++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua @@ -3817,6 +3817,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1324664716"] = "AP -- Create account UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1356621346"] = "Create account" +-- Failed to validate the selected tokenizer. Please try again. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again." + -- Please enter an embedding model name. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1661085403"] = "Please enter an embedding model name." @@ -3838,6 +3841,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2189814010"] = "Mo -- (Optional) API Key UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2331453405"] = "(Optional) API Key" +-- Invalid tokenizer: +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:" + -- Add UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2646845972"] = "Add" @@ -4036,6 +4042,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1324664716"] = "API Key" -- Create account UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1356621346"] = "Create account" +-- Failed to validate the selected tokenizer. Please try again. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again." + -- Load models UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T15352225"] = "Load models" @@ -4063,12 +4072,18 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2189814010"] = "Model" -- (Optional) API Key UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2331453405"] = "(Optional) API Key" +-- Invalid tokenizer: +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:" + -- Add UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2646845972"] = "Add" -- Additional API parameters UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2728244552"] = "Additional API parameters" +-- Selected file path for the custom tokenizer +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer" + -- No models loaded or available. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2810182573"] = "No models loaded or available." @@ -4087,6 +4102,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3763891899"] = "Show availa -- This host uses the model configured at the provider level. No model selection is available. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3783329915"] = "This host uses the model configured at the provider level. No model selection is available." +-- Choose a custom tokenizer here +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here" + -- Duplicate key '{0}' found. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3804472591"] = "Duplicate key '{0}' found." @@ -4108,6 +4126,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900237532"] = "Provider" -- Cancel UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900713019"] = "Cancel" +-- For better token estimates, you can configure a custom tokenizer for this provider. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T961454300"] = "For better token estimates, you can configure a custom tokenizer for this provider." + -- The parameter name. It must be unique within the retrieval process. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::RETRIEVALPROCESSDIALOG::T100726215"] = "The parameter name. It must be unique within the retrieval process." diff --git a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs index acfc0dd2..65a901ef 100644 --- a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs +++ b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs @@ -48,6 +48,9 @@ public partial class AttachDocuments : MSGComponentBase [Parameter] public bool UseSmallForm { get; set; } + [Parameter] + public FileType[]? AllowedFileTypes { get; set; } + /// /// When true, validate media file types before attaching. Default is true. That means that /// the user cannot attach unsupported media file types when the provider or model does not diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs index 775b2ad9..8f9ad19c 100644 --- a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs +++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs @@ -73,6 +73,7 @@ public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase { x => x.IsSelfHosted, embeddingProvider.IsSelfHosted }, { x => x.IsEditing, true }, { x => x.DataHost, embeddingProvider.Host }, + { x => x.DataTokenizerPath, embeddingProvider.TokenizerPath }, }; var dialogReference = await this.DialogService.ShowAsync(T("Edit Embedding Provider"), dialogParameters, DialogOptions.FULLSCREEN); diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs index 500a4c2d..f4f4d9bd 100644 --- a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs +++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs @@ -73,6 +73,7 @@ public partial class SettingsPanelProviders : SettingsPanelProviderBase { x => x.DataHost, provider.Host }, { x => x.HFInferenceProviderId, provider.HFInferenceProvider }, { x => x.AdditionalJsonApiParameters, provider.AdditionalJsonApiParameters }, + { x => x.DataTokenizerPath, provider.TokenizerPath }, }; var dialogReference = await this.DialogService.ShowAsync(T("Edit LLM Provider"), dialogParameters, DialogOptions.FULLSCREEN); diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs index 3892d05d..9e4479a7 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs @@ -69,6 +69,9 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId /// [Parameter] public bool IsEditing { get; init; } + + [Parameter] + public string DataTokenizerPath { get; set; } = string.Empty; [Inject] private RustService RustService { get; init; } = null!; @@ -143,6 +146,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId Host = this.DataHost, IsEnterpriseConfiguration = false, EnterpriseConfigurationPluginId = Guid.Empty, + TokenizerPath = this.dataFilePath, }; } @@ -164,6 +168,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId if(this.IsEditing) { this.dataEditingPreviousInstanceName = this.DataName.ToLowerInvariant(); + this.dataFilePath = this.DataTokenizerPath; Console.WriteLine($"Previous instance name is '{this.dataEditingPreviousInstanceName}'"); // When using self-hosted embedding, we must copy the model name: @@ -241,7 +246,12 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId var response = await this.RustService.StoreTokenizer(this.DataName, this.dataEditingPreviousInstanceName, this.dataFilePath); Console.WriteLine($"Response from Rust: {response.Message}"); if (!response.Success) + { + this.dataCustomTokenizerValidationIssue = response.Message; + await this.form.Validate(); return; + } + this.dataFilePath = response.Message; // Use the data model to store the provider. // We just return this data to the parent component: diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor index 4c09da2f..0e61ce5b 100644 --- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor +++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor @@ -1,6 +1,7 @@ @using AIStudio.Provider @using AIStudio.Provider.HuggingFace @using AIStudio.Provider.SelfHosted +@using AIStudio.Tools.Rust @inherits MSGComponentBase @@ -150,6 +151,24 @@ Validation="@this.providerValidation.ValidatingInstanceName" UserAttributes="@SPELLCHECK_ATTRIBUTES" /> + + @if (this.DataLLMProvider != LLMProviders.NONE) + { + + @T("For better token estimates, you can configure a custom tokenizer for this provider.") + + + } diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs index 9e84bea8..fbd9a9b2 100644 --- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs @@ -8,6 +8,7 @@ using AIStudio.Tools.Services; using AIStudio.Tools.Validation; using Microsoft.AspNetCore.Components; +using Microsoft.AspNetCore.Components.Web; using Host = AIStudio.Provider.SelfHosted.Host; @@ -83,6 +84,9 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId [Parameter] public string AdditionalJsonApiParameters { get; set; } = string.Empty; + + [Parameter] + public string DataTokenizerPath { get; set; } = string.Empty; [Inject] private RustService RustService { get; init; } = null!; @@ -104,6 +108,11 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId private string dataAPIKeyStorageIssue = string.Empty; private string dataEditingPreviousInstanceName = string.Empty; private string dataLoadingModelsIssue = string.Empty; + private string dataFilePath = string.Empty; + private string dataCustomTokenizerValidationIssue = string.Empty; + private Task dataTokenizerValidationTask = Task.CompletedTask; + private bool dataStoreWasAttempted; + private int dataTokenizerValidationRevision; private bool showExpertSettings; // We get the form reference from Blazor code to validate it manually: @@ -123,6 +132,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId GetUsedInstanceNames = () => this.UsedInstanceNames, GetHost = () => this.DataHost, IsModelProvidedManually = () => this.DataLLMProvider.IsLLMModelProvidedManually(), + GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue, }; } @@ -158,6 +168,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId Host = this.DataHost, HFInferenceProvider = this.HFInferenceProviderId, AdditionalJsonApiParameters = this.AdditionalJsonApiParameters, + TokenizerPath = this.dataFilePath, }; } @@ -182,6 +193,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId if(this.IsEditing) { this.dataEditingPreviousInstanceName = this.DataInstanceName.ToLowerInvariant(); + this.dataFilePath = this.DataTokenizerPath; // When using Fireworks or Hugging Face, we must copy the model name: if (this.DataLLMProvider.IsLLMModelProvidedManually()) @@ -237,6 +249,8 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId private async Task Store() { + this.dataStoreWasAttempted = true; + await this.dataTokenizerValidationTask; await this.form.Validate(); if (!string.IsNullOrWhiteSpace(this.dataAPIKeyStorageIssue)) this.dataAPIKeyStorageIssue = string.Empty; @@ -253,6 +267,15 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId // When the data is not valid, we don't store it: if (!this.dataIsValid) return; + + var tokenizerResponse = await this.RustService.StoreTokenizer(this.DataInstanceName, this.dataEditingPreviousInstanceName, this.dataFilePath); + if (!tokenizerResponse.Success) + { + this.dataCustomTokenizerValidationIssue = tokenizerResponse.Message; + await this.form.Validate(); + return; + } + this.dataFilePath = tokenizerResponse.Message; // Use the data model to store the provider. // We just return this data to the parent component: @@ -292,6 +315,58 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId } } + private Task ClearPathTokenizer(MouseEventArgs _) + { + return this.OnDataFilePathChanged(string.Empty); + } + + private async Task OnDataFilePathChanged(string filePath) + { + this.dataFilePath = filePath; + var validationRevision = ++this.dataTokenizerValidationRevision; + this.dataTokenizerValidationTask = this.ValidateCustomTokenizer(filePath, validationRevision); + await this.dataTokenizerValidationTask; + + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + if (this.dataStoreWasAttempted) + await this.form.Validate(); + else + this.form.ResetValidation(); + } + + private async Task ValidateCustomTokenizer(string filePath, int validationRevision) + { + if (string.IsNullOrWhiteSpace(filePath)) + { + if (validationRevision == this.dataTokenizerValidationRevision) + this.dataCustomTokenizerValidationIssue = string.Empty; + + return; + } + + try + { + var response = await this.RustService.ValidateTokenizer(filePath); + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + if (response.Success) + this.dataCustomTokenizerValidationIssue = string.Empty; + else + this.dataCustomTokenizerValidationIssue = T("Invalid tokenizer: ") + response.Message; + } + catch (Exception e) + { + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + this.Logger.LogError(e, "Failed to validate custom tokenizer."); + this.dataCustomTokenizerValidationIssue = T("Failed to validate the selected tokenizer. Please try again."); + } + } + private void OnHostChanged(Host selectedHost) { // When the host changes, reset the model selection state: diff --git a/app/MindWork AI Studio/Provider/BaseProvider.cs b/app/MindWork AI Studio/Provider/BaseProvider.cs index 9b729824..28179223 100644 --- a/app/MindWork AI Studio/Provider/BaseProvider.cs +++ b/app/MindWork AI Studio/Provider/BaseProvider.cs @@ -90,6 +90,9 @@ public abstract class BaseProvider : IProvider, ISecretId /// public string AdditionalJsonApiParameters { get; init; } = string.Empty; + /// + public string TokenizerPath { get; init; } = string.Empty; + /// public abstract IAsyncEnumerable StreamChatCompletion(Model chatModel, ChatThread chatThread, SettingsManager settingsManager, CancellationToken token = default); diff --git a/app/MindWork AI Studio/Provider/IProvider.cs b/app/MindWork AI Studio/Provider/IProvider.cs index ef15dd21..e0842f2e 100644 --- a/app/MindWork AI Studio/Provider/IProvider.cs +++ b/app/MindWork AI Studio/Provider/IProvider.cs @@ -28,6 +28,11 @@ public interface IProvider /// The additional API parameters. /// public string AdditionalJsonApiParameters { get; } + + /// + /// The tokenizer path associated with this provider configuration. + /// + public string TokenizerPath { get; } /// /// Starts a chat completion stream. @@ -101,4 +106,4 @@ public interface IProvider /// >The cancellation token. /// >The list of transcription models. public Task> GetTranscriptionModels(string? apiKeyProvisional = null, CancellationToken token = default); -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs index e71cef95..f04d9af4 100644 --- a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs +++ b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs @@ -186,7 +186,7 @@ public static class LLMProvidersExtensions /// The provider instance. public static IProvider CreateProvider(this AIStudio.Settings.Provider providerSettings) { - return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration); + return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.TokenizerPath, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration); } /// @@ -196,7 +196,7 @@ public static class LLMProvidersExtensions /// The provider instance. public static IProvider CreateProvider(this EmbeddingProvider embeddingProviderSettings) { - return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration); + return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, embeddingProviderSettings.TokenizerPath, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration); } /// @@ -206,33 +206,33 @@ public static class LLMProvidersExtensions /// The provider instance. public static IProvider CreateProvider(this TranscriptionProvider transcriptionProviderSettings) { - return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration); + return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, string.Empty, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration); } - private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false) + private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string tokenizerPath = "", string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false) { try { return provider switch { - LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, - LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, + LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration }, _ => new NoProvider(), }; @@ -442,4 +442,4 @@ public static class LLMProvidersExtensions LLMProviders.HUGGINGFACE => true, _ => false, }; -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Provider/NoProvider.cs b/app/MindWork AI Studio/Provider/NoProvider.cs index 3fc8459c..9128ad47 100644 --- a/app/MindWork AI Studio/Provider/NoProvider.cs +++ b/app/MindWork AI Studio/Provider/NoProvider.cs @@ -18,6 +18,9 @@ public class NoProvider : IProvider /// public string AdditionalJsonApiParameters { get; init; } = string.Empty; + /// + public string TokenizerPath { get; init; } = string.Empty; + public Task> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult>([]); public Task> GetImageModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult>([]); @@ -45,4 +48,4 @@ public class NoProvider : IProvider public IReadOnlyCollection GetModelCapabilities(Model model) => [ Capability.NONE ]; #endregion -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs index d5a6f20a..0f72c6c7 100644 --- a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs +++ b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs @@ -19,7 +19,8 @@ public sealed record EmbeddingProvider( bool IsEnterpriseConfiguration = false, Guid EnterpriseConfigurationPluginId = default, string Hostname = "http://localhost:1234", - Host Host = Host.NONE) : ConfigurationBaseObject, ISecretId + Host Host = Host.NONE, + string TokenizerPath = "") : ConfigurationBaseObject, ISecretId { private static readonly ILogger LOGGER = Program.LOGGER_FACTORY.CreateLogger(); @@ -96,6 +97,13 @@ public sealed record EmbeddingProvider( return false; } + var tokenizerPath = string.Empty; + if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead(out tokenizerPath)) + { + LOGGER.LogWarning($"The configured embedding provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})"); + tokenizerPath = string.Empty; + } + provider = new EmbeddingProvider { Num = 0, // will be set later by the PluginConfigurationObject @@ -108,6 +116,7 @@ public sealed record EmbeddingProvider( EnterpriseConfigurationPluginId = configPluginId, Hostname = hostname, Host = host, + TokenizerPath = tokenizerPath, }; // Handle encrypted API key if present: diff --git a/app/MindWork AI Studio/Settings/Provider.cs b/app/MindWork AI Studio/Settings/Provider.cs index 0ccf272c..c8276bcd 100644 --- a/app/MindWork AI Studio/Settings/Provider.cs +++ b/app/MindWork AI Studio/Settings/Provider.cs @@ -32,7 +32,8 @@ public sealed record Provider( string Hostname = "http://localhost:1234", Host Host = Host.NONE, HFInferenceProvider HFInferenceProvider = HFInferenceProvider.NONE, - string AdditionalJsonApiParameters = "") : ConfigurationBaseObject, ISecretId + string AdditionalJsonApiParameters = "", + string TokenizerPath = "") : ConfigurationBaseObject, ISecretId { private static readonly ILogger LOGGER = Program.LOGGER_FACTORY.CreateLogger(); @@ -151,6 +152,13 @@ public sealed record Provider( additionalJsonApiParameters = string.Empty; } + var tokenizerPath = string.Empty; + if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead(out tokenizerPath)) + { + LOGGER.LogWarning($"The configured provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})"); + tokenizerPath = string.Empty; + } + provider = new Provider { Num = 0, // will be set later by the PluginConfigurationObject @@ -165,6 +173,7 @@ public sealed record Provider( Host = host, HFInferenceProvider = hfInferenceProvider, AdditionalJsonApiParameters = additionalJsonApiParameters, + TokenizerPath = tokenizerPath, }; // Handle encrypted API key if present: diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index 0fb62f1a..4b41800c 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -41,6 +41,7 @@ pptx-to-md = "0.4.0" tempfile = "3.27.0" strum_macros = "0.28.0" sysinfo = "0.38.4" +tokenizers = "0.22.2" # Fixes security vulnerability downstream, where the upstream is not fixed yet: time = "0.3.47" # -> Rocket diff --git a/runtime/src/tokenizer.rs b/runtime/src/tokenizer.rs index 9fe1801e..f45416af 100644 --- a/runtime/src/tokenizer.rs +++ b/runtime/src/tokenizer.rs @@ -1,5 +1,4 @@ -use rocket::yansi::Paint; -use std::fs; +use std::fs; use std::path::{PathBuf}; use std::sync::OnceLock; use rocket::{post}; @@ -75,23 +74,16 @@ fn validate_tokenizer_at_path(path: &PathBuf) -> Result { } let tokenizer = Tokenizer::from_file(path).map_err(|e| { - println!("Failed to load tokenizer from {}: {}", Paint::red(&path.display()), e); TokenizerError::from(format!( "Failed to load tokenizer from '{}': {}", path.display(), e )) })?; - println!("Loaded tokenizer from {}", Paint::green(&path.display())); let test_string = "Hello, world! This is a test string for tokenizer validation."; let encoding = tokenizer.encode(test_string, true).map_err(|e| { - println!( - "Tokenizer failed to encode validation string for {}: {}", - Paint::red(&path.display()), - e - ); TokenizerError::from(format!( "Tokenizer failed to encode validation string: {}", e @@ -114,7 +106,7 @@ fn validate_tokenizer_at_path(path: &PathBuf) -> Result { Ok(token_count) } -fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Error> { +fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result { let data_dir = DATA_DIRECTORY .get() .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "DATA_DIRECTORY not initialized"))?; @@ -124,11 +116,11 @@ fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Err // Delete previous model if file_path is empty if payload.file_path.trim().is_empty() { if payload.previous_model_id.trim().is_empty() { - return Ok(()); // Nothing to delete + return Ok(String::from("")); // Nothing to delete } let previous_path = base_path.join(&payload.previous_model_id); fs::remove_dir_all(previous_path)?; - return Ok(()); + return Ok(String::from("")); } // Copy file @@ -136,22 +128,28 @@ fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Err let source_name = source_path.file_name() .and_then(|n| n.to_str()) .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid tokenizer file path"))?; - fs::create_dir_all(&base_path.join(&payload.model_id))?; - let destination_path = base_path.join(&payload.model_id).join(source_name); - println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display()); + let model_path = &base_path.join(&payload.model_id); + let destination_path = &model_path.join(source_name); + println!("source_path: {}, destination_path: {}", source_path.display(), destination_path.display()); + println!("equals {}", source_path.eq(destination_path)); + if !source_path.eq(destination_path) && model_path.exists() { + fs::remove_dir_all(model_path)?; + } + fs::create_dir_all(model_path)?; + println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display()); let previous_path = base_path.join(&payload.previous_model_id); // Delete previous tokenizer folder if specified if !payload.previous_model_id.trim().is_empty() && source_path.starts_with(&previous_path){ fs::rename(&source_path, &destination_path)?; - if previous_path.exists() { + if previous_path.exists() && !previous_path.eq(model_path) { fs::remove_dir_all(previous_path)?; } }else{ fs::copy( & source_path, & destination_path)?; } - Ok(()) + Ok(destination_path.to_str().unwrap().to_string()) } pub fn get_token_count(text: &str) -> Result { @@ -179,10 +177,10 @@ pub fn validate_tokenizer(_token: APIToken, payload: Json) pub fn store_tokenizer(_token: APIToken, payload: Json) -> Json{ println!("Received tokenizer store request: {}, {}, {}", payload.model_id, payload.previous_model_id, payload.file_path); match handle_tokenizer_store(&payload) { - Ok(()) => Json(TokenizerResponse { + Ok(dest_path) => Json(TokenizerResponse { success: true, token_count: 0, - message: "Success".to_string(), + message: dest_path, }), Err(e) => Json(TokenizerResponse { success: false,