From e07ca378d425390b53ba63128be5aed14b0d1e9a Mon Sep 17 00:00:00 2001
From: PaulKoudelka <paulk.business@posteo.com>
Date: Fri, 10 Apr 2026 18:34:10 +0200
Subject: [PATCH] Add custom tokenizer support for LLM and embedding providers

---
 .../Assistants/I18N/allTexts.lua              | 21 ++++++
 .../Components/AttachDocuments.razor.cs       |  3 +
 .../Settings/SettingsPanelEmbeddings.razor.cs |  1 +
 .../Settings/SettingsPanelProviders.razor.cs  |  1 +
 .../Dialogs/EmbeddingProviderDialog.razor.cs  | 10 +++
 .../Dialogs/ProviderDialog.razor              | 19 +++++
 .../Dialogs/ProviderDialog.razor.cs           | 75 +++++++++++++++++++
 .../Provider/BaseProvider.cs                  |  3 +
 app/MindWork AI Studio/Provider/IProvider.cs  |  7 +-
 .../Provider/LLMProvidersExtensions.cs        | 40 +++++-----
 app/MindWork AI Studio/Provider/NoProvider.cs |  5 +-
 .../Settings/EmbeddingProvider.cs             | 11 ++-
 app/MindWork AI Studio/Settings/Provider.cs   | 11 ++-
 runtime/Cargo.toml                            |  1 +
 runtime/src/tokenizer.rs                      | 36 +++++----
 15 files changed, 201 insertions(+), 43 deletions(-)

diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
index 5ef3bb98..791f11b8 100644
--- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua	
+++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua	
@@ -3817,6 +3817,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1324664716"] = "AP
 -- Create account
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1356621346"] = "Create account"
 
+-- Failed to validate the selected tokenizer. Please try again.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again."
+
 -- Please enter an embedding model name.
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T1661085403"] = "Please enter an embedding model name."
 
@@ -3838,6 +3841,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2189814010"] = "Mo
 -- (Optional) API Key
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2331453405"] = "(Optional) API Key"
 
+-- Invalid tokenizer:
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:"
+
 -- Add
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2646845972"] = "Add"
 
@@ -4036,6 +4042,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1324664716"] = "API Key"
 -- Create account
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1356621346"] = "Create account"
 
+-- Failed to validate the selected tokenizer. Please try again.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T1384494471"] = "Failed to validate the selected tokenizer. Please try again."
+
 -- Load models
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T15352225"] = "Load models"
 
@@ -4063,12 +4072,18 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2189814010"] = "Model"
 -- (Optional) API Key
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2331453405"] = "(Optional) API Key"
 
+-- Invalid tokenizer:
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2448302543"] = "Invalid tokenizer:"
+
 -- Add
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2646845972"] = "Add"
 
 -- Additional API parameters
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2728244552"] = "Additional API parameters"
 
+-- Selected file path for the custom tokenizer
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer"
+
 -- No models loaded or available.
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T2810182573"] = "No models loaded or available."
 
@@ -4087,6 +4102,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3763891899"] = "Show availa
 -- This host uses the model configured at the provider level. No model selection is available.
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3783329915"] = "This host uses the model configured at the provider level. No model selection is available."
 
+-- Choose a custom tokenizer here
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here"
+
 -- Duplicate key '{0}' found.
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T3804472591"] = "Duplicate key '{0}' found."
 
@@ -4108,6 +4126,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900237532"] = "Provider"
 -- Cancel
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T900713019"] = "Cancel"
 
+-- For better token estimates, you can configure a custom tokenizer for this provider.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::PROVIDERDIALOG::T961454300"] = "For better token estimates, you can configure a custom tokenizer for this provider."
+
 -- The parameter name. It must be unique within the retrieval process.
 UI_TEXT_CONTENT["AISTUDIO::DIALOGS::RETRIEVALPROCESSDIALOG::T100726215"] = "The parameter name. It must be unique within the retrieval process."
 
diff --git a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs
index acfc0dd2..65a901ef 100644
--- a/app/MindWork AI Studio/Components/AttachDocuments.razor.cs	
+++ b/app/MindWork AI Studio/Components/AttachDocuments.razor.cs	
@@ -48,6 +48,9 @@ public partial class AttachDocuments : MSGComponentBase
     [Parameter]
     public bool UseSmallForm { get; set; }
     
+    [Parameter]
+    public FileType[]? AllowedFileTypes { get; set; }
+    
     /// <summary>
     /// When true, validate media file types before attaching. Default is true. That means that
     /// the user cannot attach unsupported media file types when the provider or model does not
diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs
index 775b2ad9..8f9ad19c 100644
--- a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs	
+++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs	
@@ -73,6 +73,7 @@ public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase
             { x => x.IsSelfHosted, embeddingProvider.IsSelfHosted },
             { x => x.IsEditing, true },
             { x => x.DataHost, embeddingProvider.Host },
+            { x => x.DataTokenizerPath, embeddingProvider.TokenizerPath },
         };
 
         var dialogReference = await this.DialogService.ShowAsync<EmbeddingProviderDialog>(T("Edit Embedding Provider"), dialogParameters, DialogOptions.FULLSCREEN);
diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs
index 500a4c2d..f4f4d9bd 100644
--- a/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs	
+++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelProviders.razor.cs	
@@ -73,6 +73,7 @@ public partial class SettingsPanelProviders : SettingsPanelProviderBase
             { x => x.DataHost, provider.Host },
             { x => x.HFInferenceProviderId, provider.HFInferenceProvider },
             { x => x.AdditionalJsonApiParameters, provider.AdditionalJsonApiParameters },
+            { x => x.DataTokenizerPath, provider.TokenizerPath },
         };
 
         var dialogReference = await this.DialogService.ShowAsync<ProviderDialog>(T("Edit LLM Provider"), dialogParameters, DialogOptions.FULLSCREEN);
diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
index 3892d05d..9e4479a7 100644
--- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs	
+++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs	
@@ -69,6 +69,9 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
     /// </summary>
     [Parameter]
     public bool IsEditing { get; init; }
+
+    [Parameter]
+    public string DataTokenizerPath { get; set; } = string.Empty; // Tokenizer path of the embedding provider being edited; copied into dataFilePath when IsEditing is set.
     
     [Inject]
     private RustService RustService { get; init; } = null!;
@@ -143,6 +146,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
             Host = this.DataHost,
             IsEnterpriseConfiguration = false,
             EnterpriseConfigurationPluginId = Guid.Empty,
+            TokenizerPath = this.dataFilePath,
         };
     }
     
@@ -164,6 +168,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
         if(this.IsEditing)
         {
             this.dataEditingPreviousInstanceName = this.DataName.ToLowerInvariant();
+            this.dataFilePath = this.DataTokenizerPath;
             Console.WriteLine($"Previous instance name is '{this.dataEditingPreviousInstanceName}'");
             
             // When using self-hosted embedding, we must copy the model name:
@@ -241,7 +246,12 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
         var response = await this.RustService.StoreTokenizer(this.DataName, this.dataEditingPreviousInstanceName, this.dataFilePath);
         Console.WriteLine($"Response from Rust: {response.Message}");
         if (!response.Success)
+        {
+            this.dataCustomTokenizerValidationIssue = response.Message;
+            await this.form.Validate();
             return;
+        }
+        this.dataFilePath = response.Message;
         
         // Use the data model to store the provider.
         // We just return this data to the parent component:
diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor
index 4c09da2f..0e61ce5b 100644
--- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor	
+++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor	
@@ -1,6 +1,7 @@
 @using AIStudio.Provider
 @using AIStudio.Provider.HuggingFace
 @using AIStudio.Provider.SelfHosted
+@using AIStudio.Tools.Rust
 @inherits MSGComponentBase
 <MudDialog>
     <DialogContent>
@@ -150,6 +151,24 @@
                 Validation="@this.providerValidation.ValidatingInstanceName"
                 UserAttributes="@SPELLCHECK_ATTRIBUTES"
             />
+
+            @if (this.DataLLMProvider != LLMProviders.NONE)
+            {
+                <MudJustifiedText Typo="Typo.body1" Class="mb-3">
+                    @T("For better token estimates, you can configure a custom tokenizer for this provider.")
+                </MudJustifiedText>
+                <SelectFile
+                    File="@this.dataFilePath"
+                    FileChanged="@this.OnDataFilePathChanged"
+                    Label="@T("Selected file path for the custom tokenizer")"
+                    FileDialogTitle="@T("Choose a custom tokenizer here")"
+                    Filter="[FileTypes.JSON]"
+                    IsClearable="@true"
+                    Error="@(!string.IsNullOrWhiteSpace(this.dataCustomTokenizerValidationIssue))"
+                    ErrorText="@(this.dataCustomTokenizerValidationIssue)"
+                    Validation="@this.providerValidation.ValidatingCustomTokenizer"
+                    OnClear="@this.ClearPathTokenizer" />
+            }
             
             <MudStack>
                 <MudButton OnClick="@this.ToggleExpertSettings">
diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
index 9e84bea8..fbd9a9b2 100644
--- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs	
+++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs	
@@ -8,6 +8,7 @@ using AIStudio.Tools.Services;
 using AIStudio.Tools.Validation;
 
 using Microsoft.AspNetCore.Components;
+using Microsoft.AspNetCore.Components.Web;
 
 using Host = AIStudio.Provider.SelfHosted.Host;
 
@@ -83,6 +84,9 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
     
     [Parameter]
     public string AdditionalJsonApiParameters { get; set; } = string.Empty;
+
+    [Parameter]
+    public string DataTokenizerPath { get; set; } = string.Empty; // Tokenizer path of the provider being edited; copied into dataFilePath when IsEditing is set.
     
     [Inject]
     private RustService RustService { get; init; } = null!;
@@ -104,6 +108,11 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
     private string dataAPIKeyStorageIssue = string.Empty;
     private string dataEditingPreviousInstanceName = string.Empty;
     private string dataLoadingModelsIssue = string.Empty;
+    private string dataFilePath = string.Empty;
+    private string dataCustomTokenizerValidationIssue = string.Empty;
+    private Task dataTokenizerValidationTask = Task.CompletedTask;
+    private bool dataStoreWasAttempted;
+    private int dataTokenizerValidationRevision;
     private bool showExpertSettings;
     
     // We get the form reference from Blazor code to validate it manually:
@@ -123,6 +132,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
             GetUsedInstanceNames = () => this.UsedInstanceNames,
             GetHost = () => this.DataHost,
             IsModelProvidedManually = () => this.DataLLMProvider.IsLLMModelProvidedManually(),
+            GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue,
         };
     }
 
@@ -158,6 +168,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
             Host = this.DataHost,
             HFInferenceProvider = this.HFInferenceProviderId,
             AdditionalJsonApiParameters = this.AdditionalJsonApiParameters,
+            TokenizerPath = this.dataFilePath,
         };
     }
 
@@ -182,6 +193,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
         if(this.IsEditing)
         {
             this.dataEditingPreviousInstanceName = this.DataInstanceName.ToLowerInvariant();
+            this.dataFilePath = this.DataTokenizerPath;
             
             // When using Fireworks or Hugging Face, we must copy the model name:
             if (this.DataLLMProvider.IsLLMModelProvidedManually())
@@ -237,6 +249,8 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
 
     private async Task Store()
     {
+        this.dataStoreWasAttempted = true;
+        await this.dataTokenizerValidationTask;
         await this.form.Validate();
         if (!string.IsNullOrWhiteSpace(this.dataAPIKeyStorageIssue))
             this.dataAPIKeyStorageIssue = string.Empty;
@@ -253,6 +267,15 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
         // When the data is not valid, we don't store it:
         if (!this.dataIsValid)
             return;
+
+        var tokenizerResponse = await this.RustService.StoreTokenizer(this.DataInstanceName, this.dataEditingPreviousInstanceName, this.dataFilePath);
+        if (!tokenizerResponse.Success)
+        {
+            this.dataCustomTokenizerValidationIssue = tokenizerResponse.Message;
+            await this.form.Validate();
+            return;
+        }
+        this.dataFilePath = tokenizerResponse.Message;
         
         // Use the data model to store the provider.
         // We just return this data to the parent component:
@@ -292,6 +315,58 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
         }
     }
 
+    private Task ClearPathTokenizer(MouseEventArgs _) // Clear handler of the tokenizer file picker; the mouse event itself is not used.
+    {
+        return this.OnDataFilePathChanged(string.Empty); // Clearing goes through the regular path-changed flow with an empty path.
+    }
+
+    private async Task OnDataFilePathChanged(string filePath) // Takes over a new tokenizer path, validates it, and then refreshes the form's validation state.
+    {
+        this.dataFilePath = filePath;
+        var validationRevision = ++this.dataTokenizerValidationRevision; // Every change gets its own revision so results of older validation runs can be recognized.
+        this.dataTokenizerValidationTask = this.ValidateCustomTokenizer(filePath, validationRevision); // Stored in a field: Store() awaits this task before it validates the form.
+        await this.dataTokenizerValidationTask;
+
+        if (validationRevision != this.dataTokenizerValidationRevision) // The path changed again while awaiting; the newer call takes care of the form.
+            return;
+
+        if (this.dataStoreWasAttempted) // After a store attempt the form is validated again; otherwise its validation state is reset.
+            await this.form.Validate();
+        else
+            this.form.ResetValidation();
+    }
+
+    private async Task ValidateCustomTokenizer(string filePath, int validationRevision) // Validates the file via the Rust service and records the outcome in dataCustomTokenizerValidationIssue.
+    {
+        if (string.IsNullOrWhiteSpace(filePath)) // No tokenizer selected: nothing to validate.
+        {
+            if (validationRevision == this.dataTokenizerValidationRevision) // Only the most recent request may clear the issue.
+                this.dataCustomTokenizerValidationIssue = string.Empty;
+
+            return;
+        }
+
+        try
+        {
+            var response = await this.RustService.ValidateTokenizer(filePath);
+            if (validationRevision != this.dataTokenizerValidationRevision) // A newer request started in the meantime; discard this stale result.
+                return;
+
+            if (response.Success)
+                this.dataCustomTokenizerValidationIssue = string.Empty; // An empty issue string means there is no tokenizer error to show.
+            else
+                this.dataCustomTokenizerValidationIssue = T("Invalid tokenizer: ") + response.Message;
+        }
+        catch (Exception e)
+        {
+            if (validationRevision != this.dataTokenizerValidationRevision) // Stale request: neither log nor report the failure.
+                return;
+
+            this.Logger.LogError(e, "Failed to validate custom tokenizer.");
+            this.dataCustomTokenizerValidationIssue = T("Failed to validate the selected tokenizer. Please try again."); // The user sees a generic localized message; the exception goes to the log above.
+        }
+    }
+
     private void OnHostChanged(Host selectedHost)
     {
         // When the host changes, reset the model selection state:
diff --git a/app/MindWork AI Studio/Provider/BaseProvider.cs b/app/MindWork AI Studio/Provider/BaseProvider.cs
index 9b729824..28179223 100644
--- a/app/MindWork AI Studio/Provider/BaseProvider.cs	
+++ b/app/MindWork AI Studio/Provider/BaseProvider.cs	
@@ -90,6 +90,9 @@ public abstract class BaseProvider : IProvider, ISecretId
     /// <inheritdoc />
     public string AdditionalJsonApiParameters { get; init; } = string.Empty;
 
+    /// <inheritdoc />
+    public string TokenizerPath { get; init; } = string.Empty;
+
     /// <inheritdoc />
     public abstract IAsyncEnumerable<ContentStreamChunk> StreamChatCompletion(Model chatModel, ChatThread chatThread, SettingsManager settingsManager, CancellationToken token = default);
     
diff --git a/app/MindWork AI Studio/Provider/IProvider.cs b/app/MindWork AI Studio/Provider/IProvider.cs
index ef15dd21..e0842f2e 100644
--- a/app/MindWork AI Studio/Provider/IProvider.cs	
+++ b/app/MindWork AI Studio/Provider/IProvider.cs	
@@ -28,6 +28,11 @@ public interface IProvider
     /// The additional API parameters.
     /// </summary>
     public string AdditionalJsonApiParameters { get; }
+
+    /// <summary>
+    /// The tokenizer path associated with this provider configuration.
+    /// </summary>
+    public string TokenizerPath { get; }
     
     /// <summary>
     /// Starts a chat completion stream.
@@ -101,4 +106,4 @@ public interface IProvider
     /// <param name="token">>The cancellation token.</param>
     /// <returns>>The list of transcription models.</returns>
     public Task<IEnumerable<Model>> GetTranscriptionModels(string? apiKeyProvisional = null, CancellationToken token = default);
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs
index e71cef95..f04d9af4 100644
--- a/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs	
+++ b/app/MindWork AI Studio/Provider/LLMProvidersExtensions.cs	
@@ -186,7 +186,7 @@ public static class LLMProvidersExtensions
     /// <returns>The provider instance.</returns>
     public static IProvider CreateProvider(this AIStudio.Settings.Provider providerSettings)
     {
-        return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration);
+        return providerSettings.UsedLLMProvider.CreateProvider(providerSettings.InstanceName, providerSettings.Host, providerSettings.Hostname, providerSettings.Model, providerSettings.HFInferenceProvider, providerSettings.TokenizerPath, providerSettings.AdditionalJsonApiParameters, providerSettings.IsEnterpriseConfiguration);
     }
     
     /// <summary>
@@ -196,7 +196,7 @@ public static class LLMProvidersExtensions
     /// <returns>The provider instance.</returns>
     public static IProvider CreateProvider(this EmbeddingProvider embeddingProviderSettings)
     {
-        return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration);
+        return embeddingProviderSettings.UsedLLMProvider.CreateProvider(embeddingProviderSettings.Name, embeddingProviderSettings.Host, embeddingProviderSettings.Hostname, embeddingProviderSettings.Model, HFInferenceProvider.NONE, embeddingProviderSettings.TokenizerPath, isEnterpriseConfiguration: embeddingProviderSettings.IsEnterpriseConfiguration);
     }
     
     /// <summary>
@@ -206,33 +206,33 @@ public static class LLMProvidersExtensions
     /// <returns>The provider instance.</returns>
     public static IProvider CreateProvider(this TranscriptionProvider transcriptionProviderSettings)
     {
-        return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration);
+        return transcriptionProviderSettings.UsedLLMProvider.CreateProvider(transcriptionProviderSettings.Name, transcriptionProviderSettings.Host, transcriptionProviderSettings.Hostname, transcriptionProviderSettings.Model, HFInferenceProvider.NONE, string.Empty, isEnterpriseConfiguration: transcriptionProviderSettings.IsEnterpriseConfiguration);
     }
     
-    private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false)
+    private static IProvider CreateProvider(this LLMProviders provider, string instanceName, Host host, string hostname, Model model, HFInferenceProvider inferenceProvider, string tokenizerPath = "", string expertProviderApiParameter = "", bool isEnterpriseConfiguration = false)
     {
         try
         {
             return provider switch
             {
-                LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.OPEN_AI => new ProviderOpenAI { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.ANTHROPIC => new ProviderAnthropic { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.MISTRAL => new ProviderMistral { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.GOOGLE => new ProviderGoogle { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.X => new ProviderX { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.DEEP_SEEK => new ProviderDeepSeek { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.ALIBABA_CLOUD => new ProviderAlibabaCloud { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.PERPLEXITY => new ProviderPerplexity { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.OPEN_ROUTER => new ProviderOpenRouter { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
 
-                LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.GROQ => new ProviderGroq { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.FIREWORKS => new ProviderFireworks { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.HUGGINGFACE => new ProviderHuggingFace(inferenceProvider, model) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
 
-                LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.SELF_HOSTED => new ProviderSelfHosted(host, hostname) { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
 
-                LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
-                LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.HELMHOLTZ => new ProviderHelmholtz { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
+                LLMProviders.GWDG => new ProviderGWDG { InstanceName = instanceName, AdditionalJsonApiParameters = expertProviderApiParameter, TokenizerPath = tokenizerPath, IsEnterpriseConfiguration = isEnterpriseConfiguration },
 
                 _ => new NoProvider(),
             };
@@ -442,4 +442,4 @@ public static class LLMProvidersExtensions
         LLMProviders.HUGGINGFACE => true,
         _ => false,
     };
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Provider/NoProvider.cs b/app/MindWork AI Studio/Provider/NoProvider.cs
index 3fc8459c..9128ad47 100644
--- a/app/MindWork AI Studio/Provider/NoProvider.cs	
+++ b/app/MindWork AI Studio/Provider/NoProvider.cs	
@@ -18,6 +18,9 @@ public class NoProvider : IProvider
     /// <inheritdoc />
     public string AdditionalJsonApiParameters { get; init; } = string.Empty;
 
+    /// <inheritdoc />
+    public string TokenizerPath { get; init; } = string.Empty;
+
     public Task<IEnumerable<Model>> GetTextModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult<IEnumerable<Model>>([]);
 
     public Task<IEnumerable<Model>> GetImageModels(string? apiKeyProvisional = null, CancellationToken token = default) => Task.FromResult<IEnumerable<Model>>([]);
@@ -45,4 +48,4 @@ public class NoProvider : IProvider
     public IReadOnlyCollection<Capability> GetModelCapabilities(Model model) => [ Capability.NONE ];
 
     #endregion
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs
index d5a6f20a..0f72c6c7 100644
--- a/app/MindWork AI Studio/Settings/EmbeddingProvider.cs	
+++ b/app/MindWork AI Studio/Settings/EmbeddingProvider.cs	
@@ -19,7 +19,8 @@ public sealed record EmbeddingProvider(
     bool IsEnterpriseConfiguration = false,
     Guid EnterpriseConfigurationPluginId = default,
     string Hostname = "http://localhost:1234",
-    Host Host = Host.NONE) : ConfigurationBaseObject, ISecretId
+    Host Host = Host.NONE,
+    string TokenizerPath = "") : ConfigurationBaseObject, ISecretId
 {
     private static readonly ILogger<EmbeddingProvider> LOGGER = Program.LOGGER_FACTORY.CreateLogger<EmbeddingProvider>();
 
@@ -96,6 +97,13 @@ public sealed record EmbeddingProvider(
             return false;
         }
 
+        var tokenizerPath = string.Empty;
+        if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead<string>(out tokenizerPath))
+        {
+            LOGGER.LogWarning($"The configured embedding provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})");
+            tokenizerPath = string.Empty;
+        }
+
         provider = new EmbeddingProvider
         {
             Num = 0, // will be set later by the PluginConfigurationObject
@@ -108,6 +116,7 @@ public sealed record EmbeddingProvider(
             EnterpriseConfigurationPluginId = configPluginId,
             Hostname = hostname,
             Host = host,
+            TokenizerPath = tokenizerPath,
         };
 
         // Handle encrypted API key if present:
diff --git a/app/MindWork AI Studio/Settings/Provider.cs b/app/MindWork AI Studio/Settings/Provider.cs
index 0ccf272c..c8276bcd 100644
--- a/app/MindWork AI Studio/Settings/Provider.cs	
+++ b/app/MindWork AI Studio/Settings/Provider.cs	
@@ -32,7 +32,8 @@ public sealed record Provider(
     string Hostname = "http://localhost:1234",
     Host Host = Host.NONE,
     HFInferenceProvider HFInferenceProvider = HFInferenceProvider.NONE,
-    string AdditionalJsonApiParameters = "") : ConfigurationBaseObject, ISecretId
+    string AdditionalJsonApiParameters = "",
+    string TokenizerPath = "") : ConfigurationBaseObject, ISecretId
 {
     private static readonly ILogger<Provider> LOGGER = Program.LOGGER_FACTORY.CreateLogger<Provider>();
     
@@ -151,6 +152,13 @@ public sealed record Provider(
             additionalJsonApiParameters = string.Empty;
         }
 
+        var tokenizerPath = string.Empty;
+        if (table.TryGetValue("TokenizerPath", out var tokenizerPathValue) && !tokenizerPathValue.TryRead<string>(out tokenizerPath))
+        {
+            LOGGER.LogWarning($"The configured provider {idx} does not contain a valid tokenizer path. (Plugin ID: {configPluginId})");
+            tokenizerPath = string.Empty;
+        }
+
         provider = new Provider
         {
             Num = 0, // will be set later by the PluginConfigurationObject
@@ -165,6 +173,7 @@ public sealed record Provider(
             Host = host,
             HFInferenceProvider = hfInferenceProvider,
             AdditionalJsonApiParameters = additionalJsonApiParameters,
+            TokenizerPath = tokenizerPath,
         };
 
         // Handle encrypted API key if present:
diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml
index 0fb62f1a..4b41800c 100644
--- a/runtime/Cargo.toml
+++ b/runtime/Cargo.toml
@@ -41,6 +41,7 @@ pptx-to-md = "0.4.0"
 tempfile = "3.27.0"
 strum_macros = "0.28.0"
 sysinfo = "0.38.4"
+tokenizers = "0.22.2"
 
 # Fixes security vulnerability downstream, where the upstream is not fixed yet:
 time = "0.3.47" # -> Rocket
diff --git a/runtime/src/tokenizer.rs b/runtime/src/tokenizer.rs
index 9fe1801e..f45416af 100644
--- a/runtime/src/tokenizer.rs
+++ b/runtime/src/tokenizer.rs
@@ -1,5 +1,4 @@
-﻿use rocket::yansi::Paint;
-use std::fs;
+﻿use std::fs;
 use std::path::{PathBuf};
 use std::sync::OnceLock;
 use rocket::{post};
@@ -75,23 +74,16 @@ fn validate_tokenizer_at_path(path: &PathBuf) -> Result<usize, TokenizerError> {
     }
 
     let tokenizer = Tokenizer::from_file(path).map_err(|e| {
-        println!("Failed to load tokenizer from {}: {}", Paint::red(&path.display()), e);
         TokenizerError::from(format!(
             "Failed to load tokenizer from '{}': {}",
             path.display(),
             e
         ))
     })?;
-    println!("Loaded tokenizer from {}", Paint::green(&path.display()));
 
     let test_string = "Hello, world! This is a test string for tokenizer validation.";
 
     let encoding = tokenizer.encode(test_string, true).map_err(|e| {
-        println!(
-            "Tokenizer failed to encode validation string for {}: {}",
-            Paint::red(&path.display()),
-            e
-        );
         TokenizerError::from(format!(
             "Tokenizer failed to encode validation string: {}",
             e
@@ -114,7 +106,7 @@ fn validate_tokenizer_at_path(path: &PathBuf) -> Result<usize, TokenizerError> {
     Ok(token_count)
 }
 
-fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Error> {
+fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<String, std::io::Error> {
     let data_dir = DATA_DIRECTORY
         .get()
         .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "DATA_DIRECTORY not initialized"))?;
@@ -124,11 +116,11 @@ fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Err
     // Delete previous model if file_path is empty
     if payload.file_path.trim().is_empty() {
         if payload.previous_model_id.trim().is_empty() {
-            return Ok(()); // Nothing to delete
+            return Ok(String::from("")); // Nothing to delete
         }
         let previous_path = base_path.join(&payload.previous_model_id);
         fs::remove_dir_all(previous_path)?;
-        return Ok(());
+        return Ok(String::from(""));
     }
 
     // Copy file
@@ -136,22 +128,28 @@ fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Err
     let source_name = source_path.file_name()
         .and_then(|n| n.to_str())
         .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid tokenizer file path"))?;
-    fs::create_dir_all(&base_path.join(&payload.model_id))?;
-    let destination_path = base_path.join(&payload.model_id).join(source_name);
-    println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display());
+    let model_path = &base_path.join(&payload.model_id);
+    let destination_path = &model_path.join(source_name);
+    // Already stored in place: nothing to move, and copying a file onto itself can truncate it or fail.
+    if source_path.eq(destination_path) { return Ok(destination_path.to_string_lossy().into_owned()); }
 
+    if model_path.exists() {
+        fs::remove_dir_all(model_path)?;
+    }
+    fs::create_dir_all(model_path)?;
+    println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display());
     let previous_path = base_path.join(&payload.previous_model_id);
 
     // Delete previous tokenizer folder if specified
     if !payload.previous_model_id.trim().is_empty() && source_path.starts_with(&previous_path){
         fs::rename(&source_path, &destination_path)?;
-        if previous_path.exists() {
+        if previous_path.exists() && !previous_path.eq(model_path) {
             fs::remove_dir_all(previous_path)?;
         }
     }else{
         fs::copy( & source_path, & destination_path)?;
     }
-    Ok(())
+    Ok(destination_path.to_string_lossy().into_owned()) // lossy conversion: never panics on a non-UTF-8 path
 }
 
 pub fn get_token_count(text: &str) -> Result<usize, TokenizerError> {
@@ -179,10 +177,10 @@ pub fn validate_tokenizer(_token: APIToken, payload: Json<TokenizerValidation>)
 pub fn store_tokenizer(_token: APIToken, payload: Json<TokenizerStorage>) -> Json<TokenizerResponse>{
     println!("Received tokenizer store request: {}, {}, {}", payload.model_id, payload.previous_model_id, payload.file_path);
     match handle_tokenizer_store(&payload) {
-        Ok(()) => Json(TokenizerResponse {
+        Ok(dest_path) => Json(TokenizerResponse {
             success: true,
             token_count: 0,
-            message: "Success".to_string(),
+            message: dest_path,
         }),
         Err(e) => Json(TokenizerResponse {
             success: false,