diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua index 06009d62..5ef3bb98 100644 --- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua +++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua @@ -1915,6 +1915,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:" + -- Start new chat in workspace '{0}' UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace '{0}'" @@ -3838,6 +3841,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2331453405"] = "(O -- Add UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2646845972"] = "Add" +-- Selected file path for the custom tokenizer +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T278585345"] = "Selected file path for the custom tokenizer" + -- No models loaded or available. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2810182573"] = "No models loaded or available." @@ -3847,6 +3853,12 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T2842060373"] = "In -- Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T290547799"] = "Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually." 
+-- Choose a custom tokenizer here +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T3787466119"] = "Choose a custom tokenizer here" + +-- For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T4126312157"] = "For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count." + -- Model selection UI_TEXT_CONTENT["AISTUDIO::DIALOGS::EMBEDDINGPROVIDERDIALOG::T416738168"] = "Model selection" @@ -5689,6 +5701,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file -- Browse AI Studio's source code on GitHub — we welcome your contributions. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions." +-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID." @@ -5929,6 +5944,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. 
UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated." diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor b/app/MindWork AI Studio/Components/ChatComponent.razor index 6ab7d977..1998aee6 100644 --- a/app/MindWork AI Studio/Components/ChatComponent.razor +++ b/app/MindWork AI Studio/Components/ChatComponent.razor @@ -34,7 +34,7 @@ - diff --git a/app/MindWork AI Studio/Components/ChatComponent.razor.cs b/app/MindWork AI Studio/Components/ChatComponent.razor.cs index c4b30a2f..0e01d58d 100644 --- a/app/MindWork AI Studio/Components/ChatComponent.razor.cs +++ b/app/MindWork AI Studio/Components/ChatComponent.razor.cs @@ -3,6 +3,7 @@ using AIStudio.Dialogs; using AIStudio.Provider; using AIStudio.Settings; using AIStudio.Settings.DataModel; +using AIStudio.Tools.Services; using Microsoft.AspNetCore.Components; using Microsoft.AspNetCore.Components.Web; @@ -44,6 +45,8 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable [Inject] private IDialogService DialogService { get; init; } = null!; + [Inject] + private RustService RustService { get; init; } = null!; [Inject] private IJSRuntime JsRuntime { get; init; } = null!; @@ -69,10 +72,12 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable private Guid currentChatThreadId = Guid.Empty; private CancellationTokenSource? cancellationTokenSource; private HashSet chatDocumentPaths = []; + private string tokenCount = "0"; + private string TokenCountMessage => $"{this.T("Estimated amount of tokens:")} {this.tokenCount}"; // Unfortunately, we need the input field reference to blur the focus away. Without // this, we cannot clear the input field. 
- private MudTextField inputField = null!; + private UserPromptComponent inputField = null!; #region Overrides of ComponentBase @@ -460,6 +465,9 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable // Was a modifier key pressed as well? var isModifier = keyEvent.AltKey || keyEvent.CtrlKey || keyEvent.MetaKey || keyEvent.ShiftKey; + if (isEnter) + await this.CalculateTokenCount(); + // Depending on the user's settings, might react to shortcuts: switch (this.SettingsManager.ConfigurationData.Chat.ShortcutSendBehavior) { @@ -596,6 +604,7 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable this.chatDocumentPaths.Clear(); await this.inputField.BlurAsync(); + this.tokenCount = "0"; // Enable the stream state for the chat component: this.isStreaming = true; @@ -978,6 +987,25 @@ public partial class ChatComponent : MSGComponentBase, IAsyncDisposable return Task.CompletedTask; } + private async Task CalculateTokenCount() + { + if (this.inputField.Value is null) + { + this.tokenCount = "0"; + return; + } + var response = await this.RustService.GetTokenCount(this.inputField.Value); + if (response is null) + return; + if (!response.Value.Success) + { + this.Logger.LogWarning($"Failed to calculate token count: {response.Value.Message}"); + return; + } + this.tokenCount = response.Value.TokenCount.ToString(); + this.StateHasChanged(); + } + #region Overrides of MSGComponentBase protected override async Task ProcessIncomingMessage(ComponentBase? sendingComponent, Event triggeredEvent, T? 
data) where T : default diff --git a/app/MindWork AI Studio/Components/SelectFile.razor b/app/MindWork AI Studio/Components/SelectFile.razor index de3971e5..b6f7d39b 100644 --- a/app/MindWork AI Studio/Components/SelectFile.razor +++ b/app/MindWork AI Studio/Components/SelectFile.razor @@ -5,12 +5,16 @@ T="string" Text="@this.File" Label="@this.Label" - ReadOnly="@true" + ReadOnly="@(!this.IsClearable)" Validation="@this.Validation" Adornment="Adornment.Start" AdornmentIcon="@Icons.Material.Filled.AttachFile" UserAttributes="@SPELLCHECK_ATTRIBUTES" Variant="Variant.Outlined" + Clearable="this.IsClearable" + Error="@this.Error" + ErrorText="@this.ErrorText" + OnClearButtonClick="@this.OnClear" /> diff --git a/app/MindWork AI Studio/Components/SelectFile.razor.cs b/app/MindWork AI Studio/Components/SelectFile.razor.cs index 91c7a667..38215b0c 100644 --- a/app/MindWork AI Studio/Components/SelectFile.razor.cs +++ b/app/MindWork AI Studio/Components/SelectFile.razor.cs @@ -2,6 +2,7 @@ using AIStudio.Tools.Rust; using AIStudio.Tools.Services; using Microsoft.AspNetCore.Components; +using Microsoft.AspNetCore.Components.Web; namespace AIStudio.Components; @@ -27,7 +28,19 @@ public partial class SelectFile : MSGComponentBase [Parameter] public Func Validation { get; set; } = _ => null; - + + [Parameter] + public bool IsClearable { get; set; } = false; + + [Parameter] + public bool Error { get; set; } = false; + + [Parameter] + public string ErrorText { get; set; } = string.Empty; + + [Parameter] + public Func OnClear { get; set; } = _ => Task.CompletedTask; + [Inject] public RustService RustService { get; set; } = null!; @@ -52,7 +65,7 @@ public partial class SelectFile : MSGComponentBase this.File = file; this.FileChanged.InvokeAsync(file); } - + private async Task OpenFileDialog() { var response = await this.RustService.SelectFile(this.FileDialogTitle, this.Filter, string.IsNullOrWhiteSpace(this.File) ? 
null : this.File); diff --git a/app/MindWork AI Studio/Components/UserPromptComponent.cs b/app/MindWork AI Studio/Components/UserPromptComponent.cs new file mode 100644 index 00000000..03139a52 --- /dev/null +++ b/app/MindWork AI Studio/Components/UserPromptComponent.cs @@ -0,0 +1,68 @@ +using Microsoft.AspNetCore.Components; +using Timer = System.Timers.Timer; + +namespace AIStudio.Components; + +/// +/// Debounced multi-line text input built on . +/// Keeps the base API while adding a debounce timer. +/// Callers can override any property as usual. +/// +public class UserPromptComponent : MudTextField +{ + [Parameter] + public TimeSpan DebounceTime { get; set; } = TimeSpan.FromMilliseconds(800); + + [Parameter] + public Func WhenTextChangedAsync { get; set; } = _ => Task.CompletedTask; + + private readonly Timer debounceTimer = new(); + private string text = string.Empty; + private string lastParameterText = string.Empty; + private string lastNotifiedText = string.Empty; + private bool isInitialized; + + protected override async Task OnInitializedAsync() + { + this.text = this.Text ?? 
string.Empty; + this.lastParameterText = this.text; + this.lastNotifiedText = this.text; + this.debounceTimer.AutoReset = false; + this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds; + this.debounceTimer.Elapsed += (_, _) => + { + this.debounceTimer.Stop(); + if (this.text == this.lastNotifiedText) + return; + + this.lastNotifiedText = this.text; + this.InvokeAsync(async () => await this.TextChanged.InvokeAsync(this.text)); + this.InvokeAsync(async () => await this.WhenTextChangedAsync(this.text)); + }; + + this.isInitialized = true; + await base.OnInitializedAsync(); + } + + protected override async Task OnParametersSetAsync() + { + // Ensure the timer uses the latest debouncing interval: + if (!this.isInitialized) + return; + + if(Math.Abs(this.debounceTimer.Interval - this.DebounceTime.TotalMilliseconds) > 1) + this.debounceTimer.Interval = this.DebounceTime.TotalMilliseconds; + + // Only sync when the parent's parameter actually changed since the last change: + if (this.Text != this.lastParameterText) + { + this.text = this.Text ?? 
string.Empty; + this.lastParameterText = this.text; + } + + this.debounceTimer.Stop(); + this.debounceTimer.Start(); + + await base.OnParametersSetAsync(); + } +} diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor index 85e6e6ef..c23a7948 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor @@ -1,5 +1,6 @@ @using AIStudio.Provider @using AIStudio.Provider.SelfHosted +@using AIStudio.Tools.Rust @inherits MSGComponentBase @@ -7,7 +8,7 @@ @* ReSharper disable once CSharpWarnings::CS8974 *@ - + @foreach (LLMProviders provider in Enum.GetValues(typeof(LLMProviders))) { if (provider.ProvideEmbeddingAPI() || provider is LLMProviders.NONE) @@ -22,7 +23,7 @@ @T("Create account") - + @if (this.DataLLMProvider.IsAPIKeyNeeded(this.DataHost)) { @@ -71,15 +72,14 @@ AdornmentColor="Color.Info" Validation="@this.ValidateManuallyModel" UserAttributes="@SPELLCHECK_ATTRIBUTES" - HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. Therefore, please enter the model name manually.")" - /> + HelperText="@T("Currently, we cannot query the embedding models for the selected provider and/or host. 
Therefore, please enter the model name manually.")"/> } else { @T("Load") - @if(this.availableModels.Count is 0) + @if (this.availableModels.Count is 0) { @T("No models loaded or available.") @@ -122,18 +122,36 @@ AdornmentIcon="@Icons.Material.Filled.Lightbulb" AdornmentColor="Color.Info" Validation="@this.providerValidation.ValidatingInstanceName" - UserAttributes="@SPELLCHECK_ATTRIBUTES" - /> - + UserAttributes="@SPELLCHECK_ATTRIBUTES"/> + @if (this.DataModel != default){ + + @T("For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.") + + + } - + @if (this.dataStoreWasAttempted) + { + + } @T("Cancel") - @if(this.IsEditing) + @if (this.IsEditing) { @T("Update") } @@ -143,4 +161,4 @@ } - \ No newline at end of file + diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs index 6520b7ee..3892d05d 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs @@ -1,3 +1,4 @@ +using AIStudio.Chat; using AIStudio.Components; using AIStudio.Provider; using AIStudio.Settings; @@ -5,7 +6,7 @@ using AIStudio.Tools.Services; using AIStudio.Tools.Validation; using Microsoft.AspNetCore.Components; - +using Microsoft.AspNetCore.Components.Web; using Host = AIStudio.Provider.SelfHosted.Host; namespace AIStudio.Dialogs; @@ -89,6 +90,11 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId private string dataAPIKeyStorageIssue = string.Empty; private string dataEditingPreviousInstanceName = string.Empty; private string dataLoadingModelsIssue = string.Empty; + private string dataFilePath = string.Empty; + private string dataCustomTokenizerValidationIssue = string.Empty; + private Task dataTokenizerValidationTask = Task.CompletedTask; + private bool dataStoreWasAttempted; + private int 
dataTokenizerValidationRevision; // We get the form reference from Blazor code to validate it manually: private MudForm form = null!; @@ -96,7 +102,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId private readonly List availableModels = new(); private readonly Encryption encryption = Program.ENCRYPTION; private readonly ProviderValidation providerValidation; - + public EmbeddingProviderDialog() { this.providerValidation = new() @@ -107,6 +113,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId GetUsedInstanceNames = () => this.UsedInstanceNames, GetHost = () => this.DataHost, IsModelProvidedManually = () => this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is Host.OLLAMA, + GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue, }; } @@ -152,10 +159,12 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId // Load the used instance names: this.UsedInstanceNames = this.SettingsManager.ConfigurationData.EmbeddingProviders.Select(x => x.Name.ToLowerInvariant()).ToList(); + Console.WriteLine($"Previous instance names: {this.dataEditingPreviousInstanceName}"); // When editing, we need to load the data: if(this.IsEditing) { this.dataEditingPreviousInstanceName = this.DataName.ToLowerInvariant(); + Console.WriteLine($"Previous instance name is '{this.dataEditingPreviousInstanceName}'"); // When using self-hosted embedding, we must copy the model name: if (this.DataLLMProvider is LLMProviders.SELF_HOSTED) @@ -211,6 +220,8 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId private async Task Store() { + this.dataStoreWasAttempted = true; + await this.dataTokenizerValidationTask; await this.form.Validate(); this.dataAPIKeyStorageIssue = string.Empty; @@ -227,6 +238,11 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId if (!this.dataIsValid) return; + var response = await 
this.RustService.StoreTokenizer(this.DataName, this.dataEditingPreviousInstanceName, this.dataFilePath); + Console.WriteLine($"Response from Rust: {response.Message}"); + if (!response.Success) + return; + // Use the data model to store the provider. // We just return this data to the parent component: var addedProviderSettings = this.CreateEmbeddingProviderSettings(); @@ -265,6 +281,58 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId } } + private Task ClearPathTokenizer(MouseEventArgs _) + { + return this.OnDataFilePathChanged(string.Empty); + } + + private async Task OnDataFilePathChanged(string filePath) + { + this.dataFilePath = filePath; + var validationRevision = ++this.dataTokenizerValidationRevision; + this.dataTokenizerValidationTask = this.ValidateCustomTokenizer(filePath, validationRevision); + await this.dataTokenizerValidationTask; + + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + if (this.dataStoreWasAttempted) + await this.form.Validate(); + else + this.form.ResetValidation(); + } + + private async Task ValidateCustomTokenizer(string filePath, int validationRevision) + { + if (string.IsNullOrWhiteSpace(filePath)) + { + if (validationRevision == this.dataTokenizerValidationRevision) + this.dataCustomTokenizerValidationIssue = string.Empty; + + return; + } + + try + { + var response = await this.RustService.ValidateTokenizer(filePath); + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + if (response.Success) + this.dataCustomTokenizerValidationIssue = string.Empty; + else + this.dataCustomTokenizerValidationIssue = T("Invalid tokenizer: ") + response.Message; + } + catch (Exception e) + { + if (validationRevision != this.dataTokenizerValidationRevision) + return; + + this.Logger.LogError(e, "Failed to validate custom tokenizer."); + this.dataCustomTokenizerValidationIssue = T("Failed to validate the selected tokenizer. 
Please try again."); + } + } + private void OnHostChanged(Host selectedHost) { // When the host changes, reset the model selection state: @@ -307,4 +375,4 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId }; private bool IsNoneProvider => this.DataLLMProvider is LLMProviders.NONE; -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Pages/Information.razor b/app/MindWork AI Studio/Pages/Information.razor index b7b9aea4..665afad6 100644 --- a/app/MindWork AI Studio/Pages/Information.razor +++ b/app/MindWork AI Studio/Pages/Information.razor @@ -290,6 +290,8 @@ + + diff --git a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua index 75c38a6d..b4c9692e 100644 --- a/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua +++ b/app/MindWork AI Studio/Plugins/languages/de-de-43065dbc-78d0-45b7-92be-f14c2926e2dc/plugin.lua @@ -1917,6 +1917,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "Der ausge -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Wähle zuerst einen Anbieter aus" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Geschätzte Anzahl an Tokens:" + -- Start new chat in workspace "{0}" UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Neuen Chat im Arbeitsbereich \"{0}\" starten" @@ -5691,6 +5694,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startprotokollda -- Browse AI Studio's source code on GitHub — we welcome your contributions. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Sehen Sie sich den Quellcode von AI Studio auf GitHub an – wir freuen uns über ihre Beiträge." 
+-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "Die Tokenizer‑Bibliothek dient als Basis‑Framework für die Integration des DeepSeek‑Tokenizers." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID-Konflikt: Die Plugin-ID stimmt nicht mit der ID der Unternehmenskonfiguration überein." @@ -5931,6 +5937,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "Dies ist eine Bib -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Verwendetes .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "Wir verwenden den DeepSeek‑Tokenizer, um die Token‑Anzahl einer Eingabe zu schätzen." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "Diese Bibliothek wird verwendet, um Sidecar-Prozesse zu verwalten und sicherzustellen, dass veraltete oder Zombie-Sidecars erkannt und beendet werden." 
diff --git a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua index 8e7c757f..544a565b 100644 --- a/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua +++ b/app/MindWork AI Studio/Plugins/languages/en-us-97dfb1ba-50c4-4440-8dfa-6575daf543c8/plugin.lua @@ -1917,6 +1917,9 @@ UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3403290862"] = "The selec -- Select a provider first UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3654197869"] = "Select a provider first" +-- Estimated amount of tokens: +UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T377990776"] = "Estimated amount of tokens:" + -- Start new chat in workspace "{0}" UI_TEXT_CONTENT["AISTUDIO::COMPONENTS::CHATCOMPONENT::T3928697643"] = "Start new chat in workspace \"{0}\"" @@ -5691,6 +5694,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1019424746"] = "Startup log file -- Browse AI Studio's source code on GitHub — we welcome your contributions. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1107156991"] = "Browse AI Studio's source code on GitHub — we welcome your contributions." +-- The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1132433749"] = "The Tokenizer library serves as the base framework for integrating the DeepSeek tokenizer." + -- ID mismatch: the plugin ID differs from the enterprise configuration ID. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T1137744461"] = "ID mismatch: the plugin ID differs from the enterprise configuration ID." 
@@ -5931,6 +5937,9 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T566998575"] = "This is a library -- Used .NET SDK UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T585329785"] = "Used .NET SDK" +-- We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate. +UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T591393704"] = "We use the DeepSeek Tokenizer to estimate the number of tokens an input will generate." + -- This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated. UI_TEXT_CONTENT["AISTUDIO::PAGES::INFORMATION::T633932150"] = "This library is used to manage sidecar processes and to ensure that stale or zombie sidecars are detected and terminated." diff --git a/app/MindWork AI Studio/Tools/Rust/FileType.cs b/app/MindWork AI Studio/Tools/Rust/FileType.cs new file mode 100644 index 00000000..c333a691 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Rust/FileType.cs @@ -0,0 +1,41 @@ +namespace AIStudio.Tools.Rust; + +/// +/// Represents a file type that can optionally contain child file types. +/// Use the static helpers , and to build readable trees. +/// +/// Display name of the type (e.g., "Document"). +/// File extensions belonging to this type (without dot). +/// Nested file types that are included when this type is selected. +public sealed record FileType(string FilterName, string[] FilterExtensions, IReadOnlyList Children) +{ + /// + /// Factory for a leaf node. + /// Example: FileType.Leaf(".NET", "cs", "razor") + /// + public static FileType Leaf(string name, params string[] extensions) => + new(name, extensions, []); + + /// + /// Factory for a parent node that only has children. + /// Example: FileType.Parent("Source Code", dotnet, java) + /// + public static FileType Parent(string name, params FileType[]? children) => + new(name, [], children ?? []); + + /// + /// Factory for a composite node that has its own extensions in addition to children. 
+ /// + public static FileType Composite(string name, string[] extensions, params FileType[] children) => + new(name, extensions, children); + + /// + /// Collects all extensions for this type, including children. + /// + public IEnumerable FlattenExtensions() + { + return this.FilterExtensions + .Concat(this.Children.SelectMany(child => child.FlattenExtensions())) + .Distinct(StringComparer.OrdinalIgnoreCase); + } +} \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs index 87a551b2..4a02608e 100644 --- a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs +++ b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs @@ -127,4 +127,4 @@ public static class FileTypes return false; } -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs b/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs new file mode 100644 index 00000000..4323f76f --- /dev/null +++ b/app/MindWork AI Studio/Tools/Rust/TokenizerHandlingResponse.cs @@ -0,0 +1,3 @@ +namespace AIStudio.Tools.Rust; + +public readonly record struct TokenizerHandlingResponse(int Success, string Response); \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs b/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs new file mode 100644 index 00000000..54f0b61c --- /dev/null +++ b/app/MindWork AI Studio/Tools/Rust/TokenizerResponse.cs @@ -0,0 +1,3 @@ +namespace AIStudio.Tools.Rust; + +public readonly record struct TokenizerResponse(bool Success, int TokenCount, string Message); diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs new file mode 100644 index 00000000..d7976198 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/RustService.Tokenizer.cs @@ -0,0 +1,69 @@ +using AIStudio.Tools.Rust; + +namespace AIStudio.Tools.Services; + +public sealed 
partial class RustService +{ + public async Task ValidateTokenizer(string filePath) + { + var result = await this.http.PostAsJsonAsync("/tokenizer/validate", new { + file_path = filePath, + }, this.jsonRustSerializerOptions); + + if (!result.IsSuccessStatusCode) + { + this.logger!.LogError($"Failed to validate the tokenizer '{result.StatusCode}'"); + return new TokenizerResponse + { + Success = false, + Message = "An error occured while sending the path to the Rust framework for validation: "+result.StatusCode, + TokenCount = 0 + }; + } + + return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions); + } + + public async Task StoreTokenizer(string modelId, string previousmodelId, string filePath) + { + Console.WriteLine($"Storing tokenizer for model '{modelId}' with previous model '{previousmodelId}' from file '{filePath}'"); + var result = await this.http.PostAsJsonAsync("/tokenizer/store", new { + model_id = modelId, + previous_model_id = previousmodelId, + file_path = filePath, + }, this.jsonRustSerializerOptions); + + if (!result.IsSuccessStatusCode) + { + this.logger!.LogError($"Failed to store the tokenizer '{result.StatusCode}'"); + return new TokenizerResponse{ + Success = false, + Message = "An error occured while sending the path to the Rust framework for storing: "+result.StatusCode, + TokenCount = 0 + }; + } + + return await result.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions); + } + + public async Task GetTokenCount(string text) + { + try + { + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5)); + var payload = new { text }; + var response = await this.http.PostAsJsonAsync("/tokenizer/count", payload, this.jsonRustSerializerOptions, cts.Token); + response.EnsureSuccessStatusCode(); + return await response.Content.ReadFromJsonAsync(this.jsonRustSerializerOptions, cancellationToken: cts.Token); + } + catch (Exception e) + { + if(this.logger is not null) + this.logger.LogError(e, "Error while getting token count 
from Rust service."); + else + Console.WriteLine($"Error while getting token count from Rust service: '{e}'."); + + return null; + } + } +} \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs b/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs index bb72feb4..595eb23e 100644 --- a/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs +++ b/app/MindWork AI Studio/Tools/Validation/ProviderValidation.cs @@ -22,6 +22,8 @@ public sealed class ProviderValidation public Func IsModelProvidedManually { get; init; } = () => false; + public Func GetCustomTokenizerValidationIssue { get; init; } = () => string.Empty; + public string? ValidatingHostname(string hostname) { if(this.GetProvider() != LLMProviders.SELF_HOSTED) @@ -120,4 +122,13 @@ public sealed class ProviderValidation return null; } -} \ No newline at end of file + + public string? ValidatingCustomTokenizer(string _) + { + var issue = this.GetCustomTokenizerValidationIssue(); + if (string.IsNullOrWhiteSpace(issue)) + return null; + + return issue; + } +} diff --git a/runtime/src/app_window.rs b/runtime/src/app_window.rs index 0066cfae..223047e9 100644 --- a/runtime/src/app_window.rs +++ b/runtime/src/app_window.rs @@ -11,7 +11,6 @@ use serde::Deserialize; use strum_macros::Display; use tauri::updater::UpdateResponse; use tauri::{FileDropEvent, GlobalShortcutManager, UpdaterEvent, RunEvent, Manager, PathResolver, Window, WindowEvent, generate_context}; -use tauri::api::dialog::blocking::FileDialogBuilder; use tokio::sync::broadcast; use tokio::time; use crate::api_token::APIToken; @@ -474,241 +473,6 @@ pub async fn install_update(_token: APIToken) { } } -/// Let the user select a directory. 
-#[post("/select/directory?", data = "<previous_directory>")] -pub fn select_directory(_token: APIToken, title: &str, previous_directory: Option<Json<PreviousDirectory>>) -> Json<DirectorySelectionResponse> { - let folder_path = match previous_directory { - Some(previous) => { - let previous_path = previous.path.as_str(); - FileDialogBuilder::new() - .set_title(title) - .set_directory(previous_path) - .pick_folder() - }, - - None => { - FileDialogBuilder::new() - .set_title(title) - .pick_folder() - }, - }; - - match folder_path { - Some(path) => { - info!("User selected directory: {path:?}"); - Json(DirectorySelectionResponse { - user_cancelled: false, - selected_directory: path.to_str().unwrap().to_string(), - }) - }, - - None => { - info!("User cancelled directory selection."); - Json(DirectorySelectionResponse { - user_cancelled: true, - selected_directory: String::from(""), - }) - }, - } -} - -#[derive(Clone, Deserialize)] -pub struct PreviousDirectory { - path: String, -} - -#[derive(Clone, Deserialize)] -pub struct FileTypeFilter { - filter_name: String, - filter_extensions: Vec<String>, -} - -#[derive(Clone, Deserialize)] -pub struct SelectFileOptions { - title: String, - previous_file: Option<PreviousFile>, - filter: Option<FileTypeFilter>, -} - -#[derive(Clone, Deserialize)] -pub struct SaveFileOptions { - title: String, - name_file: Option<PreviousFile>, - filter: Option<FileTypeFilter>, -} - -#[derive(Serialize)] -pub struct DirectorySelectionResponse { - user_cancelled: bool, - selected_directory: String, -} - -/// Let the user select a file. 
-#[post("/select/file", data = "<payload>")] -pub fn select_file(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FileSelectionResponse> { - - // Create a new file dialog builder: - let file_dialog = FileDialogBuilder::new(); - - // Set the title of the file dialog: - let file_dialog = file_dialog.set_title(&payload.title); - - // Set the file type filter if provided: - let file_dialog = apply_filter(file_dialog, &payload.filter); - - // Set the previous file path if provided: - let file_dialog = match &payload.previous_file { - Some(previous) => { - let previous_path = previous.file_path.as_str(); - file_dialog.set_directory(previous_path) - }, - - None => file_dialog, - }; - - // Show the file dialog and get the selected file path: - let file_path = file_dialog.pick_file(); - match file_path { - Some(path) => { - info!("User selected file: {path:?}"); - Json(FileSelectionResponse { - user_cancelled: false, - selected_file_path: path.to_str().unwrap().to_string(), - }) - }, - - None => { - info!("User cancelled file selection."); - Json(FileSelectionResponse { - user_cancelled: true, - selected_file_path: String::from(""), - }) - }, - } -} - -/// Let the user select some files. 
-#[post("/select/files", data = "<payload>")] -pub fn select_files(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FilesSelectionResponse> { - - // Create a new file dialog builder: - let file_dialog = FileDialogBuilder::new(); - - // Set the title of the file dialog: - let file_dialog = file_dialog.set_title(&payload.title); - - // Set the file type filter if provided: - let file_dialog = apply_filter(file_dialog, &payload.filter); - - // Set the previous file path if provided: - let file_dialog = match &payload.previous_file { - Some(previous) => { - let previous_path = previous.file_path.as_str(); - file_dialog.set_directory(previous_path) - }, - - None => file_dialog, - }; - - // Show the file dialog and get the selected file path: - let file_paths = file_dialog.pick_files(); - match file_paths { - Some(paths) => { - info!("User selected {} files.", paths.len()); - Json(FilesSelectionResponse { - user_cancelled: false, - selected_file_paths: paths.iter().map(|p| p.to_str().unwrap().to_string()).collect(), - }) - } - - None => { - info!("User cancelled file selection."); - Json(FilesSelectionResponse { - user_cancelled: true, - selected_file_paths: Vec::new(), - }) - }, - } -} - -#[post("/save/file", data = "<payload>")] -pub fn save_file(_token: APIToken, payload: Json<SaveFileOptions>) -> Json<FileSaveResponse> { - - // Create a new file dialog builder: - let file_dialog = FileDialogBuilder::new(); - - // Set the title of the file dialog: - let file_dialog = file_dialog.set_title(&payload.title); - - // Set the file type filter if provided: - let file_dialog = apply_filter(file_dialog, &payload.filter); - - // Set the previous file path if provided: - let file_dialog = match &payload.name_file { - Some(previous) => { - let previous_path = previous.file_path.as_str(); - file_dialog.set_directory(previous_path) - }, - - None => file_dialog, - }; - - // Displays the file dialogue box and select the file: - let file_path = file_dialog.save_file(); - 
match file_path { - Some(path) => { - info!("User selected file for writing operation: {path:?}"); - Json(FileSaveResponse { - user_cancelled: false, - save_file_path: path.to_str().unwrap().to_string(), - }) - }, - - None => { - info!("User cancelled file selection."); - Json(FileSaveResponse { - user_cancelled: true, - save_file_path: String::from(""), - }) - }, - } -} - -#[derive(Clone, Deserialize)] -pub struct PreviousFile { - file_path: String, -} - -/// Applies an optional file type filter to a FileDialogBuilder. -fn apply_filter(file_dialog: FileDialogBuilder, filter: &Option<FileTypeFilter>) -> FileDialogBuilder { - match filter { - Some(f) => file_dialog.add_filter( - &f.filter_name, - &f.filter_extensions.iter().map(|s| s.as_str()).collect::<Vec<&str>>(), - ), - - None => file_dialog, - } -} - -#[derive(Serialize)] -pub struct FileSelectionResponse { - user_cancelled: bool, - selected_file_path: String, -} - -#[derive(Serialize)] -pub struct FilesSelectionResponse { - user_cancelled: bool, - selected_file_paths: Vec<String>, -} - -#[derive(Serialize)] -pub struct FileSaveResponse { - user_cancelled: bool, - save_file_path: String, -} - /// Request payload for registering a global shortcut. 
#[derive(Clone, Deserialize)] pub struct RegisterShortcutRequest { diff --git a/runtime/src/file_actions.rs b/runtime/src/file_actions.rs new file mode 100644 index 00000000..333190c2 --- /dev/null +++ b/runtime/src/file_actions.rs @@ -0,0 +1,241 @@ +use log::info; +use rocket::post; +use rocket::serde::{Deserialize, Serialize}; +use rocket::serde::json::Json; +use tauri::api::dialog::blocking::FileDialogBuilder; +use crate::api_token::APIToken; + +#[derive(Clone, Deserialize)] +pub struct PreviousDirectory { + path: String, +} + +#[derive(Clone, Deserialize)] +pub struct FileTypeFilter { + filter_name: String, + filter_extensions: Vec<String>, +} + +#[derive(Clone, Deserialize)] +pub struct SelectFileOptions { + title: String, + previous_file: Option<PreviousFile>, + filter: Option<FileTypeFilter>, +} + +#[derive(Clone, Deserialize)] +pub struct SaveFileOptions { + title: String, + name_file: Option<PreviousFile>, + filter: Option<FileTypeFilter>, +} + +#[derive(Serialize)] +pub struct DirectorySelectionResponse { + user_cancelled: bool, + selected_directory: String, +} + +#[derive(Serialize)] +pub struct FileSelectionResponse { + user_cancelled: bool, + selected_file_path: String, +} + +#[derive(Serialize)] +pub struct FilesSelectionResponse { + user_cancelled: bool, + selected_file_paths: Vec<String>, +} + +#[derive(Serialize)] +pub struct FileSaveResponse { + user_cancelled: bool, + save_file_path: String, +} + +#[derive(Clone, Deserialize)] +pub struct PreviousFile { + file_path: String, +} + +/// Let the user select a directory. 
+#[post("/select/directory?<title>", data = "<previous_directory>")] +pub fn select_directory(_token: APIToken, title: &str, previous_directory: Option<Json<PreviousDirectory>>) -> Json<DirectorySelectionResponse> { + let folder_path = match previous_directory { + Some(previous) => { + let previous_path = previous.path.as_str(); + FileDialogBuilder::new() + .set_title(title) + .set_directory(previous_path) + .pick_folder() + }, + + None => { + FileDialogBuilder::new() + .set_title(title) + .pick_folder() + }, + }; + + match folder_path { + Some(path) => { + info!("User selected directory: {path:?}"); + Json(DirectorySelectionResponse { + user_cancelled: false, + selected_directory: path.to_str().unwrap().to_string(), + }) + }, + + None => { + info!("User cancelled directory selection."); + Json(DirectorySelectionResponse { + user_cancelled: true, + selected_directory: String::from(""), + }) + }, + } +} + +/// Let the user select a file. +#[post("/select/file", data = "<payload>")] +pub fn select_file(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FileSelectionResponse> { + + // Create a new file dialog builder: + let file_dialog = FileDialogBuilder::new(); + + // Set the title of the file dialog: + let file_dialog = file_dialog.set_title(&payload.title); + + // Set the file type filter if provided: + let file_dialog = apply_filter(file_dialog, &payload.filter); + + // Set the previous file path if provided: + let file_dialog = match &payload.previous_file { + Some(previous) => { + let previous_path = previous.file_path.as_str(); + file_dialog.set_directory(previous_path) + }, + + None => file_dialog, + }; + + // Show the file dialog and get the selected file path: + let file_path = file_dialog.pick_file(); + match file_path { + Some(path) => { + info!("User selected file: {path:?}"); + Json(FileSelectionResponse { + user_cancelled: false, + selected_file_path: path.to_str().unwrap().to_string(), + }) + }, + + None => { + info!("User cancelled file 
selection."); + Json(FileSelectionResponse { + user_cancelled: true, + selected_file_path: String::from(""), + }) + }, + } +} + +/// Let the user select some files. +#[post("/select/files", data = "<payload>")] +pub fn select_files(_token: APIToken, payload: Json<SelectFileOptions>) -> Json<FilesSelectionResponse> { + + // Create a new file dialog builder: + let file_dialog = FileDialogBuilder::new(); + + // Set the title of the file dialog: + let file_dialog = file_dialog.set_title(&payload.title); + + // Set the file type filter if provided: + let file_dialog = apply_filter(file_dialog, &payload.filter); + + // Set the previous file path if provided: + let file_dialog = match &payload.previous_file { + Some(previous) => { + let previous_path = previous.file_path.as_str(); + file_dialog.set_directory(previous_path) + }, + + None => file_dialog, + }; + + // Show the file dialog and get the selected file path: + let file_paths = file_dialog.pick_files(); + match file_paths { + Some(paths) => { + info!("User selected {} files.", paths.len()); + Json(FilesSelectionResponse { + user_cancelled: false, + selected_file_paths: paths.iter().map(|p| p.to_str().unwrap().to_string()).collect(), + }) + } + + None => { + info!("User cancelled file selection."); + Json(FilesSelectionResponse { + user_cancelled: true, + selected_file_paths: Vec::new(), + }) + }, + } +} + +#[post("/save/file", data = "<payload>")] +pub fn save_file(_token: APIToken, payload: Json<SaveFileOptions>) -> Json<FileSaveResponse> { + + // Create a new file dialog builder: + let file_dialog = FileDialogBuilder::new(); + + // Set the title of the file dialog: + let file_dialog = file_dialog.set_title(&payload.title); + + // Set the file type filter if provided: + let file_dialog = apply_filter(file_dialog, &payload.filter); + + // Set the previous file path if provided: + let file_dialog = match &payload.name_file { + Some(previous) => { + let previous_path = previous.file_path.as_str(); + 
file_dialog.set_directory(previous_path) + }, + + None => file_dialog, + }; + + // Displays the file dialogue box and select the file: + let file_path = file_dialog.save_file(); + match file_path { + Some(path) => { + info!("User selected file for writing operation: {path:?}"); + Json(FileSaveResponse { + user_cancelled: false, + save_file_path: path.to_str().unwrap().to_string(), + }) + }, + + None => { + info!("User cancelled file selection."); + Json(FileSaveResponse { + user_cancelled: true, + save_file_path: String::from(""), + }) + }, + } +} + +/// Applies an optional file type filter to a FileDialogBuilder. +fn apply_filter(file_dialog: FileDialogBuilder, filter: &Option<FileTypeFilter>) -> FileDialogBuilder { + match filter { + Some(f) => file_dialog.add_filter( + &f.filter_name, + &f.filter_extensions.iter().map(|s| s.as_str()).collect::<Vec<&str>>(), + ), + + None => file_dialog, + } +} \ No newline at end of file diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index 1b13e099..d4366e3e 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -17,4 +17,6 @@ pub mod qdrant; pub mod certificate_factory; pub mod runtime_api_token; pub mod stale_process_cleanup; -mod sidecar_types; \ No newline at end of file +mod sidecar_types; +pub mod tokenizer; +pub mod file_actions; \ No newline at end of file diff --git a/runtime/src/main.rs b/runtime/src/main.rs index 00a7ba90..a210de54 100644 --- a/runtime/src/main.rs +++ b/runtime/src/main.rs @@ -11,7 +11,7 @@ use mindwork_ai_studio::environment::is_dev; use mindwork_ai_studio::log::init_logging; use mindwork_ai_studio::metadata::MetaData; use mindwork_ai_studio::runtime_api::start_runtime_api; - +use mindwork_ai_studio::tokenizer::{init_tokenizer}; #[tokio::main] async fn main() { @@ -43,8 +43,12 @@ async fn main() { info!("Running in production mode."); } + if let Err(e) = init_tokenizer() { + warn!(Source = "Tokenizer"; "Error during the initialisation of the tokenizer: {}", e); + } + 
generate_runtime_certificate(); start_runtime_api(); start_tauri(); -} \ No newline at end of file +} diff --git a/runtime/src/runtime_api.rs b/runtime/src/runtime_api.rs index 64bc8174..b3401db9 100644 --- a/runtime/src/runtime_api.rs +++ b/runtime/src/runtime_api.rs @@ -72,10 +72,10 @@ pub fn start_runtime_api() { crate::app_window::get_event_stream, crate::app_window::check_for_update, crate::app_window::install_update, - crate::app_window::select_directory, - crate::app_window::select_file, - crate::app_window::select_files, - crate::app_window::save_file, + crate::file_actions::select_directory, + crate::file_actions::select_file, + crate::file_actions::select_files, + crate::file_actions::save_file, crate::secret::get_secret, crate::secret::store_secret, crate::secret::delete_secret, @@ -89,6 +89,9 @@ pub fn start_runtime_api() { crate::file_data::extract_data, crate::log::get_log_paths, crate::log::log_event, + crate::tokenizer::token_count, + crate::tokenizer::validate_tokenizer, + crate::tokenizer::store_tokenizer, crate::app_window::register_shortcut, crate::app_window::validate_shortcut, crate::app_window::suspend_shortcuts, diff --git a/runtime/src/tokenizer.rs b/runtime/src/tokenizer.rs new file mode 100644 index 00000000..9fe1801e --- /dev/null +++ b/runtime/src/tokenizer.rs @@ -0,0 +1,194 @@ +use rocket::yansi::Paint; +use std::fs; +use std::path::{PathBuf}; +use std::sync::OnceLock; +use rocket::{post}; +use rocket::serde::json::Json; +use rocket::serde::Serialize; +use serde::Deserialize; +use tokenizers::Error; +use tokenizers::tokenizer::{Tokenizer, Error as TokenizerError}; +use crate::api_token::APIToken; +use crate::environment::{DATA_DIRECTORY}; + +static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new(); + +#[derive(Deserialize)] +pub struct SetTokenText { + pub text: String, +} + +#[derive(Clone, Deserialize)] +pub struct TokenizerStorage { + model_id: String, + previous_model_id: String, + file_path: String, +} + +#[derive(Clone, 
Deserialize)] +pub struct TokenizerValidation { + file_path: String, +} + +#[derive(Serialize)] +pub struct TokenizerResponse { + success: bool, + token_count: usize, + message: String, +} + +impl From<Result<usize, TokenizerError>> for TokenizerResponse { + fn from(result: Result<usize, TokenizerError>) -> Self { + match result { + Ok(count) => TokenizerResponse { + success: true, + token_count: count, + message: "Success".to_string(), + }, + Err(e) => TokenizerResponse { + success: false, + token_count: 0, + message: e.to_string(), + }, + } + } +} + +pub fn init_tokenizer() -> Result<(), Error>{ + let mut target_dir = PathBuf::from("target"); + target_dir.push("tokenizers"); + fs::create_dir_all(&target_dir)?; + + let mut local_tokenizer_path = target_dir.clone(); + local_tokenizer_path.push("tokenizer.json"); + + TOKENIZER.set(Tokenizer::from_file(local_tokenizer_path)?).expect("Could not set the tokenizer."); + Ok(()) +} + +fn validate_tokenizer_at_path(path: &PathBuf) -> Result<usize, TokenizerError> { + if !path.is_file() { + return Err(TokenizerError::from(format!( + "Tokenizer file was not found: {}", + path.display() + ))); + } + + let tokenizer = Tokenizer::from_file(path).map_err(|e| { + println!("Failed to load tokenizer from {}: {}", Paint::red(&path.display()), e); + TokenizerError::from(format!( + "Failed to load tokenizer from '{}': {}", + path.display(), + e + )) + })?; + println!("Loaded tokenizer from {}", Paint::green(&path.display())); + + let test_string = "Hello, world! This is a test string for tokenizer validation."; + + let encoding = tokenizer.encode(test_string, true).map_err(|e| { + println!( + "Tokenizer failed to encode validation string for {}: {}", + Paint::red(&path.display()), + e + ); + TokenizerError::from(format!( + "Tokenizer failed to encode validation string: {}", + e + )) + })?; + let token_count = encoding.len(); + + if token_count == 0 { + return Err(TokenizerError::from( + "Tokenizer produced 0 tokens for test string. 
The tokenizer is likely invalid or misconfigured." + )); + } + + if encoding.get_tokens().iter().any(|t| t.is_empty()) { + return Err(TokenizerError::from( + "Tokenizer produced empty tokens. The tokenizer is invalid." + )); + } + + Ok(token_count) +} + +fn handle_tokenizer_store(payload: &TokenizerStorage) -> Result<(), std::io::Error> { + let data_dir = DATA_DIRECTORY + .get() + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "DATA_DIRECTORY not initialized"))?; + + let base_path = PathBuf::from(data_dir).join("tokenizers"); + + // Delete previous model if file_path is empty + if payload.file_path.trim().is_empty() { + if payload.previous_model_id.trim().is_empty() { + return Ok(()); // Nothing to delete + } + let previous_path = base_path.join(&payload.previous_model_id); + fs::remove_dir_all(previous_path)?; + return Ok(()); + } + + // Copy file + let source_path = PathBuf::from(&payload.file_path); + let source_name = source_path.file_name() + .and_then(|n| n.to_str()) + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid tokenizer file path"))?; + fs::create_dir_all(&base_path.join(&payload.model_id))?; + let destination_path = base_path.join(&payload.model_id).join(source_name); + println!("Moving tokenizer file from {} to {}", source_path.display(), destination_path.display()); + + let previous_path = base_path.join(&payload.previous_model_id); + + // Delete previous tokenizer folder if specified + if !payload.previous_model_id.trim().is_empty() && source_path.starts_with(&previous_path){ + fs::rename(&source_path, &destination_path)?; + if previous_path.exists() { + fs::remove_dir_all(previous_path)?; + } + }else{ + fs::copy( & source_path, & destination_path)?; + } + Ok(()) +} + +pub fn get_token_count(text: &str) -> Result<usize, TokenizerError> { + if text.trim().is_empty() { + return Err(TokenizerError::from("Input text is empty")); + } + + let tokenizer = TOKENIZER.get().cloned().ok_or_else(|| 
TokenizerError::from("Tokenizer not initialized"))?;
+    let enc = tokenizer.encode(text, true)?;
+    Ok(enc.len())
+}
+
+/// Route: counts the tokens of the posted text using the global tokenizer.
+#[post("/tokenizer/count", data = "<req>")]
+pub fn token_count(_token: APIToken, req: Json<SetTokenText>) -> Json<TokenizerResponse> {
+    Json(get_token_count(&req.text).into())
+}
+
+/// Route: validates that the file at the posted path is a loadable tokenizer.
+/// NOTE(review): this prints a user-provided path to stdout — consider the log macros instead.
+#[post("/tokenizer/validate", data = "<payload>")]
+pub fn validate_tokenizer(_token: APIToken, payload: Json<TokenizerValidation>) -> Json<TokenizerResponse>{
+    println!("Received tokenizer validation request: {}", payload.file_path);
+    Json(validate_tokenizer_at_path(&PathBuf::from(payload.file_path.clone())).into())
+}
+
+/// Route: stores or replaces the tokenizer file for a model.
+/// The response's `token_count` is always 0 for this endpoint; only `success`/`message` matter.
+#[post("/tokenizer/store", data = "<payload>")]
+pub fn store_tokenizer(_token: APIToken, payload: Json<TokenizerStorage>) -> Json<TokenizerResponse>{
+    println!("Received tokenizer store request: {}, {}, {}", payload.model_id, payload.previous_model_id, payload.file_path);
+    match handle_tokenizer_store(&payload) {
+        Ok(()) => Json(TokenizerResponse {
+            success: true,
+            token_count: 0,
+            message: "Success".to_string(),
+        }),
+        Err(e) => Json(TokenizerResponse {
+            success: false,
+            token_count: 0,
+            message: e.to_string(),
+        }),
+    }
+
+}
\ No newline at end of file