From 637bf9701bd7f2acd334a2003b1a08e7903ce794 Mon Sep 17 00:00:00 2001 From: Paul Koudelka Date: Wed, 13 May 2026 18:13:34 +0200 Subject: [PATCH] added manual refresh and cleaned up code --- .../Assistants/I18N/allTexts.lua | 31 +- .../Dialogs/EmbeddingProviderDialog.razor | 6 +- .../Dialogs/EmbeddingProviderDialog.razor.cs | 20 +- .../Dialogs/ProviderDialog.razor.cs | 12 +- .../Settings/SettingsDialogDataSources.razor | 12 +- .../SettingsDialogDataSources.razor.cs | 33 + .../Layout/MainLayout.razor | 4 +- .../Layout/MainLayout.razor.cs | 12 +- app/MindWork AI Studio/Program.cs | 2 +- .../Settings/DataModel/Data.cs | 2 + .../DataModel/DataDataSourceIndexing.cs | 9 + .../Tools/Databases/EmbeddingStore.cs | 7 +- .../Qdrant/QdrantClientImplementation.cs | 27 +- .../Tools/Rust/FileTypes.cs | 2 +- .../Services/DataSourceEmbeddingModels.cs | 86 ++ .../DataSourceEmbeddingService.Files.cs | 242 ++++++ .../DataSourceEmbeddingService.State.cs | 89 +++ .../DataSourceEmbeddingService.Watchers.cs | 226 ++++++ .../Services/DataSourceEmbeddingService.cs | 740 ++++-------------- 19 files changed, 920 insertions(+), 642 deletions(-) create mode 100644 app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs create mode 100644 app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs create mode 100644 app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs create mode 100644 app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs create mode 100644 app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua index bc4d79c0..cdf2f458 100644 --- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua +++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua @@ -4837,15 +4837,24 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T108494 -- Are you sure you want to delete the data source '{0}' of type {1}? UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1096979935"] = "Are you sure you want to delete the data source '{0}' of type {1}?" +-- Automatic local data source refresh +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1208397349"] = "Automatic local data source refresh" + -- Edit Local Directory Data Source UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1215599168"] = "Edit Local Directory Data Source" +-- Refresh +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T135637716"] = "Refresh" + -- Add Local Directory as Data Source UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1454193397"] = "Add Local Directory as Data Source" -- Delete UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1469573738"] = "Delete" +-- Refresh all +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1503082343"] = "Refresh all" + -- External (ERI) UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1652430727"] = "External (ERI)" @@ -4900,6 +4909,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T352566 -- No data sources configured yet. UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3549650120"] = "No data sources configured yet." +-- Local data sources refresh when files change. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3687976654"] = "Local data sources refresh when files change." + -- Actions UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3865031940"] = "Actions" @@ -4912,6 +4924,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T590005 -- External Data (ERI-Server v1) UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T774473996"] = "External Data (ERI-Server v1)" +-- Local data sources refresh only when triggered manually. +UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T854231603"] = "Local data sources refresh only when triggered manually." + -- Local Directory UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T926703547"] = "Local Directory" @@ -7571,25 +7586,19 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source like p UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document" -- Running -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1160324588"] = "Running" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1160324588"] = "Running" -- Idle -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1168775091"] = "Idle" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1168775091"] = "Idle" -- Needs attention -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1566837660"] = "Needs attention" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1566837660"] = "Needs attention" -- Queued -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2655222900"] = "Queued" - --- Embedding -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2838542994"] = "Embedding" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T2655222900"] = "Queued" -- Completed -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T3968379570"] = "Completed" - --- Embeddings -UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T951463987"] = "Embeddings" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T3968379570"] = "Completed" -- Pandoc Installation UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc Installation" diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor index c23a7948..7968e410 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor @@ -123,7 +123,8 @@ AdornmentColor="Color.Info" Validation="@this.providerValidation.ValidatingInstanceName" UserAttributes="@SPELLCHECK_ATTRIBUTES"/> - @if (this.DataModel != default){ + @if (this.DataLLMProvider != LLMProviders.NONE) + { @T("For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.") @@ -137,8 +138,7 @@ Error="@(!string.IsNullOrWhiteSpace(this.dataCustomTokenizerValidationIssue))" ErrorText="@(this.dataCustomTokenizerValidationIssue)" Validation="@this.providerValidation.ValidatingCustomTokenizer" - OnClear = "@this.ClearPathTokenizer" - /> + OnClear="@this.ClearPathTokenizer" /> } @if (this.dataStoreWasAttempted) diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs index 10f4e661..20eabb28 100644 --- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs @@ -2,6 +2,7 @@ using AIStudio.Chat; using AIStudio.Components; using AIStudio.Provider; using AIStudio.Settings; +using AIStudio.Tools.Rust; using AIStudio.Tools.Services; using AIStudio.Tools.Validation; @@ -115,7 +116,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId GetPreviousInstanceName = () => this.dataEditingPreviousInstanceName, GetUsedInstanceNames = () => this.UsedInstanceNames, GetHost = () => this.DataHost, - IsModelProvidedManually = () => this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is Host.OLLAMA, + IsModelProvidedManually = () => this.DataLLMProvider.IsEmbeddingModelProvidedManually(this.DataHost), GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue, }; } @@ -126,7 +127,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId Model model = default; if(this.DataLLMProvider is LLMProviders.SELF_HOSTED) { - if (this.DataHost is Host.OLLAMA) + if (this.DataLLMProvider.IsEmbeddingModelProvidedManually(this.DataHost)) model = new Model(this.dataManuallyModel, null); else if (this.DataHost is Host.LM_STUDIO) model = this.DataModel; @@ -176,7 +177,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId // // We cannot load the API key for self-hosted providers: // - if (this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is not Host.OLLAMA) + if (this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is not Host.OLLAMA && this.DataHost is not Host.VLLM) { await this.ReloadModels(); await base.OnInitializedAsync(); @@ -241,10 +242,10 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId if (!this.dataIsValid) return; - var response = await this.RustService.StoreTokenizer(TokenizerModelId.ForEmbeddingProviderId(this.DataId), this.dataFilePath); + var response = await this.StoreOrDeleteTokenizerAsync(); if (!response.Success) { - this.dataCustomTokenizerValidationIssue = response.Message; + this.dataCustomTokenizerValidationIssue = string.IsNullOrWhiteSpace(response.Message) ? string.Empty : response.Message; await this.form.Validate(); return; } @@ -340,6 +341,15 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId } } + private Task StoreOrDeleteTokenizerAsync() + { + var tokenizerId = TokenizerModelId.ForEmbeddingProviderId(this.DataId); + if (string.IsNullOrWhiteSpace(this.dataFilePath)) + return this.RustService.DeleteTokenizer(tokenizerId); + + return this.RustService.StoreTokenizer(tokenizerId, this.dataFilePath); + } + private void OnHostChanged(Host selectedHost) { // When the host changes, reset the model selection state: diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs index cd5e9bdc..d04d9a5c 100644 --- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs +++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs @@ -4,6 +4,7 @@ using System.Text.Json; using AIStudio.Components; using AIStudio.Provider; using AIStudio.Provider.HuggingFace; +using AIStudio.Tools.Rust; using AIStudio.Tools.Services; using AIStudio.Tools.Validation; @@ -268,7 +269,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId if (!this.dataIsValid) return; - var tokenizerResponse = await this.RustService.StoreTokenizer(TokenizerModelId.ForProviderId(this.DataId), this.dataFilePath); + var tokenizerResponse = await this.StoreOrDeleteTokenizerAsync(); if (!tokenizerResponse.Success) { this.dataCustomTokenizerValidationIssue = tokenizerResponse.Message; @@ -367,6 +368,15 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId } } + private Task StoreOrDeleteTokenizerAsync() + { + var tokenizerId = TokenizerModelId.ForProviderId(this.DataId); + if (string.IsNullOrWhiteSpace(this.dataFilePath)) + return this.RustService.DeleteTokenizer(tokenizerId); + + return this.RustService.StoreTokenizer(tokenizerId, this.dataFilePath); + } + private void OnHostChanged(Host selectedHost) { // When the host changes, reset the model selection state: diff --git a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor index 74b15fdb..1f867e2a 100644 --- a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor +++ b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor @@ -14,6 +14,13 @@ @T("You might configure different data sources. A data source can include one file, all files in a directory, or data from your company. Later, you can incorporate these data sources as needed when the AI requires this data to complete a certain task.") + + + + @T("Refresh all") + + + @@ -38,6 +45,9 @@ + + @T("Refresh") + @T("Edit") @@ -73,4 +83,4 @@ @T("Close") - \ No newline at end of file + diff --git a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs index af71e3a9..020a87d6 100644 --- a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs +++ b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs @@ -28,6 +28,39 @@ public partial class SettingsDialogDataSources : SettingsDialogBase return T("Unknown"); } + + private bool CanRefreshDataSource(IDataSource dataSource) + { + return dataSource is DataSourceLocalDirectory or DataSourceLocalFile; + } + + private bool HasRefreshableDataSources() + { + return this.SettingsManager.ConfigurationData.DataSources.Any(this.CanRefreshDataSource); + } + + private async Task AutomaticRefreshChanged(bool enabled) + { + this.SettingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh = enabled; + await this.SettingsManager.StoreSettings(); + this.DataSourceEmbeddingService.RefreshAutomaticWatchers(); + await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); + } + + private async Task RefreshAllDataSources() + { + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); + await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); + } + + private async Task RefreshDataSource(IDataSource dataSource) + { + if (!this.CanRefreshDataSource(dataSource)) + return; + + await this.DataSourceEmbeddingService.QueueDataSourceAsync(dataSource); + await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); + } private async Task AddDataSource(DataSourceType type) { diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor b/app/MindWork AI Studio/Layout/MainLayout.razor index 7d4f28e9..aad5b4c8 100644 --- a/app/MindWork AI Studio/Layout/MainLayout.razor +++ b/app/MindWork AI Studio/Layout/MainLayout.razor @@ -25,7 +25,7 @@ - @if (this.embeddingOverview.IsVisible) + @if (this.showEmbeddingStatusIcon) { @@ -64,7 +64,7 @@ - @if (this.embeddingOverview.IsVisible) + @if (this.showEmbeddingStatusIcon) { @if (this.SettingsManager.ConfigurationData.App.NavigationBehavior is NavBehavior.NEVER_EXPAND_USE_TOOLTIPS) diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor.cs b/app/MindWork AI Studio/Layout/MainLayout.razor.cs index f96cc2d2..f99446ff 100644 --- a/app/MindWork AI Studio/Layout/MainLayout.razor.cs +++ b/app/MindWork AI Studio/Layout/MainLayout.razor.cs @@ -60,9 +60,10 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan private bool startupCompleted; private readonly SemaphoreSlim mandatoryInfoDialogSemaphore = new(1, 1); - private DataSourceEmbeddingOverview embeddingOverview = new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, string.Empty); + private DataSourceEmbeddingOverview embeddingOverview = new(false, DataSourceEmbeddingState.COMPLETED, 0, 0, 0); private IReadOnlyCollection navItems = []; - private NavBarItem embeddingItem = new NavBarItem(string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, false); + private NavBarItem embeddingItem = new (string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, false); + private bool showEmbeddingStatusIcon = false; #region Overrides of ComponentBase @@ -93,7 +94,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan // Ensure that all settings are loaded: await this.SettingsManager.LoadSettings(); - await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesIfAutomaticRefreshAsync(); // Register this component with the message bus: this.MessageBus.RegisterComponent(this); @@ -325,6 +326,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan private void LoadEmbeddingItem() { this.embeddingOverview = this.DataSourceEmbeddingService.GetOverview(); + this.showEmbeddingStatusIcon = this.embeddingOverview.IsVisible; var palette = this.ColorTheme.GetCurrentPalette(this.SettingsManager); (string icon, string lightcolor, string darkcolor) embeddingIcon = this.embeddingOverview.State switch { @@ -345,7 +347,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan DataSourceEmbeddingState.FAILED => this.embeddingOverview.FailedFiles > 0 ? string.Format(T("Some embeddings failed. {0} file(s) need attention."), this.embeddingOverview.FailedFiles) : T("Some embeddings failed and need attention."), - _ => this.embeddingOverview.NavLabel, + _ => string.Empty }; private async Task ShowUpdateDialog() @@ -508,4 +510,4 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan } #endregion -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Program.cs b/app/MindWork AI Studio/Program.cs index db4c18af..04233468 100644 --- a/app/MindWork AI Studio/Program.cs +++ b/app/MindWork AI Studio/Program.cs @@ -88,7 +88,7 @@ internal sealed class Program var embeddingStoreConfig = await rust.GetEmbeddingStoreConfiguration(EmbeddingStoreKind.QDRANT_REMOTE); - EmbeddingStore embeddingStore = EmbeddingStoreFactory.Create(embeddingStoreConfig); + var embeddingStore = EmbeddingStoreFactory.Create(embeddingStoreConfig); var builder = WebApplication.CreateBuilder(); builder.WebHost.ConfigureKestrel(kestrelServerOptions => diff --git a/app/MindWork AI Studio/Settings/DataModel/Data.cs b/app/MindWork AI Studio/Settings/DataModel/Data.cs index b8f429cc..31581611 100644 --- a/app/MindWork AI Studio/Settings/DataModel/Data.cs +++ b/app/MindWork AI Studio/Settings/DataModel/Data.cs @@ -124,6 +124,8 @@ public sealed class Data public DataAgentRetrievalContextValidation AgentRetrievalContextValidation { get; init; } = new(); + public DataDataSourceIndexing DataSourceIndexing { get; init; } = new(); + public DataAssistantPluginAudit AssistantPluginAudit { get; init; } = new(x => x.AssistantPluginAudit); public DataAgenda Agenda { get; init; } = new(); diff --git a/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs b/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs new file mode 100644 index 00000000..fdcb8997 --- /dev/null +++ b/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs @@ -0,0 +1,9 @@ +namespace AIStudio.Settings.DataModel; + +public sealed class DataDataSourceIndexing +{ + /// + /// Whether local data source embeddings should refresh automatically when files change. + /// + public bool AutomaticRefresh { get; set; } = true; +} diff --git a/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs index 4993e633..abbae8ef 100644 --- a/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs +++ b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs @@ -35,14 +35,15 @@ public abstract class EmbeddingStore(string name, string path) { string[] suffixes = { "B", "KB", "MB", "GB", "TB", "PB" }; int suffixIndex = 0; + double convertedSize = size; - while (size >= 1024 && suffixIndex < suffixes.Length - 1) + while (convertedSize >= 1024 && suffixIndex < suffixes.Length - 1) { - size /= 1024; + convertedSize /= 1024; suffixIndex++; } - return $"{size:0##} {suffixes[suffixIndex]}"; + return $"{convertedSize:0.##} {suffixes[suffixIndex]}"; } public void SetLogger(ILogger logService) diff --git a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs index 0a7fb022..a4d47842 100644 --- a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs +++ b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs @@ -1,5 +1,6 @@ using Qdrant.Client; using Qdrant.Client.Grpc; +using Grpc.Core; using AIStudio.Tools.PluginSystem; using static Qdrant.Client.Grpc.Conditions; @@ -104,14 +105,32 @@ public class QdrantClientImplementation : EmbeddingStore return this.GrpcClient.UpsertAsync(collectionName, qdrantPoints, true, null, null, token); } - public override Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token) + public override async Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token) { - return this.GrpcClient.DeleteAsync(collectionName, MatchKeyword("file_path", filePath), true, null, null, token); + try + { + await this.GrpcClient.DeleteAsync(collectionName, MatchKeyword("file_path", filePath), true, null, null, token); + } + catch (RpcException exception) when (exception.StatusCode is StatusCode.NotFound) + { + return; + } } - public override Task DeleteEmbeddingStore(string collectionName, CancellationToken token) + public override async Task DeleteEmbeddingStore(string collectionName, CancellationToken token) { - return this.GrpcClient.DeleteCollectionAsync(collectionName, cancellationToken: token); + var exists = await this.GrpcClient.CollectionExistsAsync(collectionName, token); + if (!exists) + return; + + try + { + await this.GrpcClient.DeleteCollectionAsync(collectionName, cancellationToken: token); + } + catch (RpcException exception) when (exception.StatusCode is StatusCode.NotFound) + { + return; + } } public override void Dispose() => this.GrpcClient.Dispose(); diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs index 4a02608e..623a03b0 100644 --- a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs +++ b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs @@ -59,7 +59,7 @@ public static class FileTypes // Media hierarchy public static readonly FileTypeFilter IMAGE = FileTypeFilter.Leaf(TB("Image"), - "jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic"); + "jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic", "avif"); public static readonly FileTypeFilter AUDIO = FileTypeFilter.Leaf(TB("Audio"), "mp3", "wav", "wave", "aac", "flac", "ogg", "m4a", "wma", "alac", "aiff", "m4b"); public static readonly FileTypeFilter VIDEO = FileTypeFilter.Leaf(TB("Video"), diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs new file mode 100644 index 00000000..be016cad --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs @@ -0,0 +1,86 @@ +using AIStudio.Settings.DataModel; +using AIStudio.Tools.PluginSystem; + +namespace AIStudio.Tools.Services; + +public sealed record DataSourceEmbeddingOverview( + bool IsVisible, + DataSourceEmbeddingState State, + int IndexedFiles, + int TotalFiles, + int FailedFiles); + +public enum DataSourceEmbeddingState +{ + IDLE, + QUEUED, + RUNNING, + COMPLETED, + FAILED, +} + +public sealed record DataSourceEmbeddingStatus( + string DataSourceId, + string DataSourceName, + DataSourceType DataSourceType, + DataSourceEmbeddingState State, + int TotalFiles, + int IndexedFiles, + int FailedFiles, + string CurrentFile, + string LastError) +{ + private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService)); + + public int ProgressPercent => this.TotalFiles <= 0 ? 0 : Math.Clamp((int)Math.Round(this.IndexedFiles * 100d / this.TotalFiles), 0, 100); + + public string StateLabel => this.State switch + { + DataSourceEmbeddingState.QUEUED => TB("Queued"), + DataSourceEmbeddingState.RUNNING => TB("Running"), + DataSourceEmbeddingState.COMPLETED => TB("Completed"), + DataSourceEmbeddingState.FAILED => TB("Needs attention"), + _ => TB("Idle") + }; + + public int SortOrder => this.State switch + { + DataSourceEmbeddingState.RUNNING => 0, + DataSourceEmbeddingState.QUEUED => 1, + DataSourceEmbeddingState.FAILED => 2, + DataSourceEmbeddingState.COMPLETED => 3, + _ => 4, + }; +} + +public sealed class FileEnumerationResult +{ + public List Files { get; } = []; + + public int FailedFiles { get; set; } + + public string LastError { get; set; } = string.Empty; +} + +public sealed class PersistedEmbeddingState +{ + public Dictionary DataSources { get; init; } = new(StringComparer.OrdinalIgnoreCase); +} + +public sealed class DataSourceEmbeddingManifest +{ + public string EmbeddingProviderId { get; set; } = string.Empty; + + public string EmbeddingSignature { get; set; } = string.Empty; + + public int VectorSize { get; set; } + + public Dictionary Files { get; init; } = new(StringComparer.OrdinalIgnoreCase); +} + +public sealed record EmbeddedFileRecord( + string Fingerprint, + long FileSize, + DateTime LastWriteUtc, + DateTime EmbeddedAtUtc, + int ChunkCount); diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs new file mode 100644 index 00000000..80ab1ef4 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs @@ -0,0 +1,242 @@ +using System.Security.Cryptography; +using System.Text; + +using AIStudio.Settings; +using AIStudio.Settings.DataModel; +using AIStudio.Tools.PluginSystem; +using AIStudio.Tools.Rust; + +namespace AIStudio.Tools.Services; + +public sealed partial class DataSourceEmbeddingService +{ + private static readonly string[] ADDITIONAL_RAG_FILE_EXTENSIONS = ["csv", "tsv", "ods", "xlsm", "xlsb", "xla", "xlam"]; + private static readonly string[] SKIPPED_RAG_FILE_EXTENSIONS = ["lnk"]; + + private async IAsyncEnumerable StreamEmbeddingChunksAsync(string filePath, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken token) + { + if (this.IsImageFilePath(filePath)) + { + yield return this.BuildImageIndexText(filePath); + yield break; + } + + var currentChunk = new StringBuilder(); + + await foreach (var segment in this.rustService.StreamArbitraryFileData(filePath, token: token)) + { + var normalized = NormalizeChunkSegment(segment); + if (string.IsNullOrWhiteSpace(normalized)) + continue; + + if (currentChunk.Length > 0 && currentChunk.Length + normalized.Length + Environment.NewLine.Length > MAX_CHUNK_LENGTH) + { + if (currentChunk.Length >= MIN_CHUNK_LENGTH) + { + var chunk = currentChunk.ToString().Trim(); + if (!string.IsNullOrWhiteSpace(chunk)) + yield return chunk; + + var overlap = chunk.Length > CHUNK_OVERLAP_LENGTH + ? chunk[^CHUNK_OVERLAP_LENGTH..] + : chunk; + + currentChunk.Clear(); + currentChunk.Append(overlap); + currentChunk.AppendLine(); + } + else + { + currentChunk.AppendLine(); + } + } + + currentChunk.Append(normalized); + currentChunk.AppendLine(); + } + + var finalChunk = currentChunk.ToString().Trim(); + if (!string.IsNullOrWhiteSpace(finalChunk)) + yield return finalChunk; + } + + private FileEnumerationResult GetInputFiles(IDataSource dataSource) + { + var result = new FileEnumerationResult(); + + switch (dataSource) + { + case DataSourceLocalFile localFile when File.Exists(localFile.FilePath): + if (this.IsSupportedRagFilePath(localFile.FilePath)) + { + result.Files.Add(new FileInfo(localFile.FilePath)); + } + else + { + result.FailedFiles = 1; + result.LastError = $"The selected file '{localFile.FilePath}' is not supported for background embeddings."; + } + + return result; + + case DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path): + this.EnumerateAccessibleFiles(localDirectory.Path, result); + return result; + } + + switch (dataSource) + { + case DataSourceLocalFile localFile: + result.FailedFiles = 1; + result.LastError = $"The selected file '{localFile.FilePath}' does not exist."; + break; + + case DataSourceLocalDirectory localDirectory: + result.FailedFiles = 1; + result.LastError = $"The selected directory '{localDirectory.Path}' does not exist."; + break; + } + + return result; + } + + private void EnumerateAccessibleFiles(string rootPath, FileEnumerationResult result) + { + var pendingDirectories = new Stack(); + pendingDirectories.Push(rootPath); + + while (pendingDirectories.Count > 0) + { + var currentPath = pendingDirectories.Pop(); + IEnumerable subDirectories; + IEnumerable files; + + try + { + subDirectories = Directory.EnumerateDirectories(currentPath); + files = Directory.EnumerateFiles(currentPath); + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Cannot access directory '{DirectoryPath}' while indexing.", currentPath); + result.FailedFiles++; + result.LastError = $"The directory '{currentPath}' could not be accessed."; + continue; + } + + foreach (var filePath in files) + { + FileInfo fileInfo; + try + { + fileInfo = new FileInfo(filePath); + if (!fileInfo.Exists) + continue; + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Cannot inspect file '{FilePath}' while indexing.", filePath); + result.FailedFiles++; + result.LastError = $"The file '{filePath}' could not be inspected."; + continue; + } + + if (!this.IsSupportedRagFilePath(fileInfo.FullName)) + continue; + + result.Files.Add(fileInfo); + } + + foreach (var subDirectory in subDirectories) + pendingDirectories.Push(subDirectory); + } + } + + private string TryGetRelativePath(IDataSource dataSource, FileInfo file) => dataSource switch + { + DataSourceLocalDirectory localDirectory => Path.GetRelativePath(localDirectory.Path, file.FullName), + _ => file.Name + }; + + private static string NormalizeChunkSegment(string input) + { + return input + .Replace("\r\n", "\n", StringComparison.Ordinal) + .Replace('\r', '\n') + .Trim(); + } + + private bool IsImageFilePath(string filePath) + { + return FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE); + } + + private bool IsSupportedRagFilePath(string filePath) + { + var extension = Path.GetExtension(filePath).TrimStart('.'); + if (SKIPPED_RAG_FILE_EXTENSIONS.Contains(extension, StringComparer.OrdinalIgnoreCase)) + return false; + + return FileTypes.IsAllowedPath(filePath, FileTypes.DOCUMENT, FileTypes.IMAGE) + || ADDITIONAL_RAG_FILE_EXTENSIONS.Contains(extension, StringComparer.OrdinalIgnoreCase); + } + + private string BuildImageIndexText(string filePath) + { + var fileName = Path.GetFileName(filePath); + var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(filePath); + var extension = Path.GetExtension(filePath).TrimStart('.'); + var normalizedName = fileNameWithoutExtension + .Replace('_', ' ') + .Replace('-', ' ') + .Trim(); + + return $$""" + Image asset + File name: {{fileName}} + Type: {{extension}} + Search terms: {{normalizedName}} + Path: {{filePath}} + Note: The current RAG embedding pipeline stores image files by metadata only. Visual content is not OCRed or captioned yet. + """; + } + + private string BuildEmbeddingSignature(EmbeddingProvider embeddingProvider) + { + return string.Join('|', + embeddingProvider.Id, + embeddingProvider.UsedLLMProvider, + embeddingProvider.Model.Id, + embeddingProvider.Host, + embeddingProvider.Hostname, + embeddingProvider.TokenizerPath); + } + + private string BuildFingerprint(FileInfo file) + { + var fingerprintSource = $"{file.FullName}|{file.Length}|{file.LastWriteTimeUtc.Ticks}"; + var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(fingerprintSource)); + return Convert.ToHexString(bytes); + } + + private string GetCollectionName(string dataSourceId) + { + var safeId = dataSourceId + .ToLowerInvariant() + .Replace("-", string.Empty, StringComparison.Ordinal); + + return $"rag_{safeId}"; + } + + private string CreatePointId(string dataSourceId, string fingerprint, int chunkIndex) + { + var source = $"{dataSourceId}:{fingerprint}:{chunkIndex}"; + var hash = SHA256.HashData(Encoding.UTF8.GetBytes(source)); + var guidBytes = hash[..16].ToArray(); + + guidBytes[6] = (byte)((guidBytes[6] & 0x0F) | 0x40); + guidBytes[8] = (byte)((guidBytes[8] & 0x3F) | 0x80); + + return new Guid(guidBytes).ToString(); + } +} diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs new file mode 100644 index 00000000..b08bb3d4 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs @@ -0,0 +1,89 @@ +using System.Text.Json; + +using AIStudio.Settings; + +namespace AIStudio.Tools.Services; + +public sealed partial class DataSourceEmbeddingService +{ + private const string STATE_FILENAME = "rag-embedding-state.json"; + + private readonly JsonSerializerOptions jsonOptions = new() + { + WriteIndented = true, + }; + + private async Task EnsureStateLoadedAsync(CancellationToken token) + { + if (this.stateLoaded) + return; + + await this.stateLock.WaitAsync(token); + try + { + if (this.stateLoaded) + return; + + var statePath = this.GetStatePath(); + if (!string.IsNullOrWhiteSpace(statePath) && File.Exists(statePath)) + { + var json = await File.ReadAllTextAsync(statePath, token); + var persistedState = JsonSerializer.Deserialize(json, this.jsonOptions); + this.manifests = persistedState?.DataSources ?? new Dictionary(StringComparer.OrdinalIgnoreCase); + } + + this.stateLoaded = true; + } + finally + { + this.stateLock.Release(); + } + } + + private async Task GetManifestAsync(string dataSourceId, CancellationToken token) + { + await this.EnsureStateLoadedAsync(token); + if (this.manifests.TryGetValue(dataSourceId, out var manifest)) + return manifest; + + manifest = new DataSourceEmbeddingManifest(); + this.manifests[dataSourceId] = manifest; + return manifest; + } + + private async Task SaveStateAsync(CancellationToken token) + { + var statePath = this.GetStatePath(); + if (string.IsNullOrWhiteSpace(statePath)) + return; + + var directory = Path.GetDirectoryName(statePath); + if (!string.IsNullOrWhiteSpace(directory)) + Directory.CreateDirectory(directory); + + var persistedState = new PersistedEmbeddingState + { + DataSources = this.manifests + }; + + var json = JsonSerializer.Serialize(persistedState, this.jsonOptions); + await File.WriteAllTextAsync(statePath, json, token); + } + + private async Task ResetPersistedStateAsync(string dataSourceId) + { + await this.EnsureStateLoadedAsync(CancellationToken.None); + this.manifests.Remove(dataSourceId); + await this.DeleteCollectionAsync(this.GetCollectionName(dataSourceId)); + await this.SaveStateAsync(CancellationToken.None); + this.logger.LogInformation("Reset persisted embedding state for data source '{DataSourceId}'.", dataSourceId); + } + + private string GetStatePath() + { + if (string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory)) + return string.Empty; + + return Path.Combine(SettingsManager.ConfigDirectory, STATE_FILENAME); + } +} diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs new file mode 100644 index 00000000..75cfa154 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs @@ -0,0 +1,226 @@ +using System.Collections.Concurrent; +using AIStudio.Settings; +using AIStudio.Settings.DataModel; + +namespace AIStudio.Tools.Services; + +public sealed partial class DataSourceEmbeddingService +{ + private const int WATCHER_DEBOUNCE_SECONDS = 2; + + private readonly ConcurrentDictionary watchers = new(StringComparer.OrdinalIgnoreCase); + private readonly Dictionary watcherDebounceTokens = new(StringComparer.OrdinalIgnoreCase); + private readonly object watcherDebounceLock = new(); + + private void RefreshWatchers() + { + if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh) + { + this.RemoveAllWatchers(); + return; + } + + var supportedSources = this.settingsManager.ConfigurationData.DataSources + .Where(this.IsSupportedInternalDataSource) + .ToDictionary(source => source.Id, StringComparer.OrdinalIgnoreCase); + + foreach (var existingWatcherId in this.watchers.Keys.Except(supportedSources.Keys, StringComparer.OrdinalIgnoreCase).ToList()) + this.RemoveWatcher(existingWatcherId); + + foreach (var dataSource in supportedSources.Values) + this.EnsureWatcher(dataSource); + } + + private void EnsureWatcher(IDataSource dataSource) + { + if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh) + return; + + var configuration = GetWatchConfiguration(dataSource); + if (configuration is null) + return; + + if (this.watchers.TryGetValue(dataSource.Id, out var existingRegistration)) + { + if (IsSameWatchConfiguration(existingRegistration.Configuration, configuration)) + return; + + this.RemoveWatcher(dataSource.Id); + } + + var watcher = this.CreateWatcher(dataSource.Id, configuration); + if (watcher is null) + return; + + if (!this.watchers.TryAdd(dataSource.Id, new DataSourceWatcherRegistration(watcher, configuration))) + watcher.Dispose(); + } + + private FileSystemWatcher? CreateWatcher(string dataSourceId, DataSourceWatcherConfiguration configuration) + { + try + { + var watcher = new FileSystemWatcher(configuration.RootPath) + { + Filter = configuration.Filter, + IncludeSubdirectories = configuration.IncludeSubdirectories, + NotifyFilter = NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size, + }; + + watcher.Changed += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId); + watcher.Deleted += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId); + watcher.Created += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId); + watcher.Renamed += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId); + watcher.Error += (_, args) => + { + this.logger.LogWarning(args.GetException(), "The file watcher for data source '{DataSourceId}' failed. Recreating it.", dataSourceId); + this.RemoveWatcher(dataSourceId); + this.EnsureWatcher(dataSourceId); + this.OnWatchedDataSourceChanged(dataSourceId); + }; + watcher.EnableRaisingEvents = true; + return watcher; + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Failed to create file watcher for data source '{DataSourceId}' at '{RootPath}'.", dataSourceId, configuration.RootPath); + return null; + } + } + + private void RemoveWatcher(string dataSourceId) + { + this.CancelPendingWatcherRefresh(dataSourceId); + + if (this.watchers.TryRemove(dataSourceId, out var registration)) + registration.Watcher.Dispose(); + } + + private void RemoveAllWatchers() + { + foreach (var watcherId in this.watchers.Keys.ToList()) + this.RemoveWatcher(watcherId); + } + + private void DisposeWatchers() + { + this.CancelAllPendingWatcherRefreshes(); + + foreach (var registration in this.watchers.Values) + registration.Watcher.Dispose(); + + this.watchers.Clear(); + } + + private void OnWatchedDataSourceChanged(string dataSourceId) + { + if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh) + return; + + this.logger.LogDebug("Detected file system change for data source '{DataSourceId}'. Scheduling a debounced embedding run.", dataSourceId); + var debounceToken = new CancellationTokenSource(); + + lock (this.watcherDebounceLock) + { + if (this.watcherDebounceTokens.Remove(dataSourceId, out var existingToken)) + existingToken.Cancel(); + + this.watcherDebounceTokens[dataSourceId] = debounceToken; + } + + _ = Task.Run(async () => + { + try + { + await Task.Delay(TimeSpan.FromSeconds(WATCHER_DEBOUNCE_SECONDS), debounceToken.Token); + if (!this.TryCompletePendingWatcherRefresh(dataSourceId, debounceToken)) + return; + + var dataSource = this.settingsManager.ConfigurationData.DataSources + .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase)); + + if (dataSource is not null) + { + this.logger.LogInformation("Queueing data source '{DataSourceName}' ({DataSourceId}) after file system changes settled.", dataSource.Name, dataSource.Id); + await this.QueueDataSourceAsync(dataSource); + } + } + catch (OperationCanceledException) + { + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Failed to queue watched data source '{DataSourceId}' after a file system change.", dataSourceId); + } + finally + { + debounceToken.Dispose(); + } + }); + } + + private void EnsureWatcher(string dataSourceId) + { + var dataSource = this.settingsManager.ConfigurationData.DataSources + .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase)); + + if (dataSource is not null) + this.EnsureWatcher(dataSource); + } + + private void CancelPendingWatcherRefresh(string dataSourceId) + { + lock (this.watcherDebounceLock) + { + if (this.watcherDebounceTokens.Remove(dataSourceId, out var token)) + token.Cancel(); + } + } + + private void CancelAllPendingWatcherRefreshes() + { + lock (this.watcherDebounceLock) + { + foreach (var token in this.watcherDebounceTokens.Values) + token.Cancel(); + + this.watcherDebounceTokens.Clear(); + } + } + + private bool TryCompletePendingWatcherRefresh(string dataSourceId, CancellationTokenSource debounceToken) + { + lock (this.watcherDebounceLock) + { + if (!this.watcherDebounceTokens.TryGetValue(dataSourceId, out var currentToken) || !ReferenceEquals(currentToken, debounceToken)) + return false; + + this.watcherDebounceTokens.Remove(dataSourceId); + return true; + } + } + + private static DataSourceWatcherConfiguration? GetWatchConfiguration(IDataSource dataSource) => dataSource switch + { + DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path) => new DataSourceWatcherConfiguration( + localDirectory.Path, + "*.*", + true), + DataSourceLocalFile localFile when File.Exists(localFile.FilePath) && !string.IsNullOrWhiteSpace(Path.GetDirectoryName(localFile.FilePath)) => new DataSourceWatcherConfiguration( + Path.GetDirectoryName(localFile.FilePath)!, + Path.GetFileName(localFile.FilePath), + false), + _ => null, + }; + + private static bool IsSameWatchConfiguration(DataSourceWatcherConfiguration left, DataSourceWatcherConfiguration right) + { + return left.IncludeSubdirectories == right.IncludeSubdirectories + && string.Equals(left.RootPath, right.RootPath, StringComparison.OrdinalIgnoreCase) + && string.Equals(left.Filter, right.Filter, StringComparison.OrdinalIgnoreCase); + } + + private sealed record DataSourceWatcherConfiguration(string RootPath, string Filter, bool IncludeSubdirectories); + + private sealed record DataSourceWatcherRegistration(FileSystemWatcher Watcher, DataSourceWatcherConfiguration Configuration); +} diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs index 9e2a71bc..c82de777 100644 --- a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs @@ -1,7 +1,5 @@ using System.Collections.Concurrent; -using System.Security.Cryptography; -using System.Text; -using System.Text.Json; +using System.Diagnostics.CodeAnalysis; using System.Threading.Channels; using AIStudio.Provider; @@ -13,9 +11,8 @@ using AIStudio.Tools.Rust; namespace AIStudio.Tools.Services; -public sealed class DataSourceEmbeddingService : BackgroundService +public sealed partial class DataSourceEmbeddingService : BackgroundService { - private const string STATE_FILENAME = "rag-embedding-state.json"; private const int MAX_CHUNK_LENGTH = 3_200; private const int MIN_CHUNK_LENGTH = 800; private const int CHUNK_OVERLAP_LENGTH = 320; @@ -28,12 +25,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService private readonly Channel queue = Channel.CreateUnbounded(); private readonly ConcurrentDictionary queuedIds = new(StringComparer.OrdinalIgnoreCase); private readonly ConcurrentDictionary statuses = new(StringComparer.OrdinalIgnoreCase); - private readonly ConcurrentDictionary watchers = new(StringComparer.OrdinalIgnoreCase); private readonly SemaphoreSlim stateLock = new(1, 1); - private readonly JsonSerializerOptions jsonOptions = new() - { - WriteIndented = true, - }; private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService)); @@ -58,11 +50,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService public DataSourceEmbeddingOverview GetOverview() { - var orderedStatuses = this.statuses.Values - .OrderBy(status => status.SortOrder) - .ThenBy(status => status.DataSourceName, StringComparer.OrdinalIgnoreCase) - .ToList(); - + var orderedStatuses = this.GetStatuses(); var activeStatus = orderedStatuses .FirstOrDefault(status => status.State is DataSourceEmbeddingState.QUEUED or DataSourceEmbeddingState.RUNNING); @@ -71,21 +59,19 @@ public sealed class DataSourceEmbeddingService : BackgroundService var total = Math.Max(activeStatus.TotalFiles, 1); return new( true, - false, activeStatus.State, activeStatus.IndexedFiles, total, - activeStatus.FailedFiles, - $"{TB("Embedding")} {activeStatus.IndexedFiles}/{total}"); + activeStatus.FailedFiles); } var failedStatus = orderedStatuses .FirstOrDefault(status => status.State is DataSourceEmbeddingState.FAILED || status.FailedFiles > 0); if (failedStatus is not null) - return new(true, true, DataSourceEmbeddingState.FAILED, failedStatus.IndexedFiles, failedStatus.TotalFiles, failedStatus.FailedFiles, TB("Embeddings")); + return new(true, DataSourceEmbeddingState.FAILED, failedStatus.IndexedFiles, failedStatus.TotalFiles, failedStatus.FailedFiles); - return new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, TB("Embeddings")); + return new(false, DataSourceEmbeddingState.COMPLETED, 0, 0, 0); } public Task QueueAllInternalDataSourcesAsync() @@ -99,6 +85,22 @@ public sealed class DataSourceEmbeddingService : BackgroundService return Task.WhenAll(tasks); } + public Task QueueAllInternalDataSourcesIfAutomaticRefreshAsync() + { + if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh) + { + this.RefreshWatchers(); + return Task.CompletedTask; + } + + return this.QueueAllInternalDataSourcesAsync(); + } + + public void RefreshAutomaticWatchers() + { + this.RefreshWatchers(); + } + public async Task QueueDataSourceAsync(IDataSource dataSource) { if (!this.IsSupportedInternalDataSource(dataSource)) @@ -106,23 +108,14 @@ public sealed class DataSourceEmbeddingService : BackgroundService this.logger.LogInformation("Queueing data source '{DataSourceName}' ({DataSourceId}) for background embeddings.", dataSource.Name, dataSource.Id); this.RefreshWatchers(); + this.logger.LogDebug("Adding watcher for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id); if (!this.statuses.TryGetValue(dataSource.Id, out var currentStatus) || currentStatus.State is not DataSourceEmbeddingState.RUNNING) - { - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, - DataSourceEmbeddingState.QUEUED, - currentStatus?.TotalFiles ?? 0, - currentStatus?.IndexedFiles ?? 0, - currentStatus?.FailedFiles ?? 0, - string.Empty, - string.Empty)); - } - + this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.QUEUED, currentStatus?.TotalFiles ?? 0, currentStatus?.IndexedFiles ?? 0, currentStatus?.FailedFiles ?? 0)); + this.logger.LogDebug("Upserting status for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id); if (this.queuedIds.TryAdd(dataSource.Id, 0)) await this.queue.Writer.WriteAsync(dataSource.Id); + this.logger.LogDebug("Queued data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id); } public async Task RemoveDataSourceAsync(IDataSource dataSource) @@ -169,10 +162,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService public override void Dispose() { - foreach (var watcher in this.watchers.Values) - watcher.Dispose(); - - this.watchers.Clear(); + this.DisposeWatchers(); this.stateLock.Dispose(); base.Dispose(); } @@ -193,11 +183,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService return; } - var embeddingProvider = this.settingsManager.ConfigurationData.EmbeddingProviders.FirstOrDefault(provider => - dataSource is IInternalDataSource internalDataSource && - provider.Id.Equals(internalDataSource.EmbeddingId, StringComparison.OrdinalIgnoreCase)); - - if (embeddingProvider == default || embeddingProvider.UsedLLMProvider is LLMProviders.NONE) + if (!this.TryResolveEmbeddingProvider(dataSource, out var embeddingProvider)) { this.UpsertStatus(this.GetFallbackStatus(dataSource, "The selected embedding provider is not available.")); return; @@ -210,29 +196,11 @@ public sealed class DataSourceEmbeddingService : BackgroundService dataSource.Name, dataSource.Id); - var embeddingSignature = this.BuildEmbeddingSignature(embeddingProvider); - var manifest = await this.GetManifestAsync(dataSource.Id, token); - // A provider/model change invalidates the existing collection because the vectors are no longer comparable. - if (!string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal)) - { - this.logger.LogInformation( - "Embedding configuration changed for data source '{DataSourceName}' ({DataSourceId}). Resetting persisted state and collection '{CollectionName}'.", - dataSource.Name, - dataSource.Id, - this.GetCollectionName(dataSource.Id)); - await this.ResetPersistedStateAsync(dataSource.Id); - manifest = await this.GetManifestAsync(dataSource.Id, token); - manifest.EmbeddingProviderId = embeddingProvider.Id; - manifest.EmbeddingSignature = embeddingSignature; - await this.SaveStateAsync(token); - } - + var collectionName = this.GetCollectionName(dataSource.Id); + var manifest = await this.EnsureCompatibleManifestAsync(dataSource, embeddingProvider, collectionName, token); var inputFiles = this.GetInputFiles(dataSource); var indexedFiles = inputFiles.Files; var totalFiles = indexedFiles.Count + inputFiles.FailedFiles; - var existingPaths = indexedFiles - .Select(file => file.FullName) - .ToHashSet(StringComparer.OrdinalIgnoreCase); this.logger.LogInformation( "Prepared data source '{DataSourceName}' ({DataSourceId}) for embedding. AccessibleFiles={AccessibleFiles}, FailedFiles={FailedFiles}, Collection='{CollectionName}'.", @@ -240,32 +208,18 @@ public sealed class DataSourceEmbeddingService : BackgroundService dataSource.Id, indexedFiles.Count, inputFiles.FailedFiles, - this.GetCollectionName(dataSource.Id)); - - foreach (var removedFilePath in manifest.Files.Keys.Except(existingPaths, StringComparer.OrdinalIgnoreCase).ToList()) - { - await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), removedFilePath, token); - manifest.Files.Remove(removedFilePath); - this.logger.LogInformation( - "Removed stale embeddings for deleted file '{FilePath}' from data source '{DataSourceName}' ({DataSourceId}).", - removedFilePath, - dataSource.Name, - dataSource.Id); - } + collectionName); + await this.RemoveMissingFileEmbeddingsAsync(dataSource, collectionName, manifest, indexedFiles, token); await this.SaveStateAsync(token); - // Keep the UI status in sync with the long-running file loop below. - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, + this.UpsertStatus(this.CreateStatus( + dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, 0, inputFiles.FailedFiles, - string.Empty, - inputFiles.LastError)); + lastError: inputFiles.LastError)); var provider = embeddingProvider.CreateProvider(); var skippedFiles = 0; @@ -287,29 +241,11 @@ public sealed class DataSourceEmbeddingService : BackgroundService dataSource.Name, dataSource.Id); skippedFiles++; - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, - DataSourceEmbeddingState.RUNNING, - totalFiles, - skippedFiles + completedFiles, - failedFiles, - string.Empty, - lastError)); + this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, lastError: lastError)); continue; } - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, - DataSourceEmbeddingState.RUNNING, - totalFiles, - skippedFiles + completedFiles, - failedFiles, - file.Name, - lastError)); + this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, file.Name, lastError)); try { @@ -343,37 +279,15 @@ public sealed class DataSourceEmbeddingService : BackgroundService failedFiles++; lastError = exception.Message; manifest.Files.Remove(file.FullName); - await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), file.FullName, token); + await this.DeleteFilePointsAsync(collectionName, file.FullName, token); await this.SaveStateAsync(token); this.logger.LogWarning(exception, "Failed to embed file '{FilePath}' for data source '{DataSourceName}'.", file.FullName, dataSource.Name); - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, - DataSourceEmbeddingState.RUNNING, - totalFiles, - skippedFiles + completedFiles, - failedFiles, - file.Name, - exception.Message)); + this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, file.Name, exception.Message)); } } - this.UpsertStatus(new DataSourceEmbeddingStatus( - dataSource.Id, - dataSource.Name, - dataSource.Type, - failedFiles > 0 ? DataSourceEmbeddingState.FAILED : DataSourceEmbeddingState.COMPLETED, - totalFiles, - skippedFiles + completedFiles, - failedFiles, - string.Empty, - failedFiles > 0 - ? string.IsNullOrWhiteSpace(lastError) - ? "Some files could not be embedded. See the logs for details." - : lastError - : string.Empty)); + this.UpsertStatus(this.CreateCompletedStatus(dataSource, totalFiles, skippedFiles + completedFiles, failedFiles, lastError)); this.logger.LogInformation( "Finished background embeddings for data source '{DataSourceName}' ({DataSourceId}). Indexed={IndexedFiles}, Failed={FailedFiles}, Total={TotalFiles}.", dataSource.Name, @@ -459,7 +373,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService if (manifest.VectorSize == 0) { - // The first successful batch defines the vector size for the whole collection. manifest.VectorSize = vectorSize; await this.EnsureCollectionExistsAsync(collectionName, vectorSize, token); await this.SaveStateAsync(token); @@ -490,53 +403,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService batch.Clear(); } - private async IAsyncEnumerable StreamEmbeddingChunksAsync(string filePath, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken token) - { - if (this.IsImageFilePath(filePath)) - { - yield return this.BuildImageIndexText(filePath); - yield break; - } - - var currentChunk = new StringBuilder(); - - await foreach (var segment in this.rustService.StreamArbitraryFileData(filePath, token: token)) - { - var normalized = NormalizeChunkSegment(segment); - if (string.IsNullOrWhiteSpace(normalized)) - continue; - - if (currentChunk.Length > 0 && currentChunk.Length + normalized.Length + Environment.NewLine.Length > MAX_CHUNK_LENGTH) - { - if (currentChunk.Length >= MIN_CHUNK_LENGTH) - { - var chunk = currentChunk.ToString().Trim(); - if (!string.IsNullOrWhiteSpace(chunk)) - yield return chunk; - - var overlap = chunk.Length > CHUNK_OVERLAP_LENGTH - ? chunk[^CHUNK_OVERLAP_LENGTH..] - : chunk; - - currentChunk.Clear(); - currentChunk.Append(overlap); - currentChunk.AppendLine(); - } - else - { - currentChunk.AppendLine(); - } - } - - currentChunk.Append(normalized); - currentChunk.AppendLine(); - } - - var finalChunk = currentChunk.ToString().Trim(); - if (!string.IsNullOrWhiteSpace(finalChunk)) - yield return finalChunk; - } - private async Task EnsureCollectionExistsAsync(string collectionName, int vectorSize, CancellationToken token) { await this.embeddingStore.EnsureEmbeddingStoreExists(collectionName, vectorSize, token); @@ -581,72 +447,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService await this.embeddingStore.DeleteEmbeddingStore(collectionName, CancellationToken.None); } - private async Task EnsureStateLoadedAsync(CancellationToken token) - { - if (this.stateLoaded) - return; - - await this.stateLock.WaitAsync(token); - try - { - if (this.stateLoaded) - return; - - var statePath = this.GetStatePath(); - if (!string.IsNullOrWhiteSpace(statePath) && File.Exists(statePath)) - { - var json = await File.ReadAllTextAsync(statePath, token); - var persistedState = JsonSerializer.Deserialize(json, this.jsonOptions); - this.manifests = persistedState?.DataSources ?? new Dictionary(StringComparer.OrdinalIgnoreCase); - } - - this.stateLoaded = true; - } - finally - { - this.stateLock.Release(); - } - } - - private async Task GetManifestAsync(string dataSourceId, CancellationToken token) - { - await this.EnsureStateLoadedAsync(token); - if (this.manifests.TryGetValue(dataSourceId, out var manifest)) - return manifest; - - manifest = new DataSourceEmbeddingManifest(); - this.manifests[dataSourceId] = manifest; - return manifest; - } - - private async Task SaveStateAsync(CancellationToken token) - { - var statePath = this.GetStatePath(); - if (string.IsNullOrWhiteSpace(statePath)) - return; - - var directory = Path.GetDirectoryName(statePath); - if (!string.IsNullOrWhiteSpace(directory)) - Directory.CreateDirectory(directory); - - var persistedState = new PersistedEmbeddingState - { - DataSources = this.manifests - }; - - var json = JsonSerializer.Serialize(persistedState, this.jsonOptions); - await File.WriteAllTextAsync(statePath, json, token); - } - - private async Task ResetPersistedStateAsync(string dataSourceId) - { - await this.EnsureStateLoadedAsync(CancellationToken.None); - this.manifests.Remove(dataSourceId); - await this.DeleteCollectionAsync(this.GetCollectionName(dataSourceId)); - await this.SaveStateAsync(CancellationToken.None); - this.logger.LogInformation("Reset persisted embedding state for data source '{DataSourceId}'.", dataSourceId); - } - private async Task WaitForInitialSettingsAndBootstrapAsync(CancellationToken token) { while (!token.IsCancellationRequested) @@ -663,280 +463,8 @@ public sealed class DataSourceEmbeddingService : BackgroundService token.ThrowIfCancellationRequested(); - this.logger.LogInformation("Embedding background service is ready. Queueing all internal data sources."); - await this.QueueAllInternalDataSourcesAsync(); - } - - private void RefreshWatchers() - { - var supportedSources = this.settingsManager.ConfigurationData.DataSources - .Where(this.IsSupportedInternalDataSource) - .ToDictionary(source => source.Id, StringComparer.OrdinalIgnoreCase); - - foreach (var existingWatcherId in this.watchers.Keys.Except(supportedSources.Keys, StringComparer.OrdinalIgnoreCase).ToList()) - this.RemoveWatcher(existingWatcherId); - - foreach (var dataSource in supportedSources.Values) - this.EnsureWatcher(dataSource); - } - - private void EnsureWatcher(IDataSource dataSource) - { - if (this.watchers.TryGetValue(dataSource.Id, out var existingWatcher)) - { - var existingTarget = $"{existingWatcher.Path}|{existingWatcher.Filter}|{existingWatcher.IncludeSubdirectories}"; - var desiredTarget = this.GetWatcherTarget(dataSource); - if (string.Equals(existingTarget, desiredTarget, StringComparison.OrdinalIgnoreCase)) - return; - - this.RemoveWatcher(dataSource.Id); - } - - var watcher = this.CreateWatcher(dataSource); - if (watcher is null) - return; - - if (!this.watchers.TryAdd(dataSource.Id, watcher)) - watcher.Dispose(); - } - - private FileSystemWatcher? CreateWatcher(IDataSource dataSource) - { - FileSystemWatcher? watcher = dataSource switch - { - DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path) => new FileSystemWatcher(localDirectory.Path) - { - IncludeSubdirectories = true, - NotifyFilter = NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size, - }, - DataSourceLocalFile localFile when File.Exists(localFile.FilePath) && !string.IsNullOrWhiteSpace(Path.GetDirectoryName(localFile.FilePath)) => new FileSystemWatcher(Path.GetDirectoryName(localFile.FilePath)!) - { - Filter = Path.GetFileName(localFile.FilePath), - NotifyFilter = NotifyFilters.FileName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size, - }, - _ => null, - }; - - if (watcher is null) - return null; - - FileSystemEventHandler onChanged = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id); - RenamedEventHandler onRenamed = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id); - ErrorEventHandler onError = (_, errorArgs) => - { - this.logger.LogWarning(errorArgs.GetException(), "The file watcher for data source '{DataSourceId}' failed. Recreating it.", dataSource.Id); - this.RemoveWatcher(dataSource.Id); - this.EnsureWatcher(dataSource); - this.OnWatchedDataSourceChanged(dataSource.Id); - }; - - watcher.Created += onChanged; - watcher.Changed += onChanged; - watcher.Deleted += onChanged; - watcher.Renamed += onRenamed; - watcher.Error += onError; - watcher.EnableRaisingEvents = true; - return watcher; - } - - private string GetWatcherTarget(IDataSource dataSource) => dataSource switch - { - DataSourceLocalDirectory localDirectory => $"{localDirectory.Path}|*.*|True", - DataSourceLocalFile localFile => $"{Path.GetDirectoryName(localFile.FilePath)}|{Path.GetFileName(localFile.FilePath)}|False", - _ => string.Empty - }; - - private void RemoveWatcher(string dataSourceId) - { - if (this.watchers.TryRemove(dataSourceId, out var watcher)) - watcher.Dispose(); - } - - private void OnWatchedDataSourceChanged(string dataSourceId) - { - this.logger.LogInformation("Detected file system change for data source '{DataSourceId}'. Queueing a fresh embedding run.", dataSourceId); - _ = Task.Run(async () => - { - try - { - var dataSource = this.settingsManager.ConfigurationData.DataSources - .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase)); - - if (dataSource is not null) - await this.QueueDataSourceAsync(dataSource); - } - catch (Exception exception) - { - this.logger.LogWarning(exception, "Failed to queue watched data source '{DataSourceId}' after a file system change.", dataSourceId); - } - }); - } - - private string GetStatePath() - { - if (string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory)) - return string.Empty; - - return Path.Combine(SettingsManager.ConfigDirectory, STATE_FILENAME); - } - - private FileEnumerationResult GetInputFiles(IDataSource dataSource) - { - var result = new FileEnumerationResult(); - - switch (dataSource) - { - case DataSourceLocalFile localFile when File.Exists(localFile.FilePath): - result.Files.Add(new FileInfo(localFile.FilePath)); - return result; - - case DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path): - this.EnumerateAccessibleFiles(localDirectory.Path, result); - return result; - } - - switch (dataSource) - { - case DataSourceLocalFile localFile: - result.FailedFiles = 1; - result.LastError = $"The selected file '{localFile.FilePath}' does not exist."; - break; - - case DataSourceLocalDirectory localDirectory: - result.FailedFiles = 1; - result.LastError = $"The selected directory '{localDirectory.Path}' does not exist."; - break; - } - - return result; - } - - private void EnumerateAccessibleFiles(string rootPath, FileEnumerationResult result) - { - var pendingDirectories = new Stack(); - pendingDirectories.Push(rootPath); - - while (pendingDirectories.Count > 0) - { - var currentPath = pendingDirectories.Pop(); - IEnumerable subDirectories; - IEnumerable files; - - try - { - subDirectories = Directory.EnumerateDirectories(currentPath); - files = Directory.EnumerateFiles(currentPath); - } - catch (Exception exception) - { - this.logger.LogWarning(exception, "Cannot access directory '{DirectoryPath}' while indexing.", currentPath); - result.FailedFiles++; - result.LastError = $"The directory '{currentPath}' could not be accessed."; - continue; - } - - foreach (var filePath in files) - { - FileInfo fileInfo; - try - { - fileInfo = new FileInfo(filePath); - if (!fileInfo.Exists) - continue; - } - catch (Exception exception) - { - this.logger.LogWarning(exception, "Cannot inspect file '{FilePath}' while indexing.", filePath); - result.FailedFiles++; - result.LastError = $"The file '{filePath}' could not be inspected."; - continue; - } - - result.Files.Add(fileInfo); - } - - foreach (var subDirectory in subDirectories) - pendingDirectories.Push(subDirectory); - } - } - - private string TryGetRelativePath(IDataSource dataSource, FileInfo file) => dataSource switch - { - DataSourceLocalDirectory localDirectory => Path.GetRelativePath(localDirectory.Path, file.FullName), - _ => file.Name - }; - - private static string NormalizeChunkSegment(string input) - { - return input - .Replace("\r\n", "\n", StringComparison.Ordinal) - .Replace('\r', '\n') - .Trim(); - } - - private bool IsImageFilePath(string filePath) - { - return FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE); - } - - private string BuildImageIndexText(string filePath) - { - var fileName = Path.GetFileName(filePath); - var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(filePath); - var extension = Path.GetExtension(filePath).TrimStart('.'); - var normalizedName = fileNameWithoutExtension - .Replace('_', ' ') - .Replace('-', ' ') - .Trim(); - - return $$""" - Image asset - File name: {{fileName}} - Type: {{extension}} - Search terms: {{normalizedName}} - Path: {{filePath}} - Note: The current RAG embedding pipeline stores image files by metadata only. Visual content is not OCRed or captioned yet. - """; - } - - private string BuildEmbeddingSignature(EmbeddingProvider embeddingProvider) - { - return string.Join('|', - embeddingProvider.Id, - embeddingProvider.UsedLLMProvider, - embeddingProvider.Model.Id, - embeddingProvider.Host, - embeddingProvider.Hostname, - embeddingProvider.TokenizerPath); - } - - private string BuildFingerprint(FileInfo file) - { - var fingerprintSource = $"{file.FullName}|{file.Length}|{file.LastWriteTimeUtc.Ticks}"; - var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(fingerprintSource)); - return Convert.ToHexString(bytes); - } - - private string GetCollectionName(string dataSourceId) - { - var safeId = dataSourceId - .ToLowerInvariant() - .Replace("-", string.Empty, StringComparison.Ordinal); - - return $"rag_{safeId}"; - } - - private string CreatePointId(string dataSourceId, string fingerprint, int chunkIndex) - { - var source = $"{dataSourceId}:{fingerprint}:{chunkIndex}"; - var hash = SHA256.HashData(Encoding.UTF8.GetBytes(source)); - var guidBytes = hash[..16].ToArray(); - - // Mark the bytes as an RFC 4122 version 4 UUID so Qdrant accepts the ID format. - guidBytes[6] = (byte)((guidBytes[6] & 0x0F) | 0x40); - guidBytes[8] = (byte)((guidBytes[8] & 0x3F) | 0x80); - - return new Guid(guidBytes).ToString(); + this.logger.LogInformation("Embedding background service is ready. Checking whether automatic data source refresh is enabled."); + await this.QueueAllInternalDataSourcesIfAutomaticRefreshAsync(); } private bool IsSupportedInternalDataSource(IDataSource dataSource) @@ -944,18 +472,104 @@ public sealed class DataSourceEmbeddingService : BackgroundService return dataSource is DataSourceLocalDirectory or DataSourceLocalFile; } - private DataSourceEmbeddingStatus GetFallbackStatus(IDataSource dataSource, string errorMessage) + private bool TryResolveEmbeddingProvider(IDataSource dataSource, [NotNullWhen(true)] out EmbeddingProvider? embeddingProvider) + { + embeddingProvider = this.settingsManager.ConfigurationData.EmbeddingProviders.FirstOrDefault(provider => + dataSource is IInternalDataSource internalDataSource && + provider.Id.Equals(internalDataSource.EmbeddingId, StringComparison.OrdinalIgnoreCase)); + + return embeddingProvider != default && embeddingProvider.UsedLLMProvider is not LLMProviders.NONE; + } + + private async Task EnsureCompatibleManifestAsync(IDataSource dataSource, EmbeddingProvider embeddingProvider, string collectionName, CancellationToken token) + { + var embeddingSignature = this.BuildEmbeddingSignature(embeddingProvider); + var manifest = await this.GetManifestAsync(dataSource.Id, token); + + if (!string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal)) + { + this.logger.LogInformation( + "Embedding configuration changed for data source '{DataSourceName}' ({DataSourceId}). Resetting persisted state and collection '{CollectionName}'.", + dataSource.Name, + dataSource.Id, + collectionName); + await this.ResetPersistedStateAsync(dataSource.Id); + manifest = await this.GetManifestAsync(dataSource.Id, token); + } + + if (!string.Equals(manifest.EmbeddingProviderId, embeddingProvider.Id, StringComparison.OrdinalIgnoreCase) || + !string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal)) + { + manifest.EmbeddingProviderId = embeddingProvider.Id; + manifest.EmbeddingSignature = embeddingSignature; + await this.SaveStateAsync(token); + } + + return manifest; + } + + private async Task RemoveMissingFileEmbeddingsAsync( + IDataSource dataSource, + string collectionName, + DataSourceEmbeddingManifest manifest, + IReadOnlyCollection indexedFiles, + CancellationToken token) + { + var existingPaths = indexedFiles + .Select(file => file.FullName) + .ToHashSet(StringComparer.OrdinalIgnoreCase); + + foreach (var removedFilePath in manifest.Files.Keys.Except(existingPaths, StringComparer.OrdinalIgnoreCase).ToList()) + { + await this.DeleteFilePointsAsync(collectionName, removedFilePath, token); + manifest.Files.Remove(removedFilePath); + this.logger.LogInformation( + "Removed stale embeddings for deleted file '{FilePath}' from data source '{DataSourceName}' ({DataSourceId}).", + removedFilePath, + dataSource.Name, + dataSource.Id); + } + } + + private DataSourceEmbeddingStatus CreateStatus( + IDataSource dataSource, + DataSourceEmbeddingState state, + int totalFiles, + int indexedFiles, + int failedFiles, + string currentFile = "", + string lastError = "") { return new DataSourceEmbeddingStatus( dataSource.Id, dataSource.Name, dataSource.Type, - DataSourceEmbeddingState.FAILED, - 0, - 0, - 1, - string.Empty, - errorMessage); + state, + totalFiles, + indexedFiles, + failedFiles, + currentFile, + lastError); + } + + private DataSourceEmbeddingStatus CreateCompletedStatus(IDataSource dataSource, int totalFiles, int indexedFiles, int failedFiles, string lastError) + { + return this.CreateStatus( + dataSource, + failedFiles > 0 ? DataSourceEmbeddingState.FAILED : DataSourceEmbeddingState.COMPLETED, + totalFiles, + indexedFiles, + failedFiles, + lastError: failedFiles > 0 + ? string.IsNullOrWhiteSpace(lastError) + ? "Some files could not be embedded. See the logs for details." + : lastError + : string.Empty); + } + + private DataSourceEmbeddingStatus GetFallbackStatus(IDataSource dataSource, string errorMessage) + { + return this.CreateStatus(dataSource, DataSourceEmbeddingState.FAILED, 0, 0, 1, lastError: errorMessage); } private void UpsertStatus(DataSourceEmbeddingStatus status) @@ -969,87 +583,3 @@ public sealed class DataSourceEmbeddingService : BackgroundService _ = MessageBus.INSTANCE.SendMessage(null, Event.RAG_EMBEDDING_STATUS_CHANGED, true); } } - -public sealed record DataSourceEmbeddingOverview( - bool IsVisible, - bool ShowIconOnly, - DataSourceEmbeddingState State, - int IndexedFiles, - int TotalFiles, - int FailedFiles, - string NavLabel); - -public enum DataSourceEmbeddingState -{ - IDLE, - QUEUED, - RUNNING, - COMPLETED, - FAILED, -} - -public sealed record DataSourceEmbeddingStatus( - string DataSourceId, - string DataSourceName, - DataSourceType DataSourceType, - DataSourceEmbeddingState State, - int TotalFiles, - int IndexedFiles, - int FailedFiles, - string CurrentFile, - string LastError) -{ - private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService)); - - public int ProgressPercent => this.TotalFiles <= 0 ? 0 : Math.Clamp((int)Math.Round(this.IndexedFiles * 100d / this.TotalFiles), 0, 100); - - public string StateLabel => this.State switch - { - DataSourceEmbeddingState.QUEUED => TB("Queued"), - DataSourceEmbeddingState.RUNNING => TB("Running"), - DataSourceEmbeddingState.COMPLETED => TB("Completed"), - DataSourceEmbeddingState.FAILED => TB("Needs attention"), - _ => TB("Idle") - }; - - public int SortOrder => this.State switch - { - DataSourceEmbeddingState.RUNNING => 0, - DataSourceEmbeddingState.QUEUED => 1, - DataSourceEmbeddingState.FAILED => 2, - DataSourceEmbeddingState.COMPLETED => 3, - _ => 4, - }; -} - -public sealed class FileEnumerationResult -{ - public List Files { get; } = []; - - public int FailedFiles { get; set; } - - public string LastError { get; set; } = string.Empty; -} - -public sealed class PersistedEmbeddingState -{ - public Dictionary DataSources { get; init; } = new(StringComparer.OrdinalIgnoreCase); -} - -public sealed class DataSourceEmbeddingManifest -{ - public string EmbeddingProviderId { get; set; } = string.Empty; - - public string EmbeddingSignature { get; set; } = string.Empty; - - public int VectorSize { get; set; } - - public Dictionary Files { get; init; } = new(StringComparer.OrdinalIgnoreCase); -} - -public sealed record EmbeddedFileRecord( - string Fingerprint, - long FileSize, - DateTime LastWriteUtc, - DateTime EmbeddedAtUtc, - int ChunkCount);