From ac677c5ac7ed52f1a12e0812e19419dab27bbc57 Mon Sep 17 00:00:00 2001 From: PaulKoudelka Date: Fri, 8 May 2026 17:37:34 +0200 Subject: [PATCH] background embed --- .../Assistants/I18N/allTexts.lua | 69 +- .../Settings/SettingsPanelEmbeddings.razor.cs | 6 + .../SettingsDialogDataSources.razor.cs | 11 +- .../Layout/MainLayout.razor | 33 +- .../Layout/MainLayout.razor.cs | 44 +- app/MindWork AI Studio/Pages/Embeddings.razor | 64 + .../Pages/Embeddings.razor.cs | 57 + .../Pages/Information.razor.cs | 10 +- app/MindWork AI Studio/Program.cs | 59 +- app/MindWork AI Studio/Routes.razor.cs | 1 + .../Tools/Databases/EmbeddingStoragePoint.cs | 16 + .../{DatabaseClient.cs => EmbeddingStore.cs} | 15 +- .../Tools/Databases/EmbeddingStoreFactory.cs | 33 + .../Tools/Databases/NoDatabaseClient.cs | 24 - .../Tools/Databases/NoEmbeddingStore.cs | 39 + .../Qdrant/QdrantClientImplementation.cs | 66 +- app/MindWork AI Studio/Tools/Event.cs | 1 + .../Services/DataSourceEmbeddingService.cs | 1055 +++++++++++++++++ .../Tools/Services/RustService.Databases.cs | 39 +- .../Tools/Services/RustService.Retrieval.cs | 79 +- 20 files changed, 1623 insertions(+), 98 deletions(-) create mode 100644 app/MindWork AI Studio/Pages/Embeddings.razor create mode 100644 app/MindWork AI Studio/Pages/Embeddings.razor.cs create mode 100644 app/MindWork AI Studio/Tools/Databases/EmbeddingStoragePoint.cs rename app/MindWork AI Studio/Tools/Databases/{DatabaseClient.cs => EmbeddingStore.cs} (69%) create mode 100644 app/MindWork AI Studio/Tools/Databases/EmbeddingStoreFactory.cs delete mode 100644 app/MindWork AI Studio/Tools/Databases/NoDatabaseClient.cs create mode 100644 app/MindWork AI Studio/Tools/Databases/NoEmbeddingStore.cs create mode 100644 app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua index 9d391b76..bc4d79c0 100644 --- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua +++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua @@ -5719,6 +5719,9 @@ UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T1614176092"] = "Assistants" -- Update UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T1847791252"] = "Update" +-- Data sync +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T1903948824"] = "Data sync" + -- Leave Chat Page UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T2124749705"] = "Leave Chat Page" @@ -5737,6 +5740,9 @@ UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T2929332068"] = "Supporters" -- Writer UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T2979224202"] = "Writer" +-- Embeddings are waiting to be processed. +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T3439916590"] = "Embeddings are waiting to be processed." + -- Show details UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T3692372066"] = "Show details" @@ -5746,6 +5752,18 @@ UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T4256323669"] = "Information" -- Chat UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T578410699"] = "Chat" +-- Some embeddings failed. {0} file(s) need attention. +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T640352868"] = "Some embeddings failed. {0} file(s) need attention." + +-- Some embeddings failed and need attention. +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T671981715"] = "Some embeddings failed and need attention." + +-- Embeddings are running: {0} of {1} files are indexed. +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T714077986"] = "Embeddings are running: {0} of {1} files are indexed." + +-- Embeddings +UI_TEXT_CONTENT["AISTUDIO::LAYOUT::MAINLAYOUT::T951463987"] = "Embeddings" + -- Get coding and debugging support from an LLM. UI_TEXT_CONTENT["AISTUDIO::PAGES::ASSISTANTS::T1243850917"] = "Get coding and debugging support from an LLM." @@ -5908,6 +5926,30 @@ UI_TEXT_CONTENT["AISTUDIO::PAGES::CHAT::T582100343"] = "Chat in Workspace" -- Show your workspaces UI_TEXT_CONTENT["AISTUDIO::PAGES::CHAT::T733672375"] = "Show your workspaces" +-- AI Studio indexes local RAG data sources in the background. Finished files stay recorded so unchanged files can be skipped after a restart, while added or deleted files are detected during the next run. +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T1064986263"] = "AI Studio indexes local RAG data sources in the background. Finished files stay recorded so unchanged files can be skipped after a restart, while added or deleted files are detected during the next run." + +-- Current file: {0} +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T1166856644"] = "Current file: {0}" + +-- Pending files: {0} +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T2471889605"] = "Pending files: {0}" + +-- {0} of {1} files are indexed. +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T2525374657"] = "{0} of {1} files are indexed." + +-- Background embeddings +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T2547971789"] = "Background embeddings" + +-- Failed files: {0} +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T309404893"] = "Failed files: {0}" + +-- Indexed files: {0} +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T3473125711"] = "Indexed files: {0}" + +-- No local data source has been queued for embedding yet. +UI_TEXT_CONTENT["AISTUDIO::PAGES::EMBEDDINGS::T3774205531"] = "No local data source has been queued for embedding yet." + -- Unlike services like ChatGPT, which impose limits after intensive use, MindWork AI Studio offers unlimited usage through the providers API. UI_TEXT_CONTENT["AISTUDIO::PAGES::HOME::T1009708591"] = "Unlike services like ChatGPT, which impose limits after intensive use, MindWork AI Studio offers unlimited usage through the providers API." @@ -6878,13 +6920,13 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::CONFIDENCESCHEMESEXTENSIONS::T3893997203"] = " UI_TEXT_CONTENT["AISTUDIO::TOOLS::CONFIDENCESCHEMESEXTENSIONS::T4107860491"] = "Trust all LLM providers" -- Reason -UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NODATABASECLIENT::T1093747001"] = "Reason" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NOEMBEDDINGSTORE::T1093747001"] = "Reason" -- Unavailable -UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NODATABASECLIENT::T3662391977"] = "Unavailable" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NOEMBEDDINGSTORE::T3662391977"] = "Unavailable" -- Status -UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NODATABASECLIENT::T6222351"] = "Status" +UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::NOEMBEDDINGSTORE::T6222351"] = "Status" -- Storage size UI_TEXT_CONTENT["AISTUDIO::TOOLS::DATABASES::QDRANT::QDRANTCLIENTIMPLEMENTATION::T1230141403"] = "Storage size" @@ -7528,6 +7570,27 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source like p -- Document UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document" +-- Running +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1160324588"] = "Running" + +-- Idle +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1168775091"] = "Idle" + +-- Needs attention +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1566837660"] = "Needs attention" + +-- Queued +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2655222900"] = "Queued" + +-- Embedding +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2838542994"] = "Embedding" + +-- Completed +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T3968379570"] = "Completed" + +-- Embeddings +UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T951463987"] = "Embeddings" + -- Pandoc Installation UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc Installation" diff --git a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs index 96a1c0ee..90d0f24d 100644 --- a/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs +++ b/app/MindWork AI Studio/Components/Settings/SettingsPanelEmbeddings.razor.cs @@ -13,6 +13,9 @@ namespace AIStudio.Components.Settings; public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase { + [Inject] + private DataSourceEmbeddingService DataSourceEmbeddingService { get; init; } = null!; + [Parameter] public List> AvailableEmbeddingProviders { get; set; } = new(); @@ -59,6 +62,7 @@ public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase await this.UpdateEmbeddingProviders(); await this.SettingsManager.StoreSettings(); + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } @@ -94,6 +98,7 @@ public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase await this.UpdateEmbeddingProviders(); await this.SettingsManager.StoreSettings(); + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } @@ -133,6 +138,7 @@ public partial class SettingsPanelEmbeddings : SettingsPanelProviderBase } await this.UpdateEmbeddingProviders(); + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } diff --git a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs index c22bed94..af71e3a9 100644 --- a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs +++ b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs @@ -1,11 +1,17 @@ using AIStudio.Settings; using AIStudio.Settings.DataModel; using AIStudio.Tools.ERIClient.DataModel; +using AIStudio.Tools.Services; + +using Microsoft.AspNetCore.Components; namespace AIStudio.Dialogs.Settings; public partial class SettingsDialogDataSources : SettingsDialogBase { + [Inject] + private DataSourceEmbeddingService DataSourceEmbeddingService { get; init; } = null!; + private string GetEmbeddingName(IDataSource dataSource) { if(dataSource is IInternalDataSource internalDataSource) @@ -84,6 +90,7 @@ public partial class SettingsDialogDataSources : SettingsDialogBase this.SettingsManager.ConfigurationData.DataSources.Add(addedDataSource); await this.SettingsManager.StoreSettings(); + await this.DataSourceEmbeddingService.QueueDataSourceAsync(addedDataSource); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } @@ -146,6 +153,7 @@ public partial class SettingsDialogDataSources : SettingsDialogBase this.SettingsManager.ConfigurationData.DataSources[this.SettingsManager.ConfigurationData.DataSources.IndexOf(dataSource)] = editedDataSource; await this.SettingsManager.StoreSettings(); + await this.DataSourceEmbeddingService.QueueDataSourceAsync(editedDataSource); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } @@ -184,6 +192,7 @@ public partial class SettingsDialogDataSources : SettingsDialogBase { this.SettingsManager.ConfigurationData.DataSources.Remove(dataSource); await this.SettingsManager.StoreSettings(); + await this.DataSourceEmbeddingService.RemoveDataSourceAsync(dataSource); await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED); } } @@ -220,4 +229,4 @@ public partial class SettingsDialogDataSources : SettingsDialogBase break; } } -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor b/app/MindWork AI Studio/Layout/MainLayout.razor index 908411f9..7d4f28e9 100644 --- a/app/MindWork AI Studio/Layout/MainLayout.razor +++ b/app/MindWork AI Studio/Layout/MainLayout.razor @@ -25,7 +25,17 @@ - + @if (this.embeddingOverview.IsVisible) + { + + + + @T("Data sync") + + + + } + @@ -53,8 +63,23 @@ - - + + @if (this.embeddingOverview.IsVisible) + { + + @if (this.SettingsManager.ConfigurationData.App.NavigationBehavior is NavBehavior.NEVER_EXPAND_USE_TOOLTIPS) + { + + + + } + else + { + + } + + } + @@ -79,4 +104,4 @@ - \ No newline at end of file + diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor.cs b/app/MindWork AI Studio/Layout/MainLayout.razor.cs index a1659f34..f96cc2d2 100644 --- a/app/MindWork AI Studio/Layout/MainLayout.razor.cs +++ b/app/MindWork AI Studio/Layout/MainLayout.razor.cs @@ -1,3 +1,4 @@ +using System.Runtime.CompilerServices; using AIStudio.Dialogs; using AIStudio.Settings; using AIStudio.Settings.DataModel; @@ -38,6 +39,9 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan [Inject] private MudTheme ColorTheme { get; init; } = null!; + + [Inject] + private DataSourceEmbeddingService DataSourceEmbeddingService { get; init; } = null!; private ILanguagePlugin Lang { get; set; } = PluginFactory.BaseLanguage; @@ -56,7 +60,9 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan private bool startupCompleted; private readonly SemaphoreSlim mandatoryInfoDialogSemaphore = new(1, 1); + private DataSourceEmbeddingOverview embeddingOverview = new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, string.Empty); private IReadOnlyCollection navItems = []; + private NavBarItem embeddingItem = new NavBarItem(string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, false); #region Overrides of ComponentBase @@ -87,6 +93,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan // Ensure that all settings are loaded: await this.SettingsManager.LoadSettings(); + await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync(); // Register this component with the message bus: this.MessageBus.RegisterComponent(this); @@ -94,7 +101,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan [ Event.UPDATE_AVAILABLE, Event.CONFIGURATION_CHANGED, Event.COLOR_THEME_CHANGED, Event.SHOW_ERROR, Event.SHOW_WARNING, Event.SHOW_SUCCESS, Event.STARTUP_PLUGIN_SYSTEM, Event.PLUGINS_RELOADED, - Event.INSTALL_UPDATE, Event.STARTUP_COMPLETED, + Event.INSTALL_UPDATE, Event.STARTUP_COMPLETED, Event.RAG_EMBEDDING_STATUS_CHANGED, ]); // Set the snackbar for the update service: @@ -114,6 +121,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan await this.themeProvider.WatchSystemDarkModeAsync(this.SystemeThemeChanged); await this.UpdateThemeConfiguration(); this.LoadNavItems(); + this.LoadEmbeddingItem(); await base.OnInitializedAsync(); } @@ -175,6 +183,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan await this.UpdateThemeConfiguration(); this.LoadNavItems(); + this.LoadEmbeddingItem(); this.StateHasChanged(); if (this.startupCompleted) _ = this.EnsureMandatoryInfosAcceptedAsync(); @@ -263,6 +272,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan this.Lang = await this.SettingsManager.GetActiveLanguagePlugin(); I18N.Init(this.Lang); this.LoadNavItems(); + this.LoadEmbeddingItem(); await this.InvokeAsync(this.StateHasChanged); if (this.startupCompleted) @@ -273,6 +283,12 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan this.startupCompleted = true; _ = this.EnsureMandatoryInfosAcceptedAsync(); break; + + case Event.RAG_EMBEDDING_STATUS_CHANGED: + this.LoadNavItems(); + this.LoadEmbeddingItem(); + this.StateHasChanged(); + break; } }); } @@ -306,6 +322,32 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan yield return new(T("Settings"), Icons.Material.Filled.Settings, palette.DarkLighten, palette.GrayLight, Routes.SETTINGS, false); } + private void LoadEmbeddingItem() + { + this.embeddingOverview = this.DataSourceEmbeddingService.GetOverview(); + var palette = this.ColorTheme.GetCurrentPalette(this.SettingsManager); + (string icon, string lightcolor, string darkcolor) embeddingIcon = this.embeddingOverview.State switch + { + (DataSourceEmbeddingState.FAILED) => (Icons.Material.Filled.Warning, palette.Error.Value, "#d32f2f"), + (DataSourceEmbeddingState.QUEUED) => (Icons.Material.Filled.Sync, palette.Info.Value, "#1976d2"), + _ => (Icons.Material.Filled.Sync, palette.Warning.Value, "#d29f00"), + }; + this.embeddingItem = new NavBarItem(T("Embeddings"), embeddingIcon.icon, embeddingIcon.lightcolor, embeddingIcon.darkcolor, Routes.EMBEDDINGS, false); + } + + private string EmbeddingNavigationTooltip => this.embeddingOverview.State switch + { + DataSourceEmbeddingState.QUEUED => T("Embeddings are waiting to be processed."), + DataSourceEmbeddingState.RUNNING => string.Format( + T("Embeddings are running: {0} of {1} files are indexed."), + this.embeddingOverview.IndexedFiles, + this.embeddingOverview.TotalFiles), + DataSourceEmbeddingState.FAILED => this.embeddingOverview.FailedFiles > 0 + ? string.Format(T("Some embeddings failed. {0} file(s) need attention."), this.embeddingOverview.FailedFiles) + : T("Some embeddings failed and need attention."), + _ => this.embeddingOverview.NavLabel, + }; + private async Task ShowUpdateDialog() { if(this.currentUpdateResponse is null) diff --git a/app/MindWork AI Studio/Pages/Embeddings.razor b/app/MindWork AI Studio/Pages/Embeddings.razor new file mode 100644 index 00000000..8d1fd01e --- /dev/null +++ b/app/MindWork AI Studio/Pages/Embeddings.razor @@ -0,0 +1,64 @@ +@attribute [Route(Routes.EMBEDDINGS)] +@inherits MSGComponentBase + + + + @T("Background embeddings") + + @T("AI Studio indexes local RAG data sources in the background. Finished files stay recorded so unchanged files can be skipped after a restart, while added or deleted files are detected during the next run.") + + + @string.Format(T("Indexed files: {0}"), this.TotalIndexedFiles) + @string.Format(T("Pending files: {0}"), this.TotalPendingFiles) + @string.Format(T("Failed files: {0}"), this.TotalFailedFiles) + + + + @if (this.Statuses.Count == 0) + { + + @T("No local data source has been queued for embedding yet.") + + } + else + { + @foreach (var status in this.Statuses) + { + + + + @status.DataSourceName + @status.StateLabel + + + + + + @string.Format(T("{0} of {1} files are indexed."), status.IndexedFiles, status.TotalFiles) + + + @if (status.FailedFiles > 0) + { + + @string.Format(T("Failed files: {0}"), status.FailedFiles) + + } + + @if (!string.IsNullOrWhiteSpace(status.CurrentFile)) + { + + @string.Format(T("Current file: {0}"), status.CurrentFile) + + } + + @if (!string.IsNullOrWhiteSpace(status.LastError)) + { + + @status.LastError + + } + + + } + } + diff --git a/app/MindWork AI Studio/Pages/Embeddings.razor.cs b/app/MindWork AI Studio/Pages/Embeddings.razor.cs new file mode 100644 index 00000000..80e0bcbe --- /dev/null +++ b/app/MindWork AI Studio/Pages/Embeddings.razor.cs @@ -0,0 +1,57 @@ +using AIStudio.Components; +using AIStudio.Tools.Services; + +using Microsoft.AspNetCore.Components; + +namespace AIStudio.Pages; + +public partial class Embeddings : MSGComponentBase +{ + [Inject] + private DataSourceEmbeddingService DataSourceEmbeddingService { get; init; } = null!; + + private IReadOnlyList Statuses { get; set; } = []; + + private int TotalIndexedFiles => this.Statuses.Sum(status => status.IndexedFiles); + + private int TotalPendingFiles => this.Statuses.Sum(status => Math.Max(0, status.TotalFiles - status.IndexedFiles - status.FailedFiles)); + + private int TotalFailedFiles => this.Statuses.Sum(status => status.FailedFiles); + + protected override async Task OnInitializedAsync() + { + this.ApplyFilters([], [ Event.RAG_EMBEDDING_STATUS_CHANGED, Event.CONFIGURATION_CHANGED ]); + await base.OnInitializedAsync(); + this.ReloadStatuses(); + } + + protected override Task ProcessIncomingMessage(ComponentBase? sendingComponent, Event triggeredEvent, T? data) where T : default + { + if (triggeredEvent is Event.RAG_EMBEDDING_STATUS_CHANGED or Event.CONFIGURATION_CHANGED) + { + this.ReloadStatuses(); + this.StateHasChanged(); + } + + return Task.CompletedTask; + } + + private void ReloadStatuses() + { + this.Statuses = this.DataSourceEmbeddingService + .GetStatuses() + .OrderBy(status => status.SortOrder) + .ThenBy(status => status.DataSourceName, StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + private static Color GetStatusColor(DataSourceEmbeddingStatus status) => status.State switch + { + DataSourceEmbeddingState.RUNNING => Color.Warning, + DataSourceEmbeddingState.QUEUED => Color.Info, + DataSourceEmbeddingState.FAILED => Color.Error, + DataSourceEmbeddingState.COMPLETED when status.FailedFiles > 0 => Color.Warning, + DataSourceEmbeddingState.COMPLETED => Color.Success, + _ => Color.Default, + }; +} diff --git a/app/MindWork AI Studio/Pages/Information.razor.cs b/app/MindWork AI Studio/Pages/Information.razor.cs index 8f2192a5..302e042e 100644 --- a/app/MindWork AI Studio/Pages/Information.razor.cs +++ b/app/MindWork AI Studio/Pages/Information.razor.cs @@ -29,7 +29,7 @@ public partial class Information : MSGComponentBase private ISnackbar Snackbar { get; init; } = null!; [Inject] - private DatabaseClient DatabaseClient { get; init; } = null!; + private EmbeddingStore EmbeddingStore { get; init; } = null!; private static readonly Assembly ASSEMBLY = Assembly.GetExecutingAssembly(); private static readonly MetaDataAttribute META_DATA = ASSEMBLY.GetCustomAttribute()!; @@ -59,9 +59,9 @@ public partial class Information : MSGComponentBase private string VersionPdfium => $"{T("Used PDFium version")}: v{META_DATA_LIBRARIES.PdfiumVersion}"; - private string VersionDatabase => this.DatabaseClient.IsAvailable - ? $"{T("Database version")}: {this.DatabaseClient.Name} v{META_DATA_DATABASES.DatabaseVersion}" - : $"{T("Database")}: {this.DatabaseClient.Name} - {T("not available")}"; + private string VersionDatabase => this.EmbeddingStore.IsAvailable + ? $"{T("Database version")}: {this.EmbeddingStore.Name} v{META_DATA_DATABASES.DatabaseVersion}" + : $"{T("Database")}: {this.EmbeddingStore.Name} - {T("not available")}"; private string versionPandoc = TB("Determine Pandoc version, please wait..."); private PandocInstallation pandocInstallation; @@ -130,7 +130,7 @@ public partial class Information : MSGComponentBase this.osLanguage = await this.RustService.ReadUserLanguage(); this.logPaths = await this.RustService.GetLogPaths(); - await foreach (var (label, value) in this.DatabaseClient.GetDisplayInfo()) + await foreach (var (label, value) in this.EmbeddingStore.GetDisplayInfo()) { this.databaseDisplayInfo.Add(new DatabaseDisplayInfo(label, value)); } diff --git a/app/MindWork AI Studio/Program.cs b/app/MindWork AI Studio/Program.cs index f2b9b06c..db4c18af 100644 --- a/app/MindWork AI Studio/Program.cs +++ b/app/MindWork AI Studio/Program.cs @@ -2,7 +2,6 @@ using AIStudio.Agents; using AIStudio.Agents.AssistantAudit; using AIStudio.Settings; using AIStudio.Tools.Databases; -using AIStudio.Tools.Databases.Qdrant; using AIStudio.Tools.PluginSystem; using AIStudio.Tools.PluginSystem.Assistants; using AIStudio.Tools.Services; @@ -28,7 +27,7 @@ internal sealed class Program public static string API_TOKEN = null!; public static IServiceProvider SERVICE_PROVIDER = null!; public static ILoggerFactory LOGGER_FACTORY = null!; - public static DatabaseClient DATABASE_CLIENT = null!; + public static EmbeddingStore EMBEDDING_STORE = null!; public static async Task Main() { @@ -87,47 +86,9 @@ internal sealed class Program return; } - var qdrantInfo = await rust.GetQdrantInfo(); - DatabaseClient databaseClient; - if (!qdrantInfo.IsAvailable) - { - Console.WriteLine($"Warning: Qdrant is not available. Starting without vector database. Reason: '{qdrantInfo.UnavailableReason ?? "unknown"}'."); - databaseClient = new NoDatabaseClient("Qdrant", qdrantInfo.UnavailableReason); - } - else - { - if (qdrantInfo.Path == string.Empty) - { - Console.WriteLine("Error: Failed to get the Qdrant path from Rust."); - return; - } - - if (qdrantInfo.PortHttp == 0) - { - Console.WriteLine("Error: Failed to get the Qdrant HTTP port from Rust."); - return; - } - - if (qdrantInfo.PortGrpc == 0) - { - Console.WriteLine("Error: Failed to get the Qdrant gRPC port from Rust."); - return; - } - - if (qdrantInfo.Fingerprint == string.Empty) - { - Console.WriteLine("Error: Failed to get the Qdrant fingerprint from Rust."); - return; - } - - if (qdrantInfo.ApiToken == string.Empty) - { - Console.WriteLine("Error: Failed to get the Qdrant API token from Rust."); - return; - } - - databaseClient = new QdrantClientImplementation("Qdrant", qdrantInfo.Path, qdrantInfo.PortHttp, qdrantInfo.PortGrpc, qdrantInfo.Fingerprint, qdrantInfo.ApiToken); - } + var embeddingStoreConfig = await rust.GetEmbeddingStoreConfiguration(EmbeddingStoreKind.QDRANT_REMOTE); + + EmbeddingStore embeddingStore = EmbeddingStoreFactory.Create(embeddingStoreConfig); var builder = WebApplication.CreateBuilder(); builder.WebHost.ConfigureKestrel(kestrelServerOptions => @@ -173,6 +134,7 @@ internal sealed class Program builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); + builder.Services.AddSingleton(); builder.Services.AddScoped(); builder.Services.AddTransient(); builder.Services.AddTransient(); @@ -183,7 +145,8 @@ internal sealed class Program builder.Services.AddHostedService(); builder.Services.AddHostedService(); builder.Services.AddHostedService(); - builder.Services.AddSingleton(databaseClient); + builder.Services.AddHostedService(sp => sp.GetRequiredService()); + builder.Services.AddSingleton(embeddingStore); builder.Services.AddHostedService(); builder.Services.AddHostedService(); @@ -243,9 +206,9 @@ internal sealed class Program RUST_SERVICE = rust; ENCRYPTION = encryption; - var databaseLogger = app.Services.GetRequiredService>(); - databaseClient.SetLogger(databaseLogger); - DATABASE_CLIENT = databaseClient; + var databaseLogger = app.Services.GetRequiredService>(); + embeddingStore.SetLogger(databaseLogger); + EMBEDDING_STORE = embeddingStore; programLogger.LogInformation("Initialize internal file system."); app.Use(Redirect.HandlerContentAsync); @@ -283,7 +246,7 @@ internal sealed class Program await serverTask; RUST_SERVICE.Dispose(); - DATABASE_CLIENT.Dispose(); + EMBEDDING_STORE.Dispose(); PluginFactory.Dispose(); programLogger.LogInformation("The AI Studio server was stopped."); } diff --git a/app/MindWork AI Studio/Routes.razor.cs b/app/MindWork AI Studio/Routes.razor.cs index 2a0242fb..30279c0d 100644 --- a/app/MindWork AI Studio/Routes.razor.cs +++ b/app/MindWork AI Studio/Routes.razor.cs @@ -4,6 +4,7 @@ public sealed partial class Routes { public const string HOME = "/"; public const string CHAT = "/chat"; + public const string EMBEDDINGS = "/embeddings"; public const string ABOUT = "/about"; public const string ASSISTANTS = "/assistants"; public const string SETTINGS = "/settings"; diff --git a/app/MindWork AI Studio/Tools/Databases/EmbeddingStoragePoint.cs b/app/MindWork AI Studio/Tools/Databases/EmbeddingStoragePoint.cs new file mode 100644 index 00000000..0b14fc2b --- /dev/null +++ b/app/MindWork AI Studio/Tools/Databases/EmbeddingStoragePoint.cs @@ -0,0 +1,16 @@ +namespace AIStudio.Tools.Databases; + +public sealed record EmbeddingStoragePoint( + string PointId, + IReadOnlyList Vector, + string DataSourceId, + string DataSourceName, + string DataSourceType, + string FilePath, + string FileName, + string RelativePath, + int ChunkIndex, + string Text, + string Fingerprint, + DateTime LastWriteUtc, + DateTime EmbeddedAtUtc); diff --git a/app/MindWork AI Studio/Tools/Databases/DatabaseClient.cs b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs similarity index 69% rename from app/MindWork AI Studio/Tools/Databases/DatabaseClient.cs rename to app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs index b80cba94..4993e633 100644 --- a/app/MindWork AI Studio/Tools/Databases/DatabaseClient.cs +++ b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs @@ -1,6 +1,6 @@ namespace AIStudio.Tools.Databases; -public abstract class DatabaseClient(string name, string path) +public abstract class EmbeddingStore(string name, string path) { public string Name => name; @@ -8,7 +8,7 @@ public abstract class DatabaseClient(string name, string path) private string Path => path; - private ILogger? logger; + private ILogger? logger; public abstract IAsyncEnumerable<(string Label, string Value)> GetDisplayInfo(); @@ -45,10 +45,19 @@ public abstract class DatabaseClient(string name, string path) return $"{size:0##} {suffixes[suffixIndex]}"; } - public void SetLogger(ILogger logService) + public void SetLogger(ILogger logService) { this.logger = logService; } + + + public abstract Task EnsureEmbeddingStoreExists(string collectionName, int vectorSize, CancellationToken token); + + public abstract Task InsertEmbedding(string collectionName, IReadOnlyList points, CancellationToken token); + + public abstract Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token); + + public abstract Task DeleteEmbeddingStore(string collectionName, CancellationToken token); public abstract void Dispose(); } \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Databases/EmbeddingStoreFactory.cs b/app/MindWork AI Studio/Tools/Databases/EmbeddingStoreFactory.cs new file mode 100644 index 00000000..9fa51305 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Databases/EmbeddingStoreFactory.cs @@ -0,0 +1,33 @@ +using AIStudio.Tools.Databases.Qdrant; + +namespace AIStudio.Tools.Databases; + +public class EmbeddingStoreFactory +{ + public static EmbeddingStore Create(EmbeddingStoreConfiguration configuration) => configuration.Kind switch + { + EmbeddingStoreKind.NONE => new NoEmbeddingStore(configuration.Name, configuration.UnavailableReason ?? "unknown"), + _ when configuration.Location is null => new NoEmbeddingStore(configuration.Name, $"No location specified for {configuration.Name}"), + EmbeddingStoreKind.QDRANT_REMOTE when configuration.Location is RemoteLocation location=> new QdrantClientImplementation(configuration.Name, location.Path, location.HttpPort, location.GrpcPort, location.Fingerprint, location.ApiToken), + _ => throw new ArgumentException("Invalid configuration for " + configuration.Name, nameof(configuration)), + }; +} + +public enum EmbeddingStoreKind +{ + NONE, + QDRANT_EMBED, + QDRANT_REMOTE, +} + +public abstract record EmbeddingStoreLocation; + +public sealed record EmbeddedLocation(string Path) : EmbeddingStoreLocation; + +public sealed record RemoteLocation(string Path, int? HttpPort, int? GrpcPort, string? Fingerprint, string? ApiToken) : EmbeddingStoreLocation; + +public sealed record EmbeddingStoreConfiguration( + EmbeddingStoreKind Kind, + string Name, + EmbeddingStoreLocation? Location, + string? UnavailableReason); \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Databases/NoDatabaseClient.cs b/app/MindWork AI Studio/Tools/Databases/NoDatabaseClient.cs deleted file mode 100644 index 7b3b0cd4..00000000 --- a/app/MindWork AI Studio/Tools/Databases/NoDatabaseClient.cs +++ /dev/null @@ -1,24 +0,0 @@ -using AIStudio.Tools.PluginSystem; - -namespace AIStudio.Tools.Databases; - -public sealed class NoDatabaseClient(string name, string? unavailableReason) : DatabaseClient(name, string.Empty) -{ - private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(NoDatabaseClient).Namespace, nameof(NoDatabaseClient)); - - public override bool IsAvailable => false; - - public override async IAsyncEnumerable<(string Label, string Value)> GetDisplayInfo() - { - yield return (TB("Status"), TB("Unavailable")); - - if (!string.IsNullOrWhiteSpace(unavailableReason)) - yield return (TB("Reason"), unavailableReason); - - await Task.CompletedTask; - } - - public override void Dispose() - { - } -} \ No newline at end of file diff --git a/app/MindWork AI Studio/Tools/Databases/NoEmbeddingStore.cs b/app/MindWork AI Studio/Tools/Databases/NoEmbeddingStore.cs new file mode 100644 index 00000000..396a4894 --- /dev/null +++ b/app/MindWork AI Studio/Tools/Databases/NoEmbeddingStore.cs @@ -0,0 +1,39 @@ +using AIStudio.Tools.PluginSystem; + +namespace AIStudio.Tools.Databases; + +public sealed class NoEmbeddingStore(string name, string? unavailableReason) : EmbeddingStore(name, string.Empty) +{ + private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(NoEmbeddingStore).Namespace, nameof(NoEmbeddingStore)); + + public override bool IsAvailable => false; + + public override async IAsyncEnumerable<(string Label, string Value)> GetDisplayInfo() + { + yield return (TB("Status"), TB("Unavailable")); + + if (!string.IsNullOrWhiteSpace(unavailableReason)) + yield return (TB("Reason"), unavailableReason); + + await Task.CompletedTask; + } + + public override Task EnsureEmbeddingStoreExists(string collectionName, int vectorSize, CancellationToken token) => throw this.BuildUnavailableException(); + + public override Task InsertEmbedding(string collectionName, IReadOnlyList points, CancellationToken token) => throw this.BuildUnavailableException(); + + public override Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token) => Task.CompletedTask; + + public override Task DeleteEmbeddingStore(string collectionName, CancellationToken token) => Task.CompletedTask; + + public override void Dispose() + { + } + + private InvalidOperationException BuildUnavailableException() + { + return new InvalidOperationException(string.IsNullOrWhiteSpace(unavailableReason) + ? "The vector database is not available." + : unavailableReason); + } +} diff --git a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs index 60a13419..0a7fb022 100644 --- a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs +++ b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs @@ -1,10 +1,11 @@ using Qdrant.Client; using Qdrant.Client.Grpc; using AIStudio.Tools.PluginSystem; +using static Qdrant.Client.Grpc.Conditions; namespace AIStudio.Tools.Databases.Qdrant; -public class QdrantClientImplementation : DatabaseClient +public class QdrantClientImplementation : EmbeddingStore { private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(QdrantClientImplementation).Namespace, nameof(QdrantClientImplementation)); @@ -18,12 +19,12 @@ public class QdrantClientImplementation : DatabaseClient private string ApiToken { get; } - public QdrantClientImplementation(string name, string path, int httpPort, int grpcPort, string fingerprint, string apiToken): base(name, path) + public QdrantClientImplementation(string name, string path, int? httpPort, int? grpcPort, string? fingerprint, string? apiToken): base(name, path) { - this.HttpPort = httpPort; - this.GrpcPort = grpcPort; - this.Fingerprint = fingerprint; - this.ApiToken = apiToken; + this.HttpPort = httpPort ?? 0; + this.GrpcPort = grpcPort ?? 0; + this.Fingerprint = fingerprint ?? string.Empty; + this.ApiToken = apiToken ?? string.Empty; this.GrpcClient = this.CreateQdrantClient(); } @@ -62,5 +63,56 @@ public class QdrantClientImplementation : DatabaseClient yield return (TB("Number of collections"), await this.GetCollectionsAmount()); } + public override async Task EnsureEmbeddingStoreExists(string collectionName, int vectorSize, CancellationToken token) + { + var exists = await this.GrpcClient.CollectionExistsAsync(collectionName, token); + if (exists) + return; + + await this.GrpcClient.CreateCollectionAsync( + collectionName, + new VectorParams + { + Size = (ulong)vectorSize, + Distance = Distance.Cosine, + }, + cancellationToken: token); + } + + public override Task InsertEmbedding(string collectionName, IReadOnlyList points, CancellationToken token) + { + var qdrantPoints = points.Select(point => new PointStruct + { + Id = Guid.Parse(point.PointId), + Vectors = point.Vector.ToArray(), + Payload = + { + ["data_source_id"] = point.DataSourceId, + ["data_source_name"] = point.DataSourceName, + ["data_source_type"] = point.DataSourceType, + ["file_path"] = point.FilePath, + ["file_name"] = point.FileName, + ["relative_path"] = point.RelativePath, + ["chunk_index"] = (long)point.ChunkIndex, + ["text"] = point.Text, + ["fingerprint"] = point.Fingerprint, + ["last_write_utc"] = point.LastWriteUtc.ToString("O"), + ["embedded_at_utc"] = point.EmbeddedAtUtc.ToString("O"), + } + }).ToList(); + + return this.GrpcClient.UpsertAsync(collectionName, qdrantPoints, true, null, null, token); + } + + public override Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token) + { + return this.GrpcClient.DeleteAsync(collectionName, MatchKeyword("file_path", filePath), true, null, null, token); + } + + public override Task DeleteEmbeddingStore(string collectionName, CancellationToken token) + { + return this.GrpcClient.DeleteCollectionAsync(collectionName, cancellationToken: token); + } + public override void Dispose() => this.GrpcClient.Dispose(); -} \ No newline at end of file +} diff --git a/app/MindWork AI Studio/Tools/Event.cs b/app/MindWork AI Studio/Tools/Event.cs index bbec441d..5c3c4c81 100644 --- a/app/MindWork AI Studio/Tools/Event.cs +++ b/app/MindWork AI Studio/Tools/Event.cs @@ -37,6 +37,7 @@ public enum Event // RAG events: RAG_AUTO_DATA_SOURCES_SELECTED, + RAG_EMBEDDING_STATUS_CHANGED, // File attachment events: REGISTER_FILE_DROP_AREA, diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs new file mode 100644 index 00000000..9e2a71bc --- /dev/null +++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs @@ -0,0 +1,1055 @@ +using System.Collections.Concurrent; +using System.Security.Cryptography; +using System.Text; +using System.Text.Json; +using System.Threading.Channels; + +using AIStudio.Provider; +using AIStudio.Settings; +using AIStudio.Settings.DataModel; +using AIStudio.Tools.Databases; +using AIStudio.Tools.PluginSystem; +using AIStudio.Tools.Rust; + +namespace AIStudio.Tools.Services; + +public sealed class DataSourceEmbeddingService : BackgroundService +{ + private const string STATE_FILENAME = "rag-embedding-state.json"; + private const int MAX_CHUNK_LENGTH = 3_200; + private const int MIN_CHUNK_LENGTH = 800; + private const int CHUNK_OVERLAP_LENGTH = 320; + private const int EMBEDDING_BATCH_SIZE = 16; + + private readonly SettingsManager settingsManager; + private readonly RustService rustService; + private readonly EmbeddingStore embeddingStore; + private readonly ILogger logger; + private readonly Channel queue = Channel.CreateUnbounded(); + private readonly ConcurrentDictionary queuedIds = new(StringComparer.OrdinalIgnoreCase); + private readonly ConcurrentDictionary statuses = new(StringComparer.OrdinalIgnoreCase); + private readonly ConcurrentDictionary watchers = new(StringComparer.OrdinalIgnoreCase); + private readonly SemaphoreSlim stateLock = new(1, 1); + private readonly JsonSerializerOptions jsonOptions = new() + { + WriteIndented = true, + }; + + private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService)); + + private Dictionary manifests = new(StringComparer.OrdinalIgnoreCase); + private bool stateLoaded; + + public DataSourceEmbeddingService(SettingsManager settingsManager, RustService rustService, EmbeddingStore embeddingStore, ILogger logger) + { + this.settingsManager = settingsManager; + this.rustService = rustService; + this.embeddingStore = embeddingStore; + this.logger = logger; + } + + public IReadOnlyList GetStatuses() + { + return this.statuses.Values + .OrderBy(status => status.SortOrder) + .ThenBy(status => status.DataSourceName, StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + public DataSourceEmbeddingOverview GetOverview() + { + var orderedStatuses = this.statuses.Values + .OrderBy(status => status.SortOrder) + .ThenBy(status => status.DataSourceName, StringComparer.OrdinalIgnoreCase) + .ToList(); + + var activeStatus = orderedStatuses + .FirstOrDefault(status => status.State is DataSourceEmbeddingState.QUEUED or DataSourceEmbeddingState.RUNNING); + + if (activeStatus is not null) + { + var total = Math.Max(activeStatus.TotalFiles, 1); + return new( + true, + false, + activeStatus.State, + activeStatus.IndexedFiles, + total, + activeStatus.FailedFiles, + $"{TB("Embedding")} {activeStatus.IndexedFiles}/{total}"); + } + + var failedStatus = orderedStatuses + .FirstOrDefault(status => status.State is DataSourceEmbeddingState.FAILED || status.FailedFiles > 0); + + if (failedStatus is not null) + return new(true, true, DataSourceEmbeddingState.FAILED, failedStatus.IndexedFiles, failedStatus.TotalFiles, failedStatus.FailedFiles, TB("Embeddings")); + + return new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, TB("Embeddings")); + } + + public Task QueueAllInternalDataSourcesAsync() + { + this.RefreshWatchers(); + + var tasks = this.settingsManager.ConfigurationData.DataSources + .Where(this.IsSupportedInternalDataSource) + .Select(this.QueueDataSourceAsync); + + return Task.WhenAll(tasks); + } + + public async Task QueueDataSourceAsync(IDataSource dataSource) + { + if (!this.IsSupportedInternalDataSource(dataSource)) + return; + + this.logger.LogInformation("Queueing data source '{DataSourceName}' ({DataSourceId}) for background embeddings.", dataSource.Name, dataSource.Id); + this.RefreshWatchers(); + + if (!this.statuses.TryGetValue(dataSource.Id, out var currentStatus) || currentStatus.State is not DataSourceEmbeddingState.RUNNING) + { + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.QUEUED, + currentStatus?.TotalFiles ?? 0, + currentStatus?.IndexedFiles ?? 0, + currentStatus?.FailedFiles ?? 0, + string.Empty, + string.Empty)); + } + + if (this.queuedIds.TryAdd(dataSource.Id, 0)) + await this.queue.Writer.WriteAsync(dataSource.Id); + } + + public async Task RemoveDataSourceAsync(IDataSource dataSource) + { + if (!this.IsSupportedInternalDataSource(dataSource)) + return; + + this.RemoveWatcher(dataSource.Id); + this.statuses.TryRemove(dataSource.Id, out _); + await this.ResetPersistedStateAsync(dataSource.Id); + this.PublishStatusChanged(); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + await this.WaitForInitialSettingsAndBootstrapAsync(stoppingToken); + + while (!stoppingToken.IsCancellationRequested) + { + var dataSourceId = await this.queue.Reader.ReadAsync(stoppingToken); + this.queuedIds.TryRemove(dataSourceId, out _); + + var dataSource = this.settingsManager.ConfigurationData.DataSources + .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase)); + + if (dataSource is null || !this.IsSupportedInternalDataSource(dataSource)) + continue; + + try + { + await this.ProcessDataSourceAsync(dataSource, stoppingToken); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + catch (Exception exception) + { + this.logger.LogError(exception, "Background embedding failed for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id); + this.UpsertStatus(this.GetFallbackStatus(dataSource, exception.Message)); + } + } + } + + public override void Dispose() + { + foreach (var watcher in this.watchers.Values) + watcher.Dispose(); + + this.watchers.Clear(); + this.stateLock.Dispose(); + base.Dispose(); + } + + private async Task ProcessDataSourceAsync(IDataSource dataSource, CancellationToken token) + { + await this.EnsureStateLoadedAsync(token); + this.logger.LogInformation("Starting background embeddings for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id); + + if (!this.embeddingStore.IsAvailable) + { + this.logger.LogWarning( + "Skipping background embeddings for data source '{DataSourceName}' ({DataSourceId}) because the database client '{DatabaseName}' is unavailable.", + dataSource.Name, + dataSource.Id, + this.embeddingStore.Name); + this.UpsertStatus(this.GetFallbackStatus(dataSource, "The vector database is not available.")); + return; + } + + var embeddingProvider = this.settingsManager.ConfigurationData.EmbeddingProviders.FirstOrDefault(provider => + dataSource is IInternalDataSource internalDataSource && + provider.Id.Equals(internalDataSource.EmbeddingId, StringComparison.OrdinalIgnoreCase)); + + if (embeddingProvider == default || embeddingProvider.UsedLLMProvider is LLMProviders.NONE) + { + this.UpsertStatus(this.GetFallbackStatus(dataSource, "The selected embedding provider is not available.")); + return; + } + + this.logger.LogInformation( + "Using embedding provider '{EmbeddingProviderId}' with model '{EmbeddingModelId}' for data source '{DataSourceName}' ({DataSourceId}).", + embeddingProvider.Id, + embeddingProvider.Model.Id, + dataSource.Name, + dataSource.Id); + + var embeddingSignature = this.BuildEmbeddingSignature(embeddingProvider); + var manifest = await this.GetManifestAsync(dataSource.Id, token); + // A provider/model change invalidates the existing collection because the vectors are no longer comparable. + if (!string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal)) + { + this.logger.LogInformation( + "Embedding configuration changed for data source '{DataSourceName}' ({DataSourceId}). Resetting persisted state and collection '{CollectionName}'.", + dataSource.Name, + dataSource.Id, + this.GetCollectionName(dataSource.Id)); + await this.ResetPersistedStateAsync(dataSource.Id); + manifest = await this.GetManifestAsync(dataSource.Id, token); + manifest.EmbeddingProviderId = embeddingProvider.Id; + manifest.EmbeddingSignature = embeddingSignature; + await this.SaveStateAsync(token); + } + + var inputFiles = this.GetInputFiles(dataSource); + var indexedFiles = inputFiles.Files; + var totalFiles = indexedFiles.Count + inputFiles.FailedFiles; + var existingPaths = indexedFiles + .Select(file => file.FullName) + .ToHashSet(StringComparer.OrdinalIgnoreCase); + + this.logger.LogInformation( + "Prepared data source '{DataSourceName}' ({DataSourceId}) for embedding. AccessibleFiles={AccessibleFiles}, FailedFiles={FailedFiles}, Collection='{CollectionName}'.", + dataSource.Name, + dataSource.Id, + indexedFiles.Count, + inputFiles.FailedFiles, + this.GetCollectionName(dataSource.Id)); + + foreach (var removedFilePath in manifest.Files.Keys.Except(existingPaths, StringComparer.OrdinalIgnoreCase).ToList()) + { + await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), removedFilePath, token); + manifest.Files.Remove(removedFilePath); + this.logger.LogInformation( + "Removed stale embeddings for deleted file '{FilePath}' from data source '{DataSourceName}' ({DataSourceId}).", + removedFilePath, + dataSource.Name, + dataSource.Id); + } + + await this.SaveStateAsync(token); + + // Keep the UI status in sync with the long-running file loop below. + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.RUNNING, + totalFiles, + 0, + inputFiles.FailedFiles, + string.Empty, + inputFiles.LastError)); + + var provider = embeddingProvider.CreateProvider(); + var skippedFiles = 0; + var completedFiles = 0; + var failedFiles = inputFiles.FailedFiles; + var lastError = inputFiles.LastError; + + foreach (var file in indexedFiles) + { + token.ThrowIfCancellationRequested(); + + var fingerprint = this.BuildFingerprint(file); + if (manifest.Files.TryGetValue(file.FullName, out var existingRecord) && + string.Equals(existingRecord.Fingerprint, fingerprint, StringComparison.Ordinal)) + { + this.logger.LogDebug( + "Skipping unchanged file '{FilePath}' for data source '{DataSourceName}' ({DataSourceId}).", + file.FullName, + dataSource.Name, + dataSource.Id); + skippedFiles++; + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.RUNNING, + totalFiles, + skippedFiles + completedFiles, + failedFiles, + string.Empty, + lastError)); + continue; + } + + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.RUNNING, + totalFiles, + skippedFiles + completedFiles, + failedFiles, + file.Name, + lastError)); + + try + { + this.logger.LogInformation( + "Embedding file '{FilePath}' for data source '{DataSourceName}' ({DataSourceId}). Progress={CompletedFiles}/{TotalFiles}.", + file.FullName, + dataSource.Name, + dataSource.Id, + skippedFiles + completedFiles + 1, + totalFiles); + var startedAtUtc = DateTime.UtcNow; + var chunkCount = await this.IndexOneFileAsync(dataSource, file, fingerprint, embeddingProvider, provider, manifest, token); + manifest.Files[file.FullName] = new EmbeddedFileRecord( + fingerprint, + file.Length, + file.LastWriteTimeUtc, + DateTime.UtcNow, + chunkCount); + await this.SaveStateAsync(token); + completedFiles++; + this.logger.LogInformation( + "Embedded file '{FilePath}' for data source '{DataSourceName}' ({DataSourceId}) successfully. Chunks={ChunkCount}, DurationMs={DurationMs}.", + file.FullName, + dataSource.Name, + dataSource.Id, + chunkCount, + (DateTime.UtcNow - startedAtUtc).TotalMilliseconds); + } + catch (Exception exception) + { + failedFiles++; + lastError = exception.Message; + manifest.Files.Remove(file.FullName); + await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), file.FullName, token); + await this.SaveStateAsync(token); + + this.logger.LogWarning(exception, "Failed to embed file '{FilePath}' for data source '{DataSourceName}'.", file.FullName, dataSource.Name); + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.RUNNING, + totalFiles, + skippedFiles + completedFiles, + failedFiles, + file.Name, + exception.Message)); + } + } + + this.UpsertStatus(new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + failedFiles > 0 ? DataSourceEmbeddingState.FAILED : DataSourceEmbeddingState.COMPLETED, + totalFiles, + skippedFiles + completedFiles, + failedFiles, + string.Empty, + failedFiles > 0 + ? string.IsNullOrWhiteSpace(lastError) + ? "Some files could not be embedded. See the logs for details." + : lastError + : string.Empty)); + this.logger.LogInformation( + "Finished background embeddings for data source '{DataSourceName}' ({DataSourceId}). Indexed={IndexedFiles}, Failed={FailedFiles}, Total={TotalFiles}.", + dataSource.Name, + dataSource.Id, + skippedFiles + completedFiles, + failedFiles, + totalFiles); + } + + private async Task IndexOneFileAsync( + IDataSource dataSource, + FileInfo file, + string fingerprint, + EmbeddingProvider embeddingProvider, + IProvider provider, + DataSourceEmbeddingManifest manifest, + CancellationToken token) + { + var collectionName = this.GetCollectionName(dataSource.Id); + this.logger.LogDebug( + "Resetting stored embeddings for file '{FilePath}' in collection '{CollectionName}' before re-indexing.", + file.FullName, + collectionName); + await this.DeleteFilePointsAsync(collectionName, file.FullName, token); + + var batch = new List<(string Text, int ChunkIndex)>(EMBEDDING_BATCH_SIZE); + var totalChunkCount = 0; + + await foreach (var chunk in this.StreamEmbeddingChunksAsync(file.FullName, token)) + { + batch.Add((chunk, totalChunkCount)); + totalChunkCount++; + + if (batch.Count >= EMBEDDING_BATCH_SIZE) + await this.FlushBatchAsync(dataSource, file, fingerprint, embeddingProvider, provider, manifest, collectionName, batch, token); + } + + if (batch.Count > 0) + await this.FlushBatchAsync(dataSource, file, fingerprint, embeddingProvider, provider, manifest, collectionName, batch, token); + + if (totalChunkCount == 0) + throw new InvalidOperationException($"The file '{file.Name}' did not yield any text chunks."); + + this.logger.LogDebug( + "Generated {ChunkCount} chunks for file '{FilePath}' in data source '{DataSourceName}' ({DataSourceId}).", + totalChunkCount, + file.FullName, + dataSource.Name, + dataSource.Id); + + return totalChunkCount; + } + + private async Task FlushBatchAsync( + IDataSource dataSource, + FileInfo file, + string fingerprint, + EmbeddingProvider embeddingProvider, + IProvider provider, + DataSourceEmbeddingManifest manifest, + string collectionName, + List<(string Text, int ChunkIndex)> batch, + CancellationToken token) + { + this.logger.LogDebug( + "Requesting embeddings for batch of {ChunkCount} chunks from file '{FilePath}' in data source '{DataSourceName}' ({DataSourceId}).", + batch.Count, + file.FullName, + dataSource.Name, + dataSource.Id); + + var texts = batch.Select(item => item.Text).ToList(); + var vectors = await provider.EmbedTextAsync(embeddingProvider.Model, this.settingsManager, token, texts); + if (vectors.Count != batch.Count) + throw new InvalidOperationException($"The embedding provider returned {vectors.Count} vectors for {batch.Count} text chunks."); + + var vectorSize = vectors.FirstOrDefault()?.Count ?? 0; + if (vectorSize <= 0) + throw new InvalidOperationException("The embedding provider returned an empty vector."); + + if (manifest.VectorSize > 0 && manifest.VectorSize != vectorSize) + throw new InvalidOperationException($"The embedding vector size changed from {manifest.VectorSize} to {vectorSize}. Please re-save the data source to trigger a clean re-index."); + + if (manifest.VectorSize == 0) + { + // The first successful batch defines the vector size for the whole collection. + manifest.VectorSize = vectorSize; + await this.EnsureCollectionExistsAsync(collectionName, vectorSize, token); + await this.SaveStateAsync(token); + this.logger.LogInformation( + "Created embedding collection '{CollectionName}' with vector size {VectorSize} for data source '{DataSourceName}' ({DataSourceId}).", + collectionName, + vectorSize, + dataSource.Name, + dataSource.Id); + } + + await this.UpsertPointsAsync( + collectionName, + dataSource, + file, + fingerprint, + batch, + vectors, + this.TryGetRelativePath(dataSource, file), + token); + + this.logger.LogDebug( + "Stored {ChunkCount} embedded chunks for file '{FilePath}' in collection '{CollectionName}'.", + batch.Count, + file.FullName, + collectionName); + + batch.Clear(); + } + + private async IAsyncEnumerable StreamEmbeddingChunksAsync(string filePath, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken token) + { + if (this.IsImageFilePath(filePath)) + { + yield return this.BuildImageIndexText(filePath); + yield break; + } + + var currentChunk = new StringBuilder(); + + await foreach (var segment in this.rustService.StreamArbitraryFileData(filePath, token: token)) + { + var normalized = NormalizeChunkSegment(segment); + if (string.IsNullOrWhiteSpace(normalized)) + continue; + + if (currentChunk.Length > 0 && currentChunk.Length + normalized.Length + Environment.NewLine.Length > MAX_CHUNK_LENGTH) + { + if (currentChunk.Length >= MIN_CHUNK_LENGTH) + { + var chunk = currentChunk.ToString().Trim(); + if (!string.IsNullOrWhiteSpace(chunk)) + yield return chunk; + + var overlap = chunk.Length > CHUNK_OVERLAP_LENGTH + ? chunk[^CHUNK_OVERLAP_LENGTH..] + : chunk; + + currentChunk.Clear(); + currentChunk.Append(overlap); + currentChunk.AppendLine(); + } + else + { + currentChunk.AppendLine(); + } + } + + currentChunk.Append(normalized); + currentChunk.AppendLine(); + } + + var finalChunk = currentChunk.ToString().Trim(); + if (!string.IsNullOrWhiteSpace(finalChunk)) + yield return finalChunk; + } + + private async Task EnsureCollectionExistsAsync(string collectionName, int vectorSize, CancellationToken token) + { + await this.embeddingStore.EnsureEmbeddingStoreExists(collectionName, vectorSize, token); + } + + private async Task UpsertPointsAsync( + string collectionName, + IDataSource dataSource, + FileInfo file, + string fingerprint, + IReadOnlyList<(string Text, int ChunkIndex)> batch, + IReadOnlyList> vectors, + string relativePath, + CancellationToken token) + { + var embeddedAtUtc = DateTime.UtcNow; + var points = batch.Select((item, index) => new EmbeddingStoragePoint( + this.CreatePointId(dataSource.Id, fingerprint, item.ChunkIndex), + vectors[index], + dataSource.Id, + dataSource.Name, + dataSource.Type.ToString(), + file.FullName, + file.Name, + relativePath, + item.ChunkIndex, + item.Text, + fingerprint, + file.LastWriteTimeUtc, + embeddedAtUtc)).ToList(); + + await this.embeddingStore.InsertEmbedding(collectionName, points, token); + } + + private async Task DeleteFilePointsAsync(string collectionName, string filePath, CancellationToken token) + { + await this.embeddingStore.DeleteEmbeddingByFile(collectionName, filePath, token); + } + + private async Task DeleteCollectionAsync(string collectionName) + { + await this.embeddingStore.DeleteEmbeddingStore(collectionName, CancellationToken.None); + } + + private async Task EnsureStateLoadedAsync(CancellationToken token) + { + if (this.stateLoaded) + return; + + await this.stateLock.WaitAsync(token); + try + { + if (this.stateLoaded) + return; + + var statePath = this.GetStatePath(); + if (!string.IsNullOrWhiteSpace(statePath) && File.Exists(statePath)) + { + var json = await File.ReadAllTextAsync(statePath, token); + var persistedState = JsonSerializer.Deserialize(json, this.jsonOptions); + this.manifests = persistedState?.DataSources ?? new Dictionary(StringComparer.OrdinalIgnoreCase); + } + + this.stateLoaded = true; + } + finally + { + this.stateLock.Release(); + } + } + + private async Task GetManifestAsync(string dataSourceId, CancellationToken token) + { + await this.EnsureStateLoadedAsync(token); + if (this.manifests.TryGetValue(dataSourceId, out var manifest)) + return manifest; + + manifest = new DataSourceEmbeddingManifest(); + this.manifests[dataSourceId] = manifest; + return manifest; + } + + private async Task SaveStateAsync(CancellationToken token) + { + var statePath = this.GetStatePath(); + if (string.IsNullOrWhiteSpace(statePath)) + return; + + var directory = Path.GetDirectoryName(statePath); + if (!string.IsNullOrWhiteSpace(directory)) + Directory.CreateDirectory(directory); + + var persistedState = new PersistedEmbeddingState + { + DataSources = this.manifests + }; + + var json = JsonSerializer.Serialize(persistedState, this.jsonOptions); + await File.WriteAllTextAsync(statePath, json, token); + } + + private async Task ResetPersistedStateAsync(string dataSourceId) + { + await this.EnsureStateLoadedAsync(CancellationToken.None); + this.manifests.Remove(dataSourceId); + await this.DeleteCollectionAsync(this.GetCollectionName(dataSourceId)); + await this.SaveStateAsync(CancellationToken.None); + this.logger.LogInformation("Reset persisted embedding state for data source '{DataSourceId}'.", dataSourceId); + } + + private async Task WaitForInitialSettingsAndBootstrapAsync(CancellationToken token) + { + while (!token.IsCancellationRequested) + { + if (this.settingsManager.HasCompletedInitialSettingsLoad + && !string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory) + && !string.IsNullOrWhiteSpace(SettingsManager.DataDirectory)) + { + break; + } + + await Task.Delay(250, token); + } + + token.ThrowIfCancellationRequested(); + + this.logger.LogInformation("Embedding background service is ready. Queueing all internal data sources."); + await this.QueueAllInternalDataSourcesAsync(); + } + + private void RefreshWatchers() + { + var supportedSources = this.settingsManager.ConfigurationData.DataSources + .Where(this.IsSupportedInternalDataSource) + .ToDictionary(source => source.Id, StringComparer.OrdinalIgnoreCase); + + foreach (var existingWatcherId in this.watchers.Keys.Except(supportedSources.Keys, StringComparer.OrdinalIgnoreCase).ToList()) + this.RemoveWatcher(existingWatcherId); + + foreach (var dataSource in supportedSources.Values) + this.EnsureWatcher(dataSource); + } + + private void EnsureWatcher(IDataSource dataSource) + { + if (this.watchers.TryGetValue(dataSource.Id, out var existingWatcher)) + { + var existingTarget = $"{existingWatcher.Path}|{existingWatcher.Filter}|{existingWatcher.IncludeSubdirectories}"; + var desiredTarget = this.GetWatcherTarget(dataSource); + if (string.Equals(existingTarget, desiredTarget, StringComparison.OrdinalIgnoreCase)) + return; + + this.RemoveWatcher(dataSource.Id); + } + + var watcher = this.CreateWatcher(dataSource); + if (watcher is null) + return; + + if (!this.watchers.TryAdd(dataSource.Id, watcher)) + watcher.Dispose(); + } + + private FileSystemWatcher? CreateWatcher(IDataSource dataSource) + { + FileSystemWatcher? watcher = dataSource switch + { + DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path) => new FileSystemWatcher(localDirectory.Path) + { + IncludeSubdirectories = true, + NotifyFilter = NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size, + }, + DataSourceLocalFile localFile when File.Exists(localFile.FilePath) && !string.IsNullOrWhiteSpace(Path.GetDirectoryName(localFile.FilePath)) => new FileSystemWatcher(Path.GetDirectoryName(localFile.FilePath)!) + { + Filter = Path.GetFileName(localFile.FilePath), + NotifyFilter = NotifyFilters.FileName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size, + }, + _ => null, + }; + + if (watcher is null) + return null; + + FileSystemEventHandler onChanged = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id); + RenamedEventHandler onRenamed = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id); + ErrorEventHandler onError = (_, errorArgs) => + { + this.logger.LogWarning(errorArgs.GetException(), "The file watcher for data source '{DataSourceId}' failed. Recreating it.", dataSource.Id); + this.RemoveWatcher(dataSource.Id); + this.EnsureWatcher(dataSource); + this.OnWatchedDataSourceChanged(dataSource.Id); + }; + + watcher.Created += onChanged; + watcher.Changed += onChanged; + watcher.Deleted += onChanged; + watcher.Renamed += onRenamed; + watcher.Error += onError; + watcher.EnableRaisingEvents = true; + return watcher; + } + + private string GetWatcherTarget(IDataSource dataSource) => dataSource switch + { + DataSourceLocalDirectory localDirectory => $"{localDirectory.Path}|*.*|True", + DataSourceLocalFile localFile => $"{Path.GetDirectoryName(localFile.FilePath)}|{Path.GetFileName(localFile.FilePath)}|False", + _ => string.Empty + }; + + private void RemoveWatcher(string dataSourceId) + { + if (this.watchers.TryRemove(dataSourceId, out var watcher)) + watcher.Dispose(); + } + + private void OnWatchedDataSourceChanged(string dataSourceId) + { + this.logger.LogInformation("Detected file system change for data source '{DataSourceId}'. Queueing a fresh embedding run.", dataSourceId); + _ = Task.Run(async () => + { + try + { + var dataSource = this.settingsManager.ConfigurationData.DataSources + .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase)); + + if (dataSource is not null) + await this.QueueDataSourceAsync(dataSource); + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Failed to queue watched data source '{DataSourceId}' after a file system change.", dataSourceId); + } + }); + } + + private string GetStatePath() + { + if (string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory)) + return string.Empty; + + return Path.Combine(SettingsManager.ConfigDirectory, STATE_FILENAME); + } + + private FileEnumerationResult GetInputFiles(IDataSource dataSource) + { + var result = new FileEnumerationResult(); + + switch (dataSource) + { + case DataSourceLocalFile localFile when File.Exists(localFile.FilePath): + result.Files.Add(new FileInfo(localFile.FilePath)); + return result; + + case DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path): + this.EnumerateAccessibleFiles(localDirectory.Path, result); + return result; + } + + switch (dataSource) + { + case DataSourceLocalFile localFile: + result.FailedFiles = 1; + result.LastError = $"The selected file '{localFile.FilePath}' does not exist."; + break; + + case DataSourceLocalDirectory localDirectory: + result.FailedFiles = 1; + result.LastError = $"The selected directory '{localDirectory.Path}' does not exist."; + break; + } + + return result; + } + + private void EnumerateAccessibleFiles(string rootPath, FileEnumerationResult result) + { + var pendingDirectories = new Stack(); + pendingDirectories.Push(rootPath); + + while (pendingDirectories.Count > 0) + { + var currentPath = pendingDirectories.Pop(); + IEnumerable subDirectories; + IEnumerable files; + + try + { + subDirectories = Directory.EnumerateDirectories(currentPath); + files = Directory.EnumerateFiles(currentPath); + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Cannot access directory '{DirectoryPath}' while indexing.", currentPath); + result.FailedFiles++; + result.LastError = $"The directory '{currentPath}' could not be accessed."; + continue; + } + + foreach (var filePath in files) + { + FileInfo fileInfo; + try + { + fileInfo = new FileInfo(filePath); + if (!fileInfo.Exists) + continue; + } + catch (Exception exception) + { + this.logger.LogWarning(exception, "Cannot inspect file '{FilePath}' while indexing.", filePath); + result.FailedFiles++; + result.LastError = $"The file '{filePath}' could not be inspected."; + continue; + } + + result.Files.Add(fileInfo); + } + + foreach (var subDirectory in subDirectories) + pendingDirectories.Push(subDirectory); + } + } + + private string TryGetRelativePath(IDataSource dataSource, FileInfo file) => dataSource switch + { + DataSourceLocalDirectory localDirectory => Path.GetRelativePath(localDirectory.Path, file.FullName), + _ => file.Name + }; + + private static string NormalizeChunkSegment(string input) + { + return input + .Replace("\r\n", "\n", StringComparison.Ordinal) + .Replace('\r', '\n') + .Trim(); + } + + private bool IsImageFilePath(string filePath) + { + return FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE); + } + + private string BuildImageIndexText(string filePath) + { + var fileName = Path.GetFileName(filePath); + var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(filePath); + var extension = Path.GetExtension(filePath).TrimStart('.'); + var normalizedName = fileNameWithoutExtension + .Replace('_', ' ') + .Replace('-', ' ') + .Trim(); + + return $$""" + Image asset + File name: {{fileName}} + Type: {{extension}} + Search terms: {{normalizedName}} + Path: {{filePath}} + Note: The current RAG embedding pipeline stores image files by metadata only. Visual content is not OCRed or captioned yet. + """; + } + + private string BuildEmbeddingSignature(EmbeddingProvider embeddingProvider) + { + return string.Join('|', + embeddingProvider.Id, + embeddingProvider.UsedLLMProvider, + embeddingProvider.Model.Id, + embeddingProvider.Host, + embeddingProvider.Hostname, + embeddingProvider.TokenizerPath); + } + + private string BuildFingerprint(FileInfo file) + { + var fingerprintSource = $"{file.FullName}|{file.Length}|{file.LastWriteTimeUtc.Ticks}"; + var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(fingerprintSource)); + return Convert.ToHexString(bytes); + } + + private string GetCollectionName(string dataSourceId) + { + var safeId = dataSourceId + .ToLowerInvariant() + .Replace("-", string.Empty, StringComparison.Ordinal); + + return $"rag_{safeId}"; + } + + private string CreatePointId(string dataSourceId, string fingerprint, int chunkIndex) + { + var source = $"{dataSourceId}:{fingerprint}:{chunkIndex}"; + var hash = SHA256.HashData(Encoding.UTF8.GetBytes(source)); + var guidBytes = hash[..16].ToArray(); + + // Mark the bytes as an RFC 4122 version 4 UUID so Qdrant accepts the ID format. + guidBytes[6] = (byte)((guidBytes[6] & 0x0F) | 0x40); + guidBytes[8] = (byte)((guidBytes[8] & 0x3F) | 0x80); + + return new Guid(guidBytes).ToString(); + } + + private bool IsSupportedInternalDataSource(IDataSource dataSource) + { + return dataSource is DataSourceLocalDirectory or DataSourceLocalFile; + } + + private DataSourceEmbeddingStatus GetFallbackStatus(IDataSource dataSource, string errorMessage) + { + return new DataSourceEmbeddingStatus( + dataSource.Id, + dataSource.Name, + dataSource.Type, + DataSourceEmbeddingState.FAILED, + 0, + 0, + 1, + string.Empty, + errorMessage); + } + + private void UpsertStatus(DataSourceEmbeddingStatus status) + { + this.statuses[status.DataSourceId] = status; + this.PublishStatusChanged(); + } + + private void PublishStatusChanged() + { + _ = MessageBus.INSTANCE.SendMessage(null, Event.RAG_EMBEDDING_STATUS_CHANGED, true); + } +} + +public sealed record DataSourceEmbeddingOverview( + bool IsVisible, + bool ShowIconOnly, + DataSourceEmbeddingState State, + int IndexedFiles, + int TotalFiles, + int FailedFiles, + string NavLabel); + +public enum DataSourceEmbeddingState +{ + IDLE, + QUEUED, + RUNNING, + COMPLETED, + FAILED, +} + +public sealed record DataSourceEmbeddingStatus( + string DataSourceId, + string DataSourceName, + DataSourceType DataSourceType, + DataSourceEmbeddingState State, + int TotalFiles, + int IndexedFiles, + int FailedFiles, + string CurrentFile, + string LastError) +{ + private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService)); + + public int ProgressPercent => this.TotalFiles <= 0 ? 0 : Math.Clamp((int)Math.Round(this.IndexedFiles * 100d / this.TotalFiles), 0, 100); + + public string StateLabel => this.State switch + { + DataSourceEmbeddingState.QUEUED => TB("Queued"), + DataSourceEmbeddingState.RUNNING => TB("Running"), + DataSourceEmbeddingState.COMPLETED => TB("Completed"), + DataSourceEmbeddingState.FAILED => TB("Needs attention"), + _ => TB("Idle") + }; + + public int SortOrder => this.State switch + { + DataSourceEmbeddingState.RUNNING => 0, + DataSourceEmbeddingState.QUEUED => 1, + DataSourceEmbeddingState.FAILED => 2, + DataSourceEmbeddingState.COMPLETED => 3, + _ => 4, + }; +} + +public sealed class FileEnumerationResult +{ + public List Files { get; } = []; + + public int FailedFiles { get; set; } + + public string LastError { get; set; } = string.Empty; +} + +public sealed class PersistedEmbeddingState +{ + public Dictionary DataSources { get; init; } = new(StringComparer.OrdinalIgnoreCase); +} + +public sealed class DataSourceEmbeddingManifest +{ + public string EmbeddingProviderId { get; set; } = string.Empty; + + public string EmbeddingSignature { get; set; } = string.Empty; + + public int VectorSize { get; set; } + + public Dictionary Files { get; init; } = new(StringComparer.OrdinalIgnoreCase); +} + +public sealed record EmbeddedFileRecord( + string Fingerprint, + long FileSize, + DateTime LastWriteUtc, + DateTime EmbeddedAtUtc, + int ChunkCount); diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Databases.cs b/app/MindWork AI Studio/Tools/Services/RustService.Databases.cs index a43f6c61..4fcc891b 100644 --- a/app/MindWork AI Studio/Tools/Services/RustService.Databases.cs +++ b/app/MindWork AI Studio/Tools/Services/RustService.Databases.cs @@ -1,9 +1,46 @@ -using AIStudio.Tools.Rust; +using AIStudio.Tools.Databases; +using AIStudio.Tools.Rust; namespace AIStudio.Tools.Services; public sealed partial class RustService { + public async Task GetEmbeddingStoreConfiguration(EmbeddingStoreKind kind) + { + switch (kind) + { + case EmbeddingStoreKind.QDRANT_REMOTE: + { + var qdrantInfo = await this.GetQdrantInfo(); + var invalidFields = new List(); + if (!qdrantInfo.IsAvailable) + invalidFields.Add(qdrantInfo.UnavailableReason ?? "unknown"); + if (string.IsNullOrWhiteSpace(qdrantInfo.Path)) + invalidFields.Add("Path"); + if (qdrantInfo.PortHttp == 0) + invalidFields.Add("HttpPort"); + if (qdrantInfo.PortGrpc == 0) + invalidFields.Add("GrpcPort"); + if (string.IsNullOrWhiteSpace(qdrantInfo.Fingerprint)) + invalidFields.Add("Fingerprint"); + if (string.IsNullOrWhiteSpace(qdrantInfo.ApiToken)) + invalidFields.Add("ApiToken"); + if (invalidFields.Count <= 0) return new EmbeddingStoreConfiguration(kind, "Qdrant", new RemoteLocation(qdrantInfo.Path, qdrantInfo.PortHttp, qdrantInfo.PortGrpc, qdrantInfo.Fingerprint, qdrantInfo.ApiToken), null); + var reason = string.Join(", ", invalidFields); + Console.WriteLine($"Warning: Qdrant is not available. Starting without vector database. Reason: '{reason}'."); + + return new EmbeddingStoreConfiguration( + EmbeddingStoreKind.NONE, + "Qdrant", + null, + reason); + + } + default: + return new EmbeddingStoreConfiguration(kind, kind.ToString(), null, $"No configuration available for {kind}"); + } + } + public async Task GetQdrantInfo() { try diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs index 6d63f022..2f250be7 100644 --- a/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs +++ b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs @@ -1,5 +1,6 @@ using System.Text; using System.Text.Json; +using System.Runtime.CompilerServices; namespace AIStudio.Tools.Services; @@ -48,6 +49,9 @@ public sealed partial class RustService } catch (JsonException) { + if (this.TryLogSseErrorMessage(jsonContent, path)) + continue; + this.logger?.LogError("Failed to deserialize SSE event: {JsonContent}", jsonContent); } } @@ -65,4 +69,77 @@ public sealed partial class RustService return resultBuilder.ToString(); } -} \ No newline at end of file + + public async IAsyncEnumerable StreamArbitraryFileData(string path, bool extractImages = false, [EnumeratorCancellation] CancellationToken token = default) + { + var streamId = Guid.NewGuid().ToString(); + var requestUri = $"/retrieval/fs/extract?path={Uri.EscapeDataString(path)}&stream_id={streamId}&extract_images={extractImages}"; + using var request = new HttpRequestMessage(HttpMethod.Get, requestUri); + using var response = await this.http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); + + if (!response.IsSuccessStatusCode) + yield break; + + string? finalContentChunk = null; + try + { + await using var stream = await response.Content.ReadAsStreamAsync(token); + using var reader = new StreamReader(stream); + + while (!reader.EndOfStream && !token.IsCancellationRequested) + { + var line = await reader.ReadLineAsync(token); + if (string.IsNullOrWhiteSpace(line)) + continue; + + if (!line.StartsWith("data:", StringComparison.InvariantCulture)) + continue; + + var jsonContent = line[5..]; + ContentStreamSseEvent? sseEvent = null; + try + { + sseEvent = JsonSerializer.Deserialize(jsonContent); + } + catch (JsonException) + { + if (this.TryLogSseErrorMessage(jsonContent, path)) + continue; + + this.logger?.LogError("Failed to deserialize SSE event: {JsonContent}", jsonContent); + } + + if (sseEvent is null) + continue; + + var content = ContentStreamSseHandler.ProcessEvent(sseEvent, extractImages); + if (!string.IsNullOrWhiteSpace(content)) + yield return content; + } + } + finally + { + finalContentChunk = ContentStreamSseHandler.Clear(streamId); + } + + if (!string.IsNullOrWhiteSpace(finalContentChunk)) + yield return finalContentChunk; + } + + private bool TryLogSseErrorMessage(string jsonContent, string path) + { + try + { + var errorMessage = JsonSerializer.Deserialize(jsonContent); + if (string.IsNullOrWhiteSpace(errorMessage)) + return false; + + this.logger?.LogError("Rust retrieval stream error for '{Path}': {ErrorMessage}", path, errorMessage); + return true; + } + catch (JsonException) + { + return false; + } + } +}