diff --git a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
index bc4d79c0..cdf2f458 100644
--- a/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
+++ b/app/MindWork AI Studio/Assistants/I18N/allTexts.lua
@@ -4837,15 +4837,24 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T108494
-- Are you sure you want to delete the data source '{0}' of type {1}?
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1096979935"] = "Are you sure you want to delete the data source '{0}' of type {1}?"
+-- Automatic local data source refresh
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1208397349"] = "Automatic local data source refresh"
+
-- Edit Local Directory Data Source
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1215599168"] = "Edit Local Directory Data Source"
+-- Refresh
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T135637716"] = "Refresh"
+
-- Add Local Directory as Data Source
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1454193397"] = "Add Local Directory as Data Source"
-- Delete
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1469573738"] = "Delete"
+-- Refresh all
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1503082343"] = "Refresh all"
+
-- External (ERI)
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T1652430727"] = "External (ERI)"
@@ -4900,6 +4909,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T352566
-- No data sources configured yet.
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3549650120"] = "No data sources configured yet."
+-- Local data sources refresh when files change.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3687976654"] = "Local data sources refresh when files change."
+
-- Actions
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T3865031940"] = "Actions"
@@ -4912,6 +4924,9 @@ UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T590005
-- External Data (ERI-Server v1)
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T774473996"] = "External Data (ERI-Server v1)"
+-- Local data sources refresh only when triggered manually.
+UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T854231603"] = "Local data sources refresh only when triggered manually."
+
-- Local Directory
UI_TEXT_CONTENT["AISTUDIO::DIALOGS::SETTINGS::SETTINGSDIALOGDATASOURCES::T926703547"] = "Local Directory"
@@ -7571,25 +7586,19 @@ UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T378481461"] = "Source like p
UI_TEXT_CONTENT["AISTUDIO::TOOLS::RUST::FILETYPES::T4165204724"] = "Document"
-- Running
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1160324588"] = "Running"
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1160324588"] = "Running"
-- Idle
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1168775091"] = "Idle"
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1168775091"] = "Idle"
-- Needs attention
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T1566837660"] = "Needs attention"
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T1566837660"] = "Needs attention"
-- Queued
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2655222900"] = "Queued"
-
--- Embedding
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T2838542994"] = "Embedding"
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T2655222900"] = "Queued"
-- Completed
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T3968379570"] = "Completed"
-
--- Embeddings
-UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGSERVICE::T951463987"] = "Embeddings"
+UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::DATASOURCEEMBEDDINGMODELS::T3968379570"] = "Completed"
-- Pandoc Installation
UI_TEXT_CONTENT["AISTUDIO::TOOLS::SERVICES::PANDOCAVAILABILITYSERVICE::T185447014"] = "Pandoc Installation"
diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
index c23a7948..7968e410 100644
--- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
+++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor
@@ -123,7 +123,8 @@
AdornmentColor="Color.Info"
Validation="@this.providerValidation.ValidatingInstanceName"
UserAttributes="@SPELLCHECK_ATTRIBUTES"/>
- @if (this.DataModel != default){
+ @if (this.DataLLMProvider != LLMProviders.NONE)
+ {
@T("For better embeddings and less storage usage, it's recommended to use a custom tokenizer to enable a more accurate token count.")
@@ -137,8 +138,7 @@
Error="@(!string.IsNullOrWhiteSpace(this.dataCustomTokenizerValidationIssue))"
ErrorText="@(this.dataCustomTokenizerValidationIssue)"
Validation="@this.providerValidation.ValidatingCustomTokenizer"
- OnClear = "@this.ClearPathTokenizer"
- />
+ OnClear="@this.ClearPathTokenizer" />
}
@if (this.dataStoreWasAttempted)
diff --git a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
index 10f4e661..20eabb28 100644
--- a/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
+++ b/app/MindWork AI Studio/Dialogs/EmbeddingProviderDialog.razor.cs
@@ -2,6 +2,7 @@ using AIStudio.Chat;
using AIStudio.Components;
using AIStudio.Provider;
using AIStudio.Settings;
+using AIStudio.Tools.Rust;
using AIStudio.Tools.Services;
using AIStudio.Tools.Validation;
@@ -115,7 +116,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
GetPreviousInstanceName = () => this.dataEditingPreviousInstanceName,
GetUsedInstanceNames = () => this.UsedInstanceNames,
GetHost = () => this.DataHost,
- IsModelProvidedManually = () => this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is Host.OLLAMA,
+ IsModelProvidedManually = () => this.DataLLMProvider.IsEmbeddingModelProvidedManually(this.DataHost),
GetCustomTokenizerValidationIssue = () => this.dataCustomTokenizerValidationIssue,
};
}
@@ -126,7 +127,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
Model model = default;
if(this.DataLLMProvider is LLMProviders.SELF_HOSTED)
{
- if (this.DataHost is Host.OLLAMA)
+ if (this.DataLLMProvider.IsEmbeddingModelProvidedManually(this.DataHost))
model = new Model(this.dataManuallyModel, null);
else if (this.DataHost is Host.LM_STUDIO)
model = this.DataModel;
@@ -176,7 +177,7 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
//
// We cannot load the API key for self-hosted providers:
//
- if (this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is not Host.OLLAMA)
+ if (this.DataLLMProvider is LLMProviders.SELF_HOSTED && this.DataHost is not Host.OLLAMA && this.DataHost is not Host.VLLM)
{
await this.ReloadModels();
await base.OnInitializedAsync();
@@ -241,10 +242,10 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
if (!this.dataIsValid)
return;
- var response = await this.RustService.StoreTokenizer(TokenizerModelId.ForEmbeddingProviderId(this.DataId), this.dataFilePath);
+ var response = await this.StoreOrDeleteTokenizerAsync();
if (!response.Success)
{
- this.dataCustomTokenizerValidationIssue = response.Message;
+ this.dataCustomTokenizerValidationIssue = string.IsNullOrWhiteSpace(response.Message) ? string.Empty : response.Message;
await this.form.Validate();
return;
}
@@ -340,6 +341,15 @@ public partial class EmbeddingProviderDialog : MSGComponentBase, ISecretId
}
}
+ private Task StoreOrDeleteTokenizerAsync()
+ {
+ var tokenizerId = TokenizerModelId.ForEmbeddingProviderId(this.DataId);
+ if (string.IsNullOrWhiteSpace(this.dataFilePath))
+ return this.RustService.DeleteTokenizer(tokenizerId);
+
+ return this.RustService.StoreTokenizer(tokenizerId, this.dataFilePath);
+ }
+
private void OnHostChanged(Host selectedHost)
{
// When the host changes, reset the model selection state:
diff --git a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
index cd5e9bdc..d04d9a5c 100644
--- a/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
+++ b/app/MindWork AI Studio/Dialogs/ProviderDialog.razor.cs
@@ -4,6 +4,7 @@ using System.Text.Json;
using AIStudio.Components;
using AIStudio.Provider;
using AIStudio.Provider.HuggingFace;
+using AIStudio.Tools.Rust;
using AIStudio.Tools.Services;
using AIStudio.Tools.Validation;
@@ -268,7 +269,7 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
if (!this.dataIsValid)
return;
- var tokenizerResponse = await this.RustService.StoreTokenizer(TokenizerModelId.ForProviderId(this.DataId), this.dataFilePath);
+ var tokenizerResponse = await this.StoreOrDeleteTokenizerAsync();
if (!tokenizerResponse.Success)
{
this.dataCustomTokenizerValidationIssue = tokenizerResponse.Message;
@@ -367,6 +368,15 @@ public partial class ProviderDialog : MSGComponentBase, ISecretId
}
}
+ private Task StoreOrDeleteTokenizerAsync()
+ {
+ var tokenizerId = TokenizerModelId.ForProviderId(this.DataId);
+ if (string.IsNullOrWhiteSpace(this.dataFilePath))
+ return this.RustService.DeleteTokenizer(tokenizerId);
+
+ return this.RustService.StoreTokenizer(tokenizerId, this.dataFilePath);
+ }
+
private void OnHostChanged(Host selectedHost)
{
// When the host changes, reset the model selection state:
diff --git a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor
index 74b15fdb..1f867e2a 100644
--- a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor
+++ b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor
@@ -14,6 +14,13 @@
@T("You might configure different data sources. A data source can include one file, all files in a directory, or data from your company. Later, you can incorporate these data sources as needed when the AI requires this data to complete a certain task.")
+
+
+
+ @T("Refresh all")
+
+
+
@@ -38,6 +45,9 @@
+
+ @T("Refresh")
+
@T("Edit")
@@ -73,4 +83,4 @@
@T("Close")
-
\ No newline at end of file
+
diff --git a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs
index af71e3a9..020a87d6 100644
--- a/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs
+++ b/app/MindWork AI Studio/Dialogs/Settings/SettingsDialogDataSources.razor.cs
@@ -28,6 +28,39 @@ public partial class SettingsDialogDataSources : SettingsDialogBase
return T("Unknown");
}
+
+ private bool CanRefreshDataSource(IDataSource dataSource)
+ {
+ return dataSource is DataSourceLocalDirectory or DataSourceLocalFile;
+ }
+
+ private bool HasRefreshableDataSources()
+ {
+ return this.SettingsManager.ConfigurationData.DataSources.Any(this.CanRefreshDataSource);
+ }
+
+ private async Task AutomaticRefreshChanged(bool enabled)
+ {
+ this.SettingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh = enabled;
+ await this.SettingsManager.StoreSettings();
+ this.DataSourceEmbeddingService.RefreshAutomaticWatchers();
+ await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED);
+ }
+
+ private async Task RefreshAllDataSources()
+ {
+ await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync();
+ await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED);
+ }
+
+ private async Task RefreshDataSource(IDataSource dataSource)
+ {
+ if (!this.CanRefreshDataSource(dataSource))
+ return;
+
+ await this.DataSourceEmbeddingService.QueueDataSourceAsync(dataSource);
+ await this.MessageBus.SendMessage(this, Event.CONFIGURATION_CHANGED);
+ }
private async Task AddDataSource(DataSourceType type)
{
diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor b/app/MindWork AI Studio/Layout/MainLayout.razor
index 7d4f28e9..aad5b4c8 100644
--- a/app/MindWork AI Studio/Layout/MainLayout.razor
+++ b/app/MindWork AI Studio/Layout/MainLayout.razor
@@ -25,7 +25,7 @@
- @if (this.embeddingOverview.IsVisible)
+ @if (this.showEmbeddingStatusIcon)
{
@@ -64,7 +64,7 @@
- @if (this.embeddingOverview.IsVisible)
+ @if (this.showEmbeddingStatusIcon)
{
@if (this.SettingsManager.ConfigurationData.App.NavigationBehavior is NavBehavior.NEVER_EXPAND_USE_TOOLTIPS)
diff --git a/app/MindWork AI Studio/Layout/MainLayout.razor.cs b/app/MindWork AI Studio/Layout/MainLayout.razor.cs
index f96cc2d2..f99446ff 100644
--- a/app/MindWork AI Studio/Layout/MainLayout.razor.cs
+++ b/app/MindWork AI Studio/Layout/MainLayout.razor.cs
@@ -60,9 +60,10 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan
private bool startupCompleted;
private readonly SemaphoreSlim mandatoryInfoDialogSemaphore = new(1, 1);
- private DataSourceEmbeddingOverview embeddingOverview = new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, string.Empty);
+ private DataSourceEmbeddingOverview embeddingOverview = new(false, DataSourceEmbeddingState.COMPLETED, 0, 0, 0);
private IReadOnlyCollection navItems = [];
- private NavBarItem embeddingItem = new NavBarItem(string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, false);
+ private NavBarItem embeddingItem = new (string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, false);
+ private bool showEmbeddingStatusIcon = false;
#region Overrides of ComponentBase
@@ -93,7 +94,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan
// Ensure that all settings are loaded:
await this.SettingsManager.LoadSettings();
- await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesAsync();
+ await this.DataSourceEmbeddingService.QueueAllInternalDataSourcesIfAutomaticRefreshAsync();
// Register this component with the message bus:
this.MessageBus.RegisterComponent(this);
@@ -325,6 +326,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan
private void LoadEmbeddingItem()
{
this.embeddingOverview = this.DataSourceEmbeddingService.GetOverview();
+ this.showEmbeddingStatusIcon = this.embeddingOverview.IsVisible;
var palette = this.ColorTheme.GetCurrentPalette(this.SettingsManager);
(string icon, string lightcolor, string darkcolor) embeddingIcon = this.embeddingOverview.State switch
{
@@ -345,7 +347,7 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan
DataSourceEmbeddingState.FAILED => this.embeddingOverview.FailedFiles > 0
? string.Format(T("Some embeddings failed. {0} file(s) need attention."), this.embeddingOverview.FailedFiles)
: T("Some embeddings failed and need attention."),
- _ => this.embeddingOverview.NavLabel,
+ _ => string.Empty
};
private async Task ShowUpdateDialog()
@@ -508,4 +510,4 @@ public partial class MainLayout : LayoutComponentBase, IMessageBusReceiver, ILan
}
#endregion
-}
\ No newline at end of file
+}
diff --git a/app/MindWork AI Studio/Program.cs b/app/MindWork AI Studio/Program.cs
index db4c18af..04233468 100644
--- a/app/MindWork AI Studio/Program.cs
+++ b/app/MindWork AI Studio/Program.cs
@@ -88,7 +88,7 @@ internal sealed class Program
var embeddingStoreConfig = await rust.GetEmbeddingStoreConfiguration(EmbeddingStoreKind.QDRANT_REMOTE);
- EmbeddingStore embeddingStore = EmbeddingStoreFactory.Create(embeddingStoreConfig);
+ var embeddingStore = EmbeddingStoreFactory.Create(embeddingStoreConfig);
var builder = WebApplication.CreateBuilder();
builder.WebHost.ConfigureKestrel(kestrelServerOptions =>
diff --git a/app/MindWork AI Studio/Settings/DataModel/Data.cs b/app/MindWork AI Studio/Settings/DataModel/Data.cs
index b8f429cc..31581611 100644
--- a/app/MindWork AI Studio/Settings/DataModel/Data.cs
+++ b/app/MindWork AI Studio/Settings/DataModel/Data.cs
@@ -124,6 +124,8 @@ public sealed class Data
public DataAgentRetrievalContextValidation AgentRetrievalContextValidation { get; init; } = new();
+ public DataDataSourceIndexing DataSourceIndexing { get; init; } = new();
+
public DataAssistantPluginAudit AssistantPluginAudit { get; init; } = new(x => x.AssistantPluginAudit);
public DataAgenda Agenda { get; init; } = new();
diff --git a/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs b/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs
new file mode 100644
index 00000000..fdcb8997
--- /dev/null
+++ b/app/MindWork AI Studio/Settings/DataModel/DataDataSourceIndexing.cs
@@ -0,0 +1,9 @@
+namespace AIStudio.Settings.DataModel;
+
+public sealed class DataDataSourceIndexing
+{
+ ///
+ /// Whether local data source embeddings should refresh automatically when files change.
+ ///
+ public bool AutomaticRefresh { get; set; } = true;
+}
diff --git a/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs
index 4993e633..abbae8ef 100644
--- a/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs
+++ b/app/MindWork AI Studio/Tools/Databases/EmbeddingStore.cs
@@ -35,14 +35,15 @@ public abstract class EmbeddingStore(string name, string path)
{
string[] suffixes = { "B", "KB", "MB", "GB", "TB", "PB" };
int suffixIndex = 0;
+ double convertedSize = size;
- while (size >= 1024 && suffixIndex < suffixes.Length - 1)
+ while (convertedSize >= 1024 && suffixIndex < suffixes.Length - 1)
{
- size /= 1024;
+ convertedSize /= 1024;
suffixIndex++;
}
- return $"{size:0##} {suffixes[suffixIndex]}";
+ return $"{convertedSize:0.##} {suffixes[suffixIndex]}";
}
public void SetLogger(ILogger logService)
diff --git a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs
index 0a7fb022..a4d47842 100644
--- a/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs
+++ b/app/MindWork AI Studio/Tools/Databases/Qdrant/QdrantClientImplementation.cs
@@ -1,5 +1,6 @@
using Qdrant.Client;
using Qdrant.Client.Grpc;
+using Grpc.Core;
using AIStudio.Tools.PluginSystem;
using static Qdrant.Client.Grpc.Conditions;
@@ -104,14 +105,32 @@ public class QdrantClientImplementation : EmbeddingStore
return this.GrpcClient.UpsertAsync(collectionName, qdrantPoints, true, null, null, token);
}
- public override Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token)
+ public override async Task DeleteEmbeddingByFile(string collectionName, string filePath, CancellationToken token)
{
- return this.GrpcClient.DeleteAsync(collectionName, MatchKeyword("file_path", filePath), true, null, null, token);
+ try
+ {
+ await this.GrpcClient.DeleteAsync(collectionName, MatchKeyword("file_path", filePath), true, null, null, token);
+ }
+ catch (RpcException exception) when (exception.StatusCode is StatusCode.NotFound)
+ {
+ return;
+ }
}
- public override Task DeleteEmbeddingStore(string collectionName, CancellationToken token)
+ public override async Task DeleteEmbeddingStore(string collectionName, CancellationToken token)
{
- return this.GrpcClient.DeleteCollectionAsync(collectionName, cancellationToken: token);
+ var exists = await this.GrpcClient.CollectionExistsAsync(collectionName, token);
+ if (!exists)
+ return;
+
+ try
+ {
+ await this.GrpcClient.DeleteCollectionAsync(collectionName, cancellationToken: token);
+ }
+ catch (RpcException exception) when (exception.StatusCode is StatusCode.NotFound)
+ {
+ return;
+ }
}
public override void Dispose() => this.GrpcClient.Dispose();
diff --git a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
index 4a02608e..623a03b0 100644
--- a/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
+++ b/app/MindWork AI Studio/Tools/Rust/FileTypes.cs
@@ -59,7 +59,7 @@ public static class FileTypes
// Media hierarchy
public static readonly FileTypeFilter IMAGE = FileTypeFilter.Leaf(TB("Image"),
- "jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic");
+ "jpg", "jpeg", "png", "gif", "bmp", "tiff", "svg", "webp", "heic", "avif");
public static readonly FileTypeFilter AUDIO = FileTypeFilter.Leaf(TB("Audio"),
"mp3", "wav", "wave", "aac", "flac", "ogg", "m4a", "wma", "alac", "aiff", "m4b");
public static readonly FileTypeFilter VIDEO = FileTypeFilter.Leaf(TB("Video"),
diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs
new file mode 100644
index 00000000..be016cad
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingModels.cs
@@ -0,0 +1,86 @@
+using AIStudio.Settings.DataModel;
+using AIStudio.Tools.PluginSystem;
+
+namespace AIStudio.Tools.Services;
+
+public sealed record DataSourceEmbeddingOverview(
+ bool IsVisible,
+ DataSourceEmbeddingState State,
+ int IndexedFiles,
+ int TotalFiles,
+ int FailedFiles);
+
+public enum DataSourceEmbeddingState
+{
+ IDLE,
+ QUEUED,
+ RUNNING,
+ COMPLETED,
+ FAILED,
+}
+
+public sealed record DataSourceEmbeddingStatus(
+ string DataSourceId,
+ string DataSourceName,
+ DataSourceType DataSourceType,
+ DataSourceEmbeddingState State,
+ int TotalFiles,
+ int IndexedFiles,
+ int FailedFiles,
+ string CurrentFile,
+ string LastError)
+{
+ private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService));
+
+ public int ProgressPercent => this.TotalFiles <= 0 ? 0 : Math.Clamp((int)Math.Round(this.IndexedFiles * 100d / this.TotalFiles), 0, 100);
+
+ public string StateLabel => this.State switch
+ {
+ DataSourceEmbeddingState.QUEUED => TB("Queued"),
+ DataSourceEmbeddingState.RUNNING => TB("Running"),
+ DataSourceEmbeddingState.COMPLETED => TB("Completed"),
+ DataSourceEmbeddingState.FAILED => TB("Needs attention"),
+ _ => TB("Idle")
+ };
+
+ public int SortOrder => this.State switch
+ {
+ DataSourceEmbeddingState.RUNNING => 0,
+ DataSourceEmbeddingState.QUEUED => 1,
+ DataSourceEmbeddingState.FAILED => 2,
+ DataSourceEmbeddingState.COMPLETED => 3,
+ _ => 4,
+ };
+}
+
+public sealed class FileEnumerationResult
+{
+ public List Files { get; } = [];
+
+ public int FailedFiles { get; set; }
+
+ public string LastError { get; set; } = string.Empty;
+}
+
+public sealed class PersistedEmbeddingState
+{
+ public Dictionary DataSources { get; init; } = new(StringComparer.OrdinalIgnoreCase);
+}
+
+public sealed class DataSourceEmbeddingManifest
+{
+ public string EmbeddingProviderId { get; set; } = string.Empty;
+
+ public string EmbeddingSignature { get; set; } = string.Empty;
+
+ public int VectorSize { get; set; }
+
+ public Dictionary Files { get; init; } = new(StringComparer.OrdinalIgnoreCase);
+}
+
+public sealed record EmbeddedFileRecord(
+ string Fingerprint,
+ long FileSize,
+ DateTime LastWriteUtc,
+ DateTime EmbeddedAtUtc,
+ int ChunkCount);
diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs
new file mode 100644
index 00000000..80ab1ef4
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Files.cs
@@ -0,0 +1,242 @@
+using System.Security.Cryptography;
+using System.Text;
+
+using AIStudio.Settings;
+using AIStudio.Settings.DataModel;
+using AIStudio.Tools.PluginSystem;
+using AIStudio.Tools.Rust;
+
+namespace AIStudio.Tools.Services;
+
+public sealed partial class DataSourceEmbeddingService
+{
+ private static readonly string[] ADDITIONAL_RAG_FILE_EXTENSIONS = ["csv", "tsv", "ods", "xlsm", "xlsb", "xla", "xlam"];
+ private static readonly string[] SKIPPED_RAG_FILE_EXTENSIONS = ["lnk"];
+
+ private async IAsyncEnumerable StreamEmbeddingChunksAsync(string filePath, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken token)
+ {
+ if (this.IsImageFilePath(filePath))
+ {
+ yield return this.BuildImageIndexText(filePath);
+ yield break;
+ }
+
+ var currentChunk = new StringBuilder();
+
+ await foreach (var segment in this.rustService.StreamArbitraryFileData(filePath, token: token))
+ {
+ var normalized = NormalizeChunkSegment(segment);
+ if (string.IsNullOrWhiteSpace(normalized))
+ continue;
+
+ if (currentChunk.Length > 0 && currentChunk.Length + normalized.Length + Environment.NewLine.Length > MAX_CHUNK_LENGTH)
+ {
+ if (currentChunk.Length >= MIN_CHUNK_LENGTH)
+ {
+ var chunk = currentChunk.ToString().Trim();
+ if (!string.IsNullOrWhiteSpace(chunk))
+ yield return chunk;
+
+ var overlap = chunk.Length > CHUNK_OVERLAP_LENGTH
+ ? chunk[^CHUNK_OVERLAP_LENGTH..]
+ : chunk;
+
+ currentChunk.Clear();
+ currentChunk.Append(overlap);
+ currentChunk.AppendLine();
+ }
+ else
+ {
+ currentChunk.AppendLine();
+ }
+ }
+
+ currentChunk.Append(normalized);
+ currentChunk.AppendLine();
+ }
+
+ var finalChunk = currentChunk.ToString().Trim();
+ if (!string.IsNullOrWhiteSpace(finalChunk))
+ yield return finalChunk;
+ }
+
+ private FileEnumerationResult GetInputFiles(IDataSource dataSource)
+ {
+ var result = new FileEnumerationResult();
+
+ switch (dataSource)
+ {
+ case DataSourceLocalFile localFile when File.Exists(localFile.FilePath):
+ if (this.IsSupportedRagFilePath(localFile.FilePath))
+ {
+ result.Files.Add(new FileInfo(localFile.FilePath));
+ }
+ else
+ {
+ result.FailedFiles = 1;
+ result.LastError = $"The selected file '{localFile.FilePath}' is not supported for background embeddings.";
+ }
+
+ return result;
+
+ case DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path):
+ this.EnumerateAccessibleFiles(localDirectory.Path, result);
+ return result;
+ }
+
+ switch (dataSource)
+ {
+ case DataSourceLocalFile localFile:
+ result.FailedFiles = 1;
+ result.LastError = $"The selected file '{localFile.FilePath}' does not exist.";
+ break;
+
+ case DataSourceLocalDirectory localDirectory:
+ result.FailedFiles = 1;
+ result.LastError = $"The selected directory '{localDirectory.Path}' does not exist.";
+ break;
+ }
+
+ return result;
+ }
+
+ private void EnumerateAccessibleFiles(string rootPath, FileEnumerationResult result)
+ {
+ var pendingDirectories = new Stack();
+ pendingDirectories.Push(rootPath);
+
+ while (pendingDirectories.Count > 0)
+ {
+ var currentPath = pendingDirectories.Pop();
+ IEnumerable subDirectories;
+ IEnumerable files;
+
+ try
+ {
+ subDirectories = Directory.EnumerateDirectories(currentPath);
+ files = Directory.EnumerateFiles(currentPath);
+ }
+ catch (Exception exception)
+ {
+ this.logger.LogWarning(exception, "Cannot access directory '{DirectoryPath}' while indexing.", currentPath);
+ result.FailedFiles++;
+ result.LastError = $"The directory '{currentPath}' could not be accessed.";
+ continue;
+ }
+
+ foreach (var filePath in files)
+ {
+ FileInfo fileInfo;
+ try
+ {
+ fileInfo = new FileInfo(filePath);
+ if (!fileInfo.Exists)
+ continue;
+ }
+ catch (Exception exception)
+ {
+ this.logger.LogWarning(exception, "Cannot inspect file '{FilePath}' while indexing.", filePath);
+ result.FailedFiles++;
+ result.LastError = $"The file '{filePath}' could not be inspected.";
+ continue;
+ }
+
+ if (!this.IsSupportedRagFilePath(fileInfo.FullName))
+ continue;
+
+ result.Files.Add(fileInfo);
+ }
+
+ foreach (var subDirectory in subDirectories)
+ pendingDirectories.Push(subDirectory);
+ }
+ }
+
+ private string TryGetRelativePath(IDataSource dataSource, FileInfo file) => dataSource switch
+ {
+ DataSourceLocalDirectory localDirectory => Path.GetRelativePath(localDirectory.Path, file.FullName),
+ _ => file.Name
+ };
+
+ private static string NormalizeChunkSegment(string input)
+ {
+ return input
+ .Replace("\r\n", "\n", StringComparison.Ordinal)
+ .Replace('\r', '\n')
+ .Trim();
+ }
+
+ private bool IsImageFilePath(string filePath)
+ {
+ return FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE);
+ }
+
+ private bool IsSupportedRagFilePath(string filePath)
+ {
+ var extension = Path.GetExtension(filePath).TrimStart('.');
+ if (SKIPPED_RAG_FILE_EXTENSIONS.Contains(extension, StringComparer.OrdinalIgnoreCase))
+ return false;
+
+ return FileTypes.IsAllowedPath(filePath, FileTypes.DOCUMENT, FileTypes.IMAGE)
+ || ADDITIONAL_RAG_FILE_EXTENSIONS.Contains(extension, StringComparer.OrdinalIgnoreCase);
+ }
+
+ private string BuildImageIndexText(string filePath)
+ {
+ var fileName = Path.GetFileName(filePath);
+ var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(filePath);
+ var extension = Path.GetExtension(filePath).TrimStart('.');
+ var normalizedName = fileNameWithoutExtension
+ .Replace('_', ' ')
+ .Replace('-', ' ')
+ .Trim();
+
+ return $$"""
+ Image asset
+ File name: {{fileName}}
+ Type: {{extension}}
+ Search terms: {{normalizedName}}
+ Path: {{filePath}}
+ Note: The current RAG embedding pipeline stores image files by metadata only. Visual content is not OCRed or captioned yet.
+ """;
+ }
+
+ private string BuildEmbeddingSignature(EmbeddingProvider embeddingProvider)
+ {
+ return string.Join('|',
+ embeddingProvider.Id,
+ embeddingProvider.UsedLLMProvider,
+ embeddingProvider.Model.Id,
+ embeddingProvider.Host,
+ embeddingProvider.Hostname,
+ embeddingProvider.TokenizerPath);
+ }
+
+ private string BuildFingerprint(FileInfo file)
+ {
+ var fingerprintSource = $"{file.FullName}|{file.Length}|{file.LastWriteTimeUtc.Ticks}";
+ var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(fingerprintSource));
+ return Convert.ToHexString(bytes);
+ }
+
+ private string GetCollectionName(string dataSourceId)
+ {
+ var safeId = dataSourceId
+ .ToLowerInvariant()
+ .Replace("-", string.Empty, StringComparison.Ordinal);
+
+ return $"rag_{safeId}";
+ }
+
+ private string CreatePointId(string dataSourceId, string fingerprint, int chunkIndex)
+ {
+ var source = $"{dataSourceId}:{fingerprint}:{chunkIndex}";
+ var hash = SHA256.HashData(Encoding.UTF8.GetBytes(source));
+ var guidBytes = hash[..16].ToArray();
+
+ guidBytes[6] = (byte)((guidBytes[6] & 0x0F) | 0x40);
+ guidBytes[8] = (byte)((guidBytes[8] & 0x3F) | 0x80);
+
+ return new Guid(guidBytes).ToString();
+ }
+}
diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs
new file mode 100644
index 00000000..b08bb3d4
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.State.cs
@@ -0,0 +1,89 @@
+using System.Text.Json;
+
+using AIStudio.Settings;
+
+namespace AIStudio.Tools.Services;
+
+public sealed partial class DataSourceEmbeddingService
+{
+ private const string STATE_FILENAME = "rag-embedding-state.json";
+
+ private readonly JsonSerializerOptions jsonOptions = new()
+ {
+ WriteIndented = true,
+ };
+
+ private async Task EnsureStateLoadedAsync(CancellationToken token)
+ {
+ if (this.stateLoaded)
+ return;
+
+ await this.stateLock.WaitAsync(token);
+ try
+ {
+ if (this.stateLoaded)
+ return;
+
+ var statePath = this.GetStatePath();
+ if (!string.IsNullOrWhiteSpace(statePath) && File.Exists(statePath))
+ {
+ var json = await File.ReadAllTextAsync(statePath, token);
+ var persistedState = JsonSerializer.Deserialize(json, this.jsonOptions);
+ this.manifests = persistedState?.DataSources ?? new Dictionary(StringComparer.OrdinalIgnoreCase);
+ }
+
+ this.stateLoaded = true;
+ }
+ finally
+ {
+ this.stateLock.Release();
+ }
+ }
+
+ private async Task GetManifestAsync(string dataSourceId, CancellationToken token)
+ {
+ await this.EnsureStateLoadedAsync(token);
+ if (this.manifests.TryGetValue(dataSourceId, out var manifest))
+ return manifest;
+
+ manifest = new DataSourceEmbeddingManifest();
+ this.manifests[dataSourceId] = manifest;
+ return manifest;
+ }
+
+ private async Task SaveStateAsync(CancellationToken token)
+ {
+ var statePath = this.GetStatePath();
+ if (string.IsNullOrWhiteSpace(statePath))
+ return;
+
+ var directory = Path.GetDirectoryName(statePath);
+ if (!string.IsNullOrWhiteSpace(directory))
+ Directory.CreateDirectory(directory);
+
+ var persistedState = new PersistedEmbeddingState
+ {
+ DataSources = this.manifests
+ };
+
+ var json = JsonSerializer.Serialize(persistedState, this.jsonOptions);
+ await File.WriteAllTextAsync(statePath, json, token);
+ }
+
+ private async Task ResetPersistedStateAsync(string dataSourceId)
+ {
+ await this.EnsureStateLoadedAsync(CancellationToken.None);
+ this.manifests.Remove(dataSourceId);
+ await this.DeleteCollectionAsync(this.GetCollectionName(dataSourceId));
+ await this.SaveStateAsync(CancellationToken.None);
+ this.logger.LogInformation("Reset persisted embedding state for data source '{DataSourceId}'.", dataSourceId);
+ }
+
+ private string GetStatePath()
+ {
+ if (string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory))
+ return string.Empty;
+
+ return Path.Combine(SettingsManager.ConfigDirectory, STATE_FILENAME);
+ }
+}
diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs
new file mode 100644
index 00000000..75cfa154
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.Watchers.cs
@@ -0,0 +1,226 @@
+using System.Collections.Concurrent;
+using AIStudio.Settings;
+using AIStudio.Settings.DataModel;
+
+namespace AIStudio.Tools.Services;
+
+public sealed partial class DataSourceEmbeddingService
+{
+ private const int WATCHER_DEBOUNCE_SECONDS = 2;
+
+ private readonly ConcurrentDictionary watchers = new(StringComparer.OrdinalIgnoreCase);
+ private readonly Dictionary watcherDebounceTokens = new(StringComparer.OrdinalIgnoreCase);
+ private readonly object watcherDebounceLock = new();
+
+ private void RefreshWatchers()
+ {
+ if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh)
+ {
+ this.RemoveAllWatchers();
+ return;
+ }
+
+ var supportedSources = this.settingsManager.ConfigurationData.DataSources
+ .Where(this.IsSupportedInternalDataSource)
+ .ToDictionary(source => source.Id, StringComparer.OrdinalIgnoreCase);
+
+ foreach (var existingWatcherId in this.watchers.Keys.Except(supportedSources.Keys, StringComparer.OrdinalIgnoreCase).ToList())
+ this.RemoveWatcher(existingWatcherId);
+
+ foreach (var dataSource in supportedSources.Values)
+ this.EnsureWatcher(dataSource);
+ }
+
+ private void EnsureWatcher(IDataSource dataSource)
+ {
+ if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh)
+ return;
+
+ var configuration = GetWatchConfiguration(dataSource);
+ if (configuration is null)
+ return;
+
+ if (this.watchers.TryGetValue(dataSource.Id, out var existingRegistration))
+ {
+ if (IsSameWatchConfiguration(existingRegistration.Configuration, configuration))
+ return;
+
+ this.RemoveWatcher(dataSource.Id);
+ }
+
+ var watcher = this.CreateWatcher(dataSource.Id, configuration);
+ if (watcher is null)
+ return;
+
+ if (!this.watchers.TryAdd(dataSource.Id, new DataSourceWatcherRegistration(watcher, configuration)))
+ watcher.Dispose();
+ }
+
+ private FileSystemWatcher? CreateWatcher(string dataSourceId, DataSourceWatcherConfiguration configuration)
+ {
+ try
+ {
+ var watcher = new FileSystemWatcher(configuration.RootPath)
+ {
+ Filter = configuration.Filter,
+ IncludeSubdirectories = configuration.IncludeSubdirectories,
+ NotifyFilter = NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size,
+ };
+
+ watcher.Changed += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId);
+ watcher.Deleted += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId);
+ watcher.Created += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId);
+ watcher.Renamed += (_, _) => this.OnWatchedDataSourceChanged(dataSourceId);
+ watcher.Error += (_, args) =>
+ {
+ this.logger.LogWarning(args.GetException(), "The file watcher for data source '{DataSourceId}' failed. Recreating it.", dataSourceId);
+ this.RemoveWatcher(dataSourceId);
+ this.EnsureWatcher(dataSourceId);
+ this.OnWatchedDataSourceChanged(dataSourceId);
+ };
+ watcher.EnableRaisingEvents = true;
+ return watcher;
+ }
+ catch (Exception exception)
+ {
+ this.logger.LogWarning(exception, "Failed to create file watcher for data source '{DataSourceId}' at '{RootPath}'.", dataSourceId, configuration.RootPath);
+ return null;
+ }
+ }
+
+ private void RemoveWatcher(string dataSourceId)
+ {
+ this.CancelPendingWatcherRefresh(dataSourceId);
+
+ if (this.watchers.TryRemove(dataSourceId, out var registration))
+ registration.Watcher.Dispose();
+ }
+
+ private void RemoveAllWatchers()
+ {
+ foreach (var watcherId in this.watchers.Keys.ToList())
+ this.RemoveWatcher(watcherId);
+ }
+
+ private void DisposeWatchers()
+ {
+ this.CancelAllPendingWatcherRefreshes();
+
+ foreach (var registration in this.watchers.Values)
+ registration.Watcher.Dispose();
+
+ this.watchers.Clear();
+ }
+
+ private void OnWatchedDataSourceChanged(string dataSourceId)
+ {
+ if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh)
+ return;
+
+ this.logger.LogDebug("Detected file system change for data source '{DataSourceId}'. Scheduling a debounced embedding run.", dataSourceId);
+ var debounceToken = new CancellationTokenSource();
+
+ lock (this.watcherDebounceLock)
+ {
+ if (this.watcherDebounceTokens.Remove(dataSourceId, out var existingToken))
+ existingToken.Cancel();
+
+ this.watcherDebounceTokens[dataSourceId] = debounceToken;
+ }
+
+ _ = Task.Run(async () =>
+ {
+ try
+ {
+ await Task.Delay(TimeSpan.FromSeconds(WATCHER_DEBOUNCE_SECONDS), debounceToken.Token);
+ if (!this.TryCompletePendingWatcherRefresh(dataSourceId, debounceToken))
+ return;
+
+ var dataSource = this.settingsManager.ConfigurationData.DataSources
+ .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase));
+
+ if (dataSource is not null)
+ {
+ this.logger.LogInformation("Queueing data source '{DataSourceName}' ({DataSourceId}) after file system changes settled.", dataSource.Name, dataSource.Id);
+ await this.QueueDataSourceAsync(dataSource);
+ }
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ catch (Exception exception)
+ {
+ this.logger.LogWarning(exception, "Failed to queue watched data source '{DataSourceId}' after a file system change.", dataSourceId);
+ }
+ finally
+ {
+ debounceToken.Dispose();
+ }
+ });
+ }
+
+ private void EnsureWatcher(string dataSourceId)
+ {
+ var dataSource = this.settingsManager.ConfigurationData.DataSources
+ .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase));
+
+ if (dataSource is not null)
+ this.EnsureWatcher(dataSource);
+ }
+
+ private void CancelPendingWatcherRefresh(string dataSourceId)
+ {
+ lock (this.watcherDebounceLock)
+ {
+ if (this.watcherDebounceTokens.Remove(dataSourceId, out var token))
+ token.Cancel();
+ }
+ }
+
+ private void CancelAllPendingWatcherRefreshes()
+ {
+ lock (this.watcherDebounceLock)
+ {
+ foreach (var token in this.watcherDebounceTokens.Values)
+ token.Cancel();
+
+ this.watcherDebounceTokens.Clear();
+ }
+ }
+
+ private bool TryCompletePendingWatcherRefresh(string dataSourceId, CancellationTokenSource debounceToken)
+ {
+ lock (this.watcherDebounceLock)
+ {
+ if (!this.watcherDebounceTokens.TryGetValue(dataSourceId, out var currentToken) || !ReferenceEquals(currentToken, debounceToken))
+ return false;
+
+ this.watcherDebounceTokens.Remove(dataSourceId);
+ return true;
+ }
+ }
+
+ private static DataSourceWatcherConfiguration? GetWatchConfiguration(IDataSource dataSource) => dataSource switch
+ {
+ DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path) => new DataSourceWatcherConfiguration(
+ localDirectory.Path,
+ "*.*",
+ true),
+ DataSourceLocalFile localFile when File.Exists(localFile.FilePath) && !string.IsNullOrWhiteSpace(Path.GetDirectoryName(localFile.FilePath)) => new DataSourceWatcherConfiguration(
+ Path.GetDirectoryName(localFile.FilePath)!,
+ Path.GetFileName(localFile.FilePath),
+ false),
+ _ => null,
+ };
+
+ private static bool IsSameWatchConfiguration(DataSourceWatcherConfiguration left, DataSourceWatcherConfiguration right)
+ {
+ return left.IncludeSubdirectories == right.IncludeSubdirectories
+ && string.Equals(left.RootPath, right.RootPath, StringComparison.OrdinalIgnoreCase)
+ && string.Equals(left.Filter, right.Filter, StringComparison.OrdinalIgnoreCase);
+ }
+
+ private sealed record DataSourceWatcherConfiguration(string RootPath, string Filter, bool IncludeSubdirectories);
+
+ private sealed record DataSourceWatcherRegistration(FileSystemWatcher Watcher, DataSourceWatcherConfiguration Configuration);
+}
diff --git a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs
index 9e2a71bc..c82de777 100644
--- a/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs
+++ b/app/MindWork AI Studio/Tools/Services/DataSourceEmbeddingService.cs
@@ -1,7 +1,5 @@
using System.Collections.Concurrent;
-using System.Security.Cryptography;
-using System.Text;
-using System.Text.Json;
+using System.Diagnostics.CodeAnalysis;
using System.Threading.Channels;
using AIStudio.Provider;
@@ -13,9 +11,8 @@ using AIStudio.Tools.Rust;
namespace AIStudio.Tools.Services;
-public sealed class DataSourceEmbeddingService : BackgroundService
+public sealed partial class DataSourceEmbeddingService : BackgroundService
{
- private const string STATE_FILENAME = "rag-embedding-state.json";
private const int MAX_CHUNK_LENGTH = 3_200;
private const int MIN_CHUNK_LENGTH = 800;
private const int CHUNK_OVERLAP_LENGTH = 320;
@@ -28,12 +25,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService
private readonly Channel queue = Channel.CreateUnbounded();
private readonly ConcurrentDictionary queuedIds = new(StringComparer.OrdinalIgnoreCase);
private readonly ConcurrentDictionary statuses = new(StringComparer.OrdinalIgnoreCase);
- private readonly ConcurrentDictionary watchers = new(StringComparer.OrdinalIgnoreCase);
private readonly SemaphoreSlim stateLock = new(1, 1);
- private readonly JsonSerializerOptions jsonOptions = new()
- {
- WriteIndented = true,
- };
private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService));
@@ -58,11 +50,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService
public DataSourceEmbeddingOverview GetOverview()
{
- var orderedStatuses = this.statuses.Values
- .OrderBy(status => status.SortOrder)
- .ThenBy(status => status.DataSourceName, StringComparer.OrdinalIgnoreCase)
- .ToList();
-
+ var orderedStatuses = this.GetStatuses();
var activeStatus = orderedStatuses
.FirstOrDefault(status => status.State is DataSourceEmbeddingState.QUEUED or DataSourceEmbeddingState.RUNNING);
@@ -71,21 +59,19 @@ public sealed class DataSourceEmbeddingService : BackgroundService
var total = Math.Max(activeStatus.TotalFiles, 1);
return new(
true,
- false,
activeStatus.State,
activeStatus.IndexedFiles,
total,
- activeStatus.FailedFiles,
- $"{TB("Embedding")} {activeStatus.IndexedFiles}/{total}");
+ activeStatus.FailedFiles);
}
var failedStatus = orderedStatuses
.FirstOrDefault(status => status.State is DataSourceEmbeddingState.FAILED || status.FailedFiles > 0);
if (failedStatus is not null)
- return new(true, true, DataSourceEmbeddingState.FAILED, failedStatus.IndexedFiles, failedStatus.TotalFiles, failedStatus.FailedFiles, TB("Embeddings"));
+ return new(true, DataSourceEmbeddingState.FAILED, failedStatus.IndexedFiles, failedStatus.TotalFiles, failedStatus.FailedFiles);
- return new(false, true, DataSourceEmbeddingState.COMPLETED, 0, 0, 0, TB("Embeddings"));
+ return new(false, DataSourceEmbeddingState.COMPLETED, 0, 0, 0);
}
public Task QueueAllInternalDataSourcesAsync()
@@ -99,6 +85,22 @@ public sealed class DataSourceEmbeddingService : BackgroundService
return Task.WhenAll(tasks);
}
+ public Task QueueAllInternalDataSourcesIfAutomaticRefreshAsync()
+ {
+ if (!this.settingsManager.ConfigurationData.DataSourceIndexing.AutomaticRefresh)
+ {
+ this.RefreshWatchers();
+ return Task.CompletedTask;
+ }
+
+ return this.QueueAllInternalDataSourcesAsync();
+ }
+
+ public void RefreshAutomaticWatchers()
+ {
+ this.RefreshWatchers();
+ }
+
public async Task QueueDataSourceAsync(IDataSource dataSource)
{
if (!this.IsSupportedInternalDataSource(dataSource))
@@ -106,23 +108,14 @@ public sealed class DataSourceEmbeddingService : BackgroundService
this.logger.LogInformation("Queueing data source '{DataSourceName}' ({DataSourceId}) for background embeddings.", dataSource.Name, dataSource.Id);
this.RefreshWatchers();
+ this.logger.LogDebug("Adding watcher for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id);
if (!this.statuses.TryGetValue(dataSource.Id, out var currentStatus) || currentStatus.State is not DataSourceEmbeddingState.RUNNING)
- {
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
- DataSourceEmbeddingState.QUEUED,
- currentStatus?.TotalFiles ?? 0,
- currentStatus?.IndexedFiles ?? 0,
- currentStatus?.FailedFiles ?? 0,
- string.Empty,
- string.Empty));
- }
-
+ this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.QUEUED, currentStatus?.TotalFiles ?? 0, currentStatus?.IndexedFiles ?? 0, currentStatus?.FailedFiles ?? 0));
+ this.logger.LogDebug("Upserting status for data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id);
if (this.queuedIds.TryAdd(dataSource.Id, 0))
await this.queue.Writer.WriteAsync(dataSource.Id);
+ this.logger.LogDebug("Queued data source '{DataSourceName}' ({DataSourceId}).", dataSource.Name, dataSource.Id);
}
public async Task RemoveDataSourceAsync(IDataSource dataSource)
@@ -169,10 +162,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService
public override void Dispose()
{
- foreach (var watcher in this.watchers.Values)
- watcher.Dispose();
-
- this.watchers.Clear();
+ this.DisposeWatchers();
this.stateLock.Dispose();
base.Dispose();
}
@@ -193,11 +183,7 @@ public sealed class DataSourceEmbeddingService : BackgroundService
return;
}
- var embeddingProvider = this.settingsManager.ConfigurationData.EmbeddingProviders.FirstOrDefault(provider =>
- dataSource is IInternalDataSource internalDataSource &&
- provider.Id.Equals(internalDataSource.EmbeddingId, StringComparison.OrdinalIgnoreCase));
-
- if (embeddingProvider == default || embeddingProvider.UsedLLMProvider is LLMProviders.NONE)
+ if (!this.TryResolveEmbeddingProvider(dataSource, out var embeddingProvider))
{
this.UpsertStatus(this.GetFallbackStatus(dataSource, "The selected embedding provider is not available."));
return;
@@ -210,29 +196,11 @@ public sealed class DataSourceEmbeddingService : BackgroundService
dataSource.Name,
dataSource.Id);
- var embeddingSignature = this.BuildEmbeddingSignature(embeddingProvider);
- var manifest = await this.GetManifestAsync(dataSource.Id, token);
- // A provider/model change invalidates the existing collection because the vectors are no longer comparable.
- if (!string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal))
- {
- this.logger.LogInformation(
- "Embedding configuration changed for data source '{DataSourceName}' ({DataSourceId}). Resetting persisted state and collection '{CollectionName}'.",
- dataSource.Name,
- dataSource.Id,
- this.GetCollectionName(dataSource.Id));
- await this.ResetPersistedStateAsync(dataSource.Id);
- manifest = await this.GetManifestAsync(dataSource.Id, token);
- manifest.EmbeddingProviderId = embeddingProvider.Id;
- manifest.EmbeddingSignature = embeddingSignature;
- await this.SaveStateAsync(token);
- }
-
+ var collectionName = this.GetCollectionName(dataSource.Id);
+ var manifest = await this.EnsureCompatibleManifestAsync(dataSource, embeddingProvider, collectionName, token);
var inputFiles = this.GetInputFiles(dataSource);
var indexedFiles = inputFiles.Files;
var totalFiles = indexedFiles.Count + inputFiles.FailedFiles;
- var existingPaths = indexedFiles
- .Select(file => file.FullName)
- .ToHashSet(StringComparer.OrdinalIgnoreCase);
this.logger.LogInformation(
"Prepared data source '{DataSourceName}' ({DataSourceId}) for embedding. AccessibleFiles={AccessibleFiles}, FailedFiles={FailedFiles}, Collection='{CollectionName}'.",
@@ -240,32 +208,18 @@ public sealed class DataSourceEmbeddingService : BackgroundService
dataSource.Id,
indexedFiles.Count,
inputFiles.FailedFiles,
- this.GetCollectionName(dataSource.Id));
-
- foreach (var removedFilePath in manifest.Files.Keys.Except(existingPaths, StringComparer.OrdinalIgnoreCase).ToList())
- {
- await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), removedFilePath, token);
- manifest.Files.Remove(removedFilePath);
- this.logger.LogInformation(
- "Removed stale embeddings for deleted file '{FilePath}' from data source '{DataSourceName}' ({DataSourceId}).",
- removedFilePath,
- dataSource.Name,
- dataSource.Id);
- }
+ collectionName);
+ await this.RemoveMissingFileEmbeddingsAsync(dataSource, collectionName, manifest, indexedFiles, token);
await this.SaveStateAsync(token);
- // Keep the UI status in sync with the long-running file loop below.
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
+ this.UpsertStatus(this.CreateStatus(
+ dataSource,
DataSourceEmbeddingState.RUNNING,
totalFiles,
0,
inputFiles.FailedFiles,
- string.Empty,
- inputFiles.LastError));
+ lastError: inputFiles.LastError));
var provider = embeddingProvider.CreateProvider();
var skippedFiles = 0;
@@ -287,29 +241,11 @@ public sealed class DataSourceEmbeddingService : BackgroundService
dataSource.Name,
dataSource.Id);
skippedFiles++;
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
- DataSourceEmbeddingState.RUNNING,
- totalFiles,
- skippedFiles + completedFiles,
- failedFiles,
- string.Empty,
- lastError));
+ this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, lastError: lastError));
continue;
}
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
- DataSourceEmbeddingState.RUNNING,
- totalFiles,
- skippedFiles + completedFiles,
- failedFiles,
- file.Name,
- lastError));
+ this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, file.Name, lastError));
try
{
@@ -343,37 +279,15 @@ public sealed class DataSourceEmbeddingService : BackgroundService
failedFiles++;
lastError = exception.Message;
manifest.Files.Remove(file.FullName);
- await this.DeleteFilePointsAsync(this.GetCollectionName(dataSource.Id), file.FullName, token);
+ await this.DeleteFilePointsAsync(collectionName, file.FullName, token);
await this.SaveStateAsync(token);
this.logger.LogWarning(exception, "Failed to embed file '{FilePath}' for data source '{DataSourceName}'.", file.FullName, dataSource.Name);
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
- DataSourceEmbeddingState.RUNNING,
- totalFiles,
- skippedFiles + completedFiles,
- failedFiles,
- file.Name,
- exception.Message));
+ this.UpsertStatus(this.CreateStatus(dataSource, DataSourceEmbeddingState.RUNNING, totalFiles, skippedFiles + completedFiles, failedFiles, file.Name, exception.Message));
}
}
- this.UpsertStatus(new DataSourceEmbeddingStatus(
- dataSource.Id,
- dataSource.Name,
- dataSource.Type,
- failedFiles > 0 ? DataSourceEmbeddingState.FAILED : DataSourceEmbeddingState.COMPLETED,
- totalFiles,
- skippedFiles + completedFiles,
- failedFiles,
- string.Empty,
- failedFiles > 0
- ? string.IsNullOrWhiteSpace(lastError)
- ? "Some files could not be embedded. See the logs for details."
- : lastError
- : string.Empty));
+ this.UpsertStatus(this.CreateCompletedStatus(dataSource, totalFiles, skippedFiles + completedFiles, failedFiles, lastError));
this.logger.LogInformation(
"Finished background embeddings for data source '{DataSourceName}' ({DataSourceId}). Indexed={IndexedFiles}, Failed={FailedFiles}, Total={TotalFiles}.",
dataSource.Name,
@@ -459,7 +373,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService
if (manifest.VectorSize == 0)
{
- // The first successful batch defines the vector size for the whole collection.
manifest.VectorSize = vectorSize;
await this.EnsureCollectionExistsAsync(collectionName, vectorSize, token);
await this.SaveStateAsync(token);
@@ -490,53 +403,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService
batch.Clear();
}
- private async IAsyncEnumerable StreamEmbeddingChunksAsync(string filePath, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken token)
- {
- if (this.IsImageFilePath(filePath))
- {
- yield return this.BuildImageIndexText(filePath);
- yield break;
- }
-
- var currentChunk = new StringBuilder();
-
- await foreach (var segment in this.rustService.StreamArbitraryFileData(filePath, token: token))
- {
- var normalized = NormalizeChunkSegment(segment);
- if (string.IsNullOrWhiteSpace(normalized))
- continue;
-
- if (currentChunk.Length > 0 && currentChunk.Length + normalized.Length + Environment.NewLine.Length > MAX_CHUNK_LENGTH)
- {
- if (currentChunk.Length >= MIN_CHUNK_LENGTH)
- {
- var chunk = currentChunk.ToString().Trim();
- if (!string.IsNullOrWhiteSpace(chunk))
- yield return chunk;
-
- var overlap = chunk.Length > CHUNK_OVERLAP_LENGTH
- ? chunk[^CHUNK_OVERLAP_LENGTH..]
- : chunk;
-
- currentChunk.Clear();
- currentChunk.Append(overlap);
- currentChunk.AppendLine();
- }
- else
- {
- currentChunk.AppendLine();
- }
- }
-
- currentChunk.Append(normalized);
- currentChunk.AppendLine();
- }
-
- var finalChunk = currentChunk.ToString().Trim();
- if (!string.IsNullOrWhiteSpace(finalChunk))
- yield return finalChunk;
- }
-
private async Task EnsureCollectionExistsAsync(string collectionName, int vectorSize, CancellationToken token)
{
await this.embeddingStore.EnsureEmbeddingStoreExists(collectionName, vectorSize, token);
@@ -581,72 +447,6 @@ public sealed class DataSourceEmbeddingService : BackgroundService
await this.embeddingStore.DeleteEmbeddingStore(collectionName, CancellationToken.None);
}
- private async Task EnsureStateLoadedAsync(CancellationToken token)
- {
- if (this.stateLoaded)
- return;
-
- await this.stateLock.WaitAsync(token);
- try
- {
- if (this.stateLoaded)
- return;
-
- var statePath = this.GetStatePath();
- if (!string.IsNullOrWhiteSpace(statePath) && File.Exists(statePath))
- {
- var json = await File.ReadAllTextAsync(statePath, token);
- var persistedState = JsonSerializer.Deserialize(json, this.jsonOptions);
- this.manifests = persistedState?.DataSources ?? new Dictionary(StringComparer.OrdinalIgnoreCase);
- }
-
- this.stateLoaded = true;
- }
- finally
- {
- this.stateLock.Release();
- }
- }
-
- private async Task GetManifestAsync(string dataSourceId, CancellationToken token)
- {
- await this.EnsureStateLoadedAsync(token);
- if (this.manifests.TryGetValue(dataSourceId, out var manifest))
- return manifest;
-
- manifest = new DataSourceEmbeddingManifest();
- this.manifests[dataSourceId] = manifest;
- return manifest;
- }
-
- private async Task SaveStateAsync(CancellationToken token)
- {
- var statePath = this.GetStatePath();
- if (string.IsNullOrWhiteSpace(statePath))
- return;
-
- var directory = Path.GetDirectoryName(statePath);
- if (!string.IsNullOrWhiteSpace(directory))
- Directory.CreateDirectory(directory);
-
- var persistedState = new PersistedEmbeddingState
- {
- DataSources = this.manifests
- };
-
- var json = JsonSerializer.Serialize(persistedState, this.jsonOptions);
- await File.WriteAllTextAsync(statePath, json, token);
- }
-
- private async Task ResetPersistedStateAsync(string dataSourceId)
- {
- await this.EnsureStateLoadedAsync(CancellationToken.None);
- this.manifests.Remove(dataSourceId);
- await this.DeleteCollectionAsync(this.GetCollectionName(dataSourceId));
- await this.SaveStateAsync(CancellationToken.None);
- this.logger.LogInformation("Reset persisted embedding state for data source '{DataSourceId}'.", dataSourceId);
- }
-
private async Task WaitForInitialSettingsAndBootstrapAsync(CancellationToken token)
{
while (!token.IsCancellationRequested)
@@ -663,280 +463,8 @@ public sealed class DataSourceEmbeddingService : BackgroundService
token.ThrowIfCancellationRequested();
- this.logger.LogInformation("Embedding background service is ready. Queueing all internal data sources.");
- await this.QueueAllInternalDataSourcesAsync();
- }
-
- private void RefreshWatchers()
- {
- var supportedSources = this.settingsManager.ConfigurationData.DataSources
- .Where(this.IsSupportedInternalDataSource)
- .ToDictionary(source => source.Id, StringComparer.OrdinalIgnoreCase);
-
- foreach (var existingWatcherId in this.watchers.Keys.Except(supportedSources.Keys, StringComparer.OrdinalIgnoreCase).ToList())
- this.RemoveWatcher(existingWatcherId);
-
- foreach (var dataSource in supportedSources.Values)
- this.EnsureWatcher(dataSource);
- }
-
- private void EnsureWatcher(IDataSource dataSource)
- {
- if (this.watchers.TryGetValue(dataSource.Id, out var existingWatcher))
- {
- var existingTarget = $"{existingWatcher.Path}|{existingWatcher.Filter}|{existingWatcher.IncludeSubdirectories}";
- var desiredTarget = this.GetWatcherTarget(dataSource);
- if (string.Equals(existingTarget, desiredTarget, StringComparison.OrdinalIgnoreCase))
- return;
-
- this.RemoveWatcher(dataSource.Id);
- }
-
- var watcher = this.CreateWatcher(dataSource);
- if (watcher is null)
- return;
-
- if (!this.watchers.TryAdd(dataSource.Id, watcher))
- watcher.Dispose();
- }
-
- private FileSystemWatcher? CreateWatcher(IDataSource dataSource)
- {
- FileSystemWatcher? watcher = dataSource switch
- {
- DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path) => new FileSystemWatcher(localDirectory.Path)
- {
- IncludeSubdirectories = true,
- NotifyFilter = NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size,
- },
- DataSourceLocalFile localFile when File.Exists(localFile.FilePath) && !string.IsNullOrWhiteSpace(Path.GetDirectoryName(localFile.FilePath)) => new FileSystemWatcher(Path.GetDirectoryName(localFile.FilePath)!)
- {
- Filter = Path.GetFileName(localFile.FilePath),
- NotifyFilter = NotifyFilters.FileName | NotifyFilters.LastWrite | NotifyFilters.CreationTime | NotifyFilters.Size,
- },
- _ => null,
- };
-
- if (watcher is null)
- return null;
-
- FileSystemEventHandler onChanged = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id);
- RenamedEventHandler onRenamed = (_, _) => this.OnWatchedDataSourceChanged(dataSource.Id);
- ErrorEventHandler onError = (_, errorArgs) =>
- {
- this.logger.LogWarning(errorArgs.GetException(), "The file watcher for data source '{DataSourceId}' failed. Recreating it.", dataSource.Id);
- this.RemoveWatcher(dataSource.Id);
- this.EnsureWatcher(dataSource);
- this.OnWatchedDataSourceChanged(dataSource.Id);
- };
-
- watcher.Created += onChanged;
- watcher.Changed += onChanged;
- watcher.Deleted += onChanged;
- watcher.Renamed += onRenamed;
- watcher.Error += onError;
- watcher.EnableRaisingEvents = true;
- return watcher;
- }
-
- private string GetWatcherTarget(IDataSource dataSource) => dataSource switch
- {
- DataSourceLocalDirectory localDirectory => $"{localDirectory.Path}|*.*|True",
- DataSourceLocalFile localFile => $"{Path.GetDirectoryName(localFile.FilePath)}|{Path.GetFileName(localFile.FilePath)}|False",
- _ => string.Empty
- };
-
- private void RemoveWatcher(string dataSourceId)
- {
- if (this.watchers.TryRemove(dataSourceId, out var watcher))
- watcher.Dispose();
- }
-
- private void OnWatchedDataSourceChanged(string dataSourceId)
- {
- this.logger.LogInformation("Detected file system change for data source '{DataSourceId}'. Queueing a fresh embedding run.", dataSourceId);
- _ = Task.Run(async () =>
- {
- try
- {
- var dataSource = this.settingsManager.ConfigurationData.DataSources
- .FirstOrDefault(source => source.Id.Equals(dataSourceId, StringComparison.OrdinalIgnoreCase));
-
- if (dataSource is not null)
- await this.QueueDataSourceAsync(dataSource);
- }
- catch (Exception exception)
- {
- this.logger.LogWarning(exception, "Failed to queue watched data source '{DataSourceId}' after a file system change.", dataSourceId);
- }
- });
- }
-
- private string GetStatePath()
- {
- if (string.IsNullOrWhiteSpace(SettingsManager.ConfigDirectory))
- return string.Empty;
-
- return Path.Combine(SettingsManager.ConfigDirectory, STATE_FILENAME);
- }
-
- private FileEnumerationResult GetInputFiles(IDataSource dataSource)
- {
- var result = new FileEnumerationResult();
-
- switch (dataSource)
- {
- case DataSourceLocalFile localFile when File.Exists(localFile.FilePath):
- result.Files.Add(new FileInfo(localFile.FilePath));
- return result;
-
- case DataSourceLocalDirectory localDirectory when Directory.Exists(localDirectory.Path):
- this.EnumerateAccessibleFiles(localDirectory.Path, result);
- return result;
- }
-
- switch (dataSource)
- {
- case DataSourceLocalFile localFile:
- result.FailedFiles = 1;
- result.LastError = $"The selected file '{localFile.FilePath}' does not exist.";
- break;
-
- case DataSourceLocalDirectory localDirectory:
- result.FailedFiles = 1;
- result.LastError = $"The selected directory '{localDirectory.Path}' does not exist.";
- break;
- }
-
- return result;
- }
-
- private void EnumerateAccessibleFiles(string rootPath, FileEnumerationResult result)
- {
- var pendingDirectories = new Stack();
- pendingDirectories.Push(rootPath);
-
- while (pendingDirectories.Count > 0)
- {
- var currentPath = pendingDirectories.Pop();
- IEnumerable subDirectories;
- IEnumerable files;
-
- try
- {
- subDirectories = Directory.EnumerateDirectories(currentPath);
- files = Directory.EnumerateFiles(currentPath);
- }
- catch (Exception exception)
- {
- this.logger.LogWarning(exception, "Cannot access directory '{DirectoryPath}' while indexing.", currentPath);
- result.FailedFiles++;
- result.LastError = $"The directory '{currentPath}' could not be accessed.";
- continue;
- }
-
- foreach (var filePath in files)
- {
- FileInfo fileInfo;
- try
- {
- fileInfo = new FileInfo(filePath);
- if (!fileInfo.Exists)
- continue;
- }
- catch (Exception exception)
- {
- this.logger.LogWarning(exception, "Cannot inspect file '{FilePath}' while indexing.", filePath);
- result.FailedFiles++;
- result.LastError = $"The file '{filePath}' could not be inspected.";
- continue;
- }
-
- result.Files.Add(fileInfo);
- }
-
- foreach (var subDirectory in subDirectories)
- pendingDirectories.Push(subDirectory);
- }
- }
-
- private string TryGetRelativePath(IDataSource dataSource, FileInfo file) => dataSource switch
- {
- DataSourceLocalDirectory localDirectory => Path.GetRelativePath(localDirectory.Path, file.FullName),
- _ => file.Name
- };
-
- private static string NormalizeChunkSegment(string input)
- {
- return input
- .Replace("\r\n", "\n", StringComparison.Ordinal)
- .Replace('\r', '\n')
- .Trim();
- }
-
- private bool IsImageFilePath(string filePath)
- {
- return FileTypes.IsAllowedPath(filePath, FileTypes.IMAGE);
- }
-
- private string BuildImageIndexText(string filePath)
- {
- var fileName = Path.GetFileName(filePath);
- var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(filePath);
- var extension = Path.GetExtension(filePath).TrimStart('.');
- var normalizedName = fileNameWithoutExtension
- .Replace('_', ' ')
- .Replace('-', ' ')
- .Trim();
-
- return $$"""
- Image asset
- File name: {{fileName}}
- Type: {{extension}}
- Search terms: {{normalizedName}}
- Path: {{filePath}}
- Note: The current RAG embedding pipeline stores image files by metadata only. Visual content is not OCRed or captioned yet.
- """;
- }
-
- private string BuildEmbeddingSignature(EmbeddingProvider embeddingProvider)
- {
- return string.Join('|',
- embeddingProvider.Id,
- embeddingProvider.UsedLLMProvider,
- embeddingProvider.Model.Id,
- embeddingProvider.Host,
- embeddingProvider.Hostname,
- embeddingProvider.TokenizerPath);
- }
-
- private string BuildFingerprint(FileInfo file)
- {
- var fingerprintSource = $"{file.FullName}|{file.Length}|{file.LastWriteTimeUtc.Ticks}";
- var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(fingerprintSource));
- return Convert.ToHexString(bytes);
- }
-
- private string GetCollectionName(string dataSourceId)
- {
- var safeId = dataSourceId
- .ToLowerInvariant()
- .Replace("-", string.Empty, StringComparison.Ordinal);
-
- return $"rag_{safeId}";
- }
-
- private string CreatePointId(string dataSourceId, string fingerprint, int chunkIndex)
- {
- var source = $"{dataSourceId}:{fingerprint}:{chunkIndex}";
- var hash = SHA256.HashData(Encoding.UTF8.GetBytes(source));
- var guidBytes = hash[..16].ToArray();
-
- // Mark the bytes as an RFC 4122 version 4 UUID so Qdrant accepts the ID format.
- guidBytes[6] = (byte)((guidBytes[6] & 0x0F) | 0x40);
- guidBytes[8] = (byte)((guidBytes[8] & 0x3F) | 0x80);
-
- return new Guid(guidBytes).ToString();
+ this.logger.LogInformation("Embedding background service is ready. Checking whether automatic data source refresh is enabled.");
+ await this.QueueAllInternalDataSourcesIfAutomaticRefreshAsync();
}
private bool IsSupportedInternalDataSource(IDataSource dataSource)
@@ -944,18 +472,104 @@ public sealed class DataSourceEmbeddingService : BackgroundService
return dataSource is DataSourceLocalDirectory or DataSourceLocalFile;
}
- private DataSourceEmbeddingStatus GetFallbackStatus(IDataSource dataSource, string errorMessage)
+ private bool TryResolveEmbeddingProvider(IDataSource dataSource, [NotNullWhen(true)] out EmbeddingProvider? embeddingProvider)
+ {
+ embeddingProvider = this.settingsManager.ConfigurationData.EmbeddingProviders.FirstOrDefault(provider =>
+ dataSource is IInternalDataSource internalDataSource &&
+ provider.Id.Equals(internalDataSource.EmbeddingId, StringComparison.OrdinalIgnoreCase));
+
+ return embeddingProvider != default && embeddingProvider.UsedLLMProvider is not LLMProviders.NONE;
+ }
+
+ private async Task EnsureCompatibleManifestAsync(IDataSource dataSource, EmbeddingProvider embeddingProvider, string collectionName, CancellationToken token)
+ {
+ var embeddingSignature = this.BuildEmbeddingSignature(embeddingProvider);
+ var manifest = await this.GetManifestAsync(dataSource.Id, token);
+
+ if (!string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal))
+ {
+ this.logger.LogInformation(
+ "Embedding configuration changed for data source '{DataSourceName}' ({DataSourceId}). Resetting persisted state and collection '{CollectionName}'.",
+ dataSource.Name,
+ dataSource.Id,
+ collectionName);
+ await this.ResetPersistedStateAsync(dataSource.Id);
+ manifest = await this.GetManifestAsync(dataSource.Id, token);
+ }
+
+ if (!string.Equals(manifest.EmbeddingProviderId, embeddingProvider.Id, StringComparison.OrdinalIgnoreCase) ||
+ !string.Equals(manifest.EmbeddingSignature, embeddingSignature, StringComparison.Ordinal))
+ {
+ manifest.EmbeddingProviderId = embeddingProvider.Id;
+ manifest.EmbeddingSignature = embeddingSignature;
+ await this.SaveStateAsync(token);
+ }
+
+ return manifest;
+ }
+
+ private async Task RemoveMissingFileEmbeddingsAsync(
+ IDataSource dataSource,
+ string collectionName,
+ DataSourceEmbeddingManifest manifest,
+ IReadOnlyCollection indexedFiles,
+ CancellationToken token)
+ {
+ var existingPaths = indexedFiles
+ .Select(file => file.FullName)
+ .ToHashSet(StringComparer.OrdinalIgnoreCase);
+
+ foreach (var removedFilePath in manifest.Files.Keys.Except(existingPaths, StringComparer.OrdinalIgnoreCase).ToList())
+ {
+ await this.DeleteFilePointsAsync(collectionName, removedFilePath, token);
+ manifest.Files.Remove(removedFilePath);
+ this.logger.LogInformation(
+ "Removed stale embeddings for deleted file '{FilePath}' from data source '{DataSourceName}' ({DataSourceId}).",
+ removedFilePath,
+ dataSource.Name,
+ dataSource.Id);
+ }
+ }
+
+ private DataSourceEmbeddingStatus CreateStatus(
+ IDataSource dataSource,
+ DataSourceEmbeddingState state,
+ int totalFiles,
+ int indexedFiles,
+ int failedFiles,
+ string currentFile = "",
+ string lastError = "")
{
return new DataSourceEmbeddingStatus(
dataSource.Id,
dataSource.Name,
dataSource.Type,
- DataSourceEmbeddingState.FAILED,
- 0,
- 0,
- 1,
- string.Empty,
- errorMessage);
+ state,
+ totalFiles,
+ indexedFiles,
+ failedFiles,
+ currentFile,
+ lastError);
+ }
+
+ private DataSourceEmbeddingStatus CreateCompletedStatus(IDataSource dataSource, int totalFiles, int indexedFiles, int failedFiles, string lastError)
+ {
+ return this.CreateStatus(
+ dataSource,
+ failedFiles > 0 ? DataSourceEmbeddingState.FAILED : DataSourceEmbeddingState.COMPLETED,
+ totalFiles,
+ indexedFiles,
+ failedFiles,
+ lastError: failedFiles > 0
+ ? string.IsNullOrWhiteSpace(lastError)
+ ? "Some files could not be embedded. See the logs for details."
+ : lastError
+ : string.Empty);
+ }
+
+ private DataSourceEmbeddingStatus GetFallbackStatus(IDataSource dataSource, string errorMessage)
+ {
+ return this.CreateStatus(dataSource, DataSourceEmbeddingState.FAILED, 0, 0, 1, lastError: errorMessage);
}
private void UpsertStatus(DataSourceEmbeddingStatus status)
@@ -969,87 +583,3 @@ public sealed class DataSourceEmbeddingService : BackgroundService
_ = MessageBus.INSTANCE.SendMessage(null, Event.RAG_EMBEDDING_STATUS_CHANGED, true);
}
}
-
-public sealed record DataSourceEmbeddingOverview(
- bool IsVisible,
- bool ShowIconOnly,
- DataSourceEmbeddingState State,
- int IndexedFiles,
- int TotalFiles,
- int FailedFiles,
- string NavLabel);
-
-public enum DataSourceEmbeddingState
-{
- IDLE,
- QUEUED,
- RUNNING,
- COMPLETED,
- FAILED,
-}
-
-public sealed record DataSourceEmbeddingStatus(
- string DataSourceId,
- string DataSourceName,
- DataSourceType DataSourceType,
- DataSourceEmbeddingState State,
- int TotalFiles,
- int IndexedFiles,
- int FailedFiles,
- string CurrentFile,
- string LastError)
-{
- private static string TB(string fallbackEN) => I18N.I.T(fallbackEN, typeof(DataSourceEmbeddingService).Namespace, nameof(DataSourceEmbeddingService));
-
- public int ProgressPercent => this.TotalFiles <= 0 ? 0 : Math.Clamp((int)Math.Round(this.IndexedFiles * 100d / this.TotalFiles), 0, 100);
-
- public string StateLabel => this.State switch
- {
- DataSourceEmbeddingState.QUEUED => TB("Queued"),
- DataSourceEmbeddingState.RUNNING => TB("Running"),
- DataSourceEmbeddingState.COMPLETED => TB("Completed"),
- DataSourceEmbeddingState.FAILED => TB("Needs attention"),
- _ => TB("Idle")
- };
-
- public int SortOrder => this.State switch
- {
- DataSourceEmbeddingState.RUNNING => 0,
- DataSourceEmbeddingState.QUEUED => 1,
- DataSourceEmbeddingState.FAILED => 2,
- DataSourceEmbeddingState.COMPLETED => 3,
- _ => 4,
- };
-}
-
-public sealed class FileEnumerationResult
-{
- public List Files { get; } = [];
-
- public int FailedFiles { get; set; }
-
- public string LastError { get; set; } = string.Empty;
-}
-
-public sealed class PersistedEmbeddingState
-{
- public Dictionary DataSources { get; init; } = new(StringComparer.OrdinalIgnoreCase);
-}
-
-public sealed class DataSourceEmbeddingManifest
-{
- public string EmbeddingProviderId { get; set; } = string.Empty;
-
- public string EmbeddingSignature { get; set; } = string.Empty;
-
- public int VectorSize { get; set; }
-
- public Dictionary Files { get; init; } = new(StringComparer.OrdinalIgnoreCase);
-}
-
-public sealed record EmbeddedFileRecord(
- string Fingerprint,
- long FileSize,
- DateTime LastWriteUtc,
- DateTime EmbeddedAtUtc,
- int ChunkCount);